Binary Classification¶

In [1]:
import os
import warnings
import logging

# configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# get the warning filter policy from the environment variable:
# set to "ignore" when rendering the HTML pages, or to "once" otherwise
WARNING_FILTER_POLICY = os.getenv("WARNING_FILTER_POLICY", "once")
logger.info(f"{WARNING_FILTER_POLICY = }")
warnings.filterwarnings(WARNING_FILTER_POLICY)
21:11:31 [INFO] WARNING_FILTER_POLICY = 'ignore'
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier

pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format
In [3]:
from utils.constants import RANDOM_SEED
from utils.common import (
    get_data_folder_path,
    set_plotting_config,
    plot_boxplot_by_class,
    plot_correlation_matrix,
)
from utils.evals import (
    describe_input_features,
    plot_confusion_matrix,
    plot_target_rate,
    compute_binary_classification_metrics,
    build_coefficients_table,
    plot_coefficients_values,
    plot_coefficients_significance,
    plot_eval_metrics_xgb,
    plot_gain_metric_xgb,
    plot_shap_importance,
    plot_shap_beeswarm,
    build_ks_table,
    beautify_ks_table,
    plot_ks_table,
    plot_roc_curve,
)
from utils.feature_selection import run_feature_selection_steps
In [4]:
# plots configuration
sns.set_style("darkgrid")
sns.set_palette("colorblind")
set_plotting_config()
%matplotlib inline

1. Load Data¶

In this notebook, we will use the Fetal Health Dataset. This dataset comprises 2,126 records of features extracted from Cardiotocogram (CTG) exams, classified by experts into three classes (Normal, Suspect, and Pathological) to assess fetal health and help reduce child and maternal mortality.

Sources:

  1. Kaggle: https://www.kaggle.com/datasets/andrewmvd/fetal-health-classification
  2. Original article: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC68223152
In [5]:
data_path = get_data_folder_path()

df_input = pd.read_csv(os.path.join(data_path, "fetal_health.csv"))
df_input.columns = [col.replace(" ", "_") for col in df_input.columns]

2. Process Data¶

Target column¶

Fetal health (target column) can have the following values:

  • 1: Normal
  • 2: Suspect
  • 3: Pathological

For this notebook, we collapse the three classes into a Normal vs. Not Normal distinction for binary classification.

In [6]:
target_col = "is_normal"
target_classes_dict = {
    0: "Not Normal",
    1: "Normal"
}
test_size = 0.20
In [7]:
# create a new binary target column from the original multi-class target column
df_input[target_col] = (df_input["fetal_health"] == 1).astype(np.int8)
df_input.drop(columns=["fetal_health"], inplace=True)

Train test split¶

In [8]:
df_input_train, df_input_test = train_test_split(
    df_input,
    test_size=test_size,
    stratify=df_input[target_col],
    random_state=RANDOM_SEED,
)
In [9]:
pd.concat([
    pd.Series(target_classes_dict, name="label"),
    df_input_train[target_col].value_counts(dropna=False, normalize=False).rename("train_target_count"),
    df_input_train[target_col].value_counts(dropna=False, normalize=True).rename("train_target_pct"),
    df_input_test[target_col].value_counts(dropna=False, normalize=False).rename("test_target_count"),
    df_input_test[target_col].value_counts(dropna=False, normalize=True).rename("test_target_pct"),
], axis=1)
Out[9]:
       label  train_target_count  train_target_pct  test_target_count  test_target_pct
0 Not Normal                 377              0.22                 94             0.22
1     Normal                1323              0.78                332             0.78
In [10]:
describe_input_features(df_input, df_input_train, df_input_test)
Out[10]:
data_type count null_count min 25% 50% 75% max std mean mean_train mean_test train_test_pct_diff
baseline_value numeric 2126 0 106.00 126.00 133.00 140.00 160.00 9.84 133.30 133.23 133.60 0.00
accelerations numeric 2126 0 0.00 0.00 0.00 0.01 0.02 0.00 0.00 0.00 0.00 -0.01
fetal_movement numeric 2126 0 0.00 0.00 0.00 0.00 0.48 0.05 0.01 0.01 0.01 0.11
uterine_contractions numeric 2126 0 0.00 0.00 0.00 0.01 0.01 0.00 0.00 0.00 0.00 0.01
light_decelerations numeric 2126 0 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 -0.05
severe_decelerations numeric 2126 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.99
prolongued_decelerations numeric 2126 0 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.03
abnormal_short_term_variability numeric 2126 0 12.00 32.00 49.00 61.00 87.00 17.19 46.99 47.06 46.70 -0.01
mean_value_of_short_term_variability numeric 2126 0 0.20 0.70 1.20 1.70 7.00 0.88 1.33 1.34 1.30 -0.03
percentage_of_time_with_abnormal_long_term_variability numeric 2126 0 0.00 0.00 0.00 11.00 91.00 18.40 9.85 9.72 10.34 0.06
mean_value_of_long_term_variability numeric 2126 0 0.00 4.60 7.40 10.80 50.70 5.63 8.19 8.31 7.71 -0.07
histogram_width numeric 2126 0 3.00 37.00 67.50 100.00 180.00 38.96 70.45 71.20 67.46 -0.05
histogram_min numeric 2126 0 50.00 67.00 93.00 120.00 159.00 29.56 93.58 92.97 96.02 0.03
histogram_max numeric 2126 0 122.00 152.00 162.00 174.00 238.00 17.94 164.03 164.16 163.48 -0.00
histogram_number_of_peaks numeric 2126 0 0.00 2.00 3.00 6.00 18.00 2.95 4.07 4.11 3.90 -0.05
histogram_number_of_zeroes numeric 2126 0 0.00 0.00 0.00 0.00 10.00 0.71 0.32 0.33 0.31 -0.04
histogram_mode numeric 2126 0 60.00 129.00 139.00 148.00 187.00 16.38 137.45 137.50 137.25 -0.00
histogram_mean numeric 2126 0 73.00 125.00 136.00 145.00 182.00 15.59 134.61 134.61 134.61 -0.00
histogram_median numeric 2126 0 77.00 129.00 139.00 148.00 186.00 14.47 138.09 138.05 138.24 0.00
histogram_variance numeric 2126 0 0.00 2.00 7.00 24.00 269.00 28.98 18.81 19.45 16.24 -0.16
histogram_tendency numeric 2126 0 -1.00 0.00 0.00 1.00 1.00 0.61 0.32 0.32 0.30 -0.07
is_normal numeric 2126 0 0.00 1.00 1.00 1.00 1.00 0.42 0.78 0.78 0.78 0.00

Scaling (Standardization)¶

In [11]:
# Standardize training and test data
stdscaler = StandardScaler()

# training data
y_train = df_input_train[target_col]
X_train_all = (
    pd.DataFrame(
        # fit scaler on training data (and then transform training data)
        data=stdscaler.fit_transform(df_input_train),
        columns=df_input_train.columns,
        index=df_input_train.index
    )
    # remove target from the model input features table
    .drop(columns=[target_col])
)

# test data
y_test = df_input_test[target_col]
X_test_all = (
    pd.DataFrame(
        # use scaler fitted on training data to transform test data
        data=stdscaler.transform(df_input_test),
        columns=df_input_test.columns,
        index=df_input_test.index
    )
    # remove target from the model input features table
    .drop(columns=[target_col])
)
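
As a quick sanity check (a sketch, not part of the original pipeline): StandardScaler fits z = (x − μ_train) / σ_train, so the standardized training features should have column-wise mean ≈ 0 and population standard deviation ≈ 1.

# Sanity-check sketch (not in the original notebook): StandardScaler uses the
# population standard deviation (ddof=0), so these hold to float precision.
assert np.allclose(X_train_all.mean(), 0.0)
assert np.allclose(X_train_all.std(ddof=0), 1.0)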

3. Exploratory Data Analysis (EDA)¶

Boxplots by Target Class¶

In [12]:
display(
    plot_boxplot_by_class(
        df_input=df_input_train,  # use only training data to avoid bias in test results
        class_col=target_col,
        class_mapping=target_classes_dict,
        plots_per_line=6,
        title="Features in input dataset",
    )
)
[Figure: boxplots of each input feature, split by target class]

Pearson's Correlation¶

In [13]:
display(
    plot_correlation_matrix(
        # use only training data to avoid bias in test results
        df=df_input_train, method="pearson", fig_height=10
    )
)
[Figure: Pearson correlation matrix of the training data]
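
The plot_correlation_matrix helper is part of this project's utils; a minimal, assumed equivalent using pandas and seaborn directly (a sketch, not the helper's actual implementation):

# Sketch of an equivalent correlation heatmap using pandas + seaborn directly
import matplotlib.pyplot as plt

corr = df_input_train.corr(method="pearson")
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", center=0, ax=ax)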

4. Feature Selection¶

In [14]:
fs_steps = {
    "manual": {
        "cols_to_exclude": [
            "severe_decelerations",
        ]
    },
    "null_variance": None,
    "correlation": {"threshold": 0.8},
    "vif": {"threshold": 2},
    "l1_regularization": {
        "problem_type": "classification",
        "train_test_split_params": {"test_size": test_size},
        "logspace_search": {"start": -5, "stop": 1, "num": 20, "base": 10},
        # tolerance over minimum error with which to search for the best model
        "error_tolerance_pct": 0.05,
        # minimum features to keep in final selection
        "min_feats_to_keep": 4,
        "random_seed": RANDOM_SEED,
    },
}
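
The l1_regularization step runs inside run_feature_selection_steps; a hedged sketch of the sweep it is configured to perform, assuming it keeps the features whose coefficients remain non-zero:

# Hedged sketch of the configured L1 sweep (assumed behaviour of
# run_feature_selection_steps, not its actual code): fit an L1-penalized
# logistic regression for each C on the logspace grid and record which
# features survive with non-zero coefficients.
for C in np.logspace(start=-5, stop=1, num=20, base=10):
    clf = LogisticRegression(penalty="l1", solver="liblinear", C=C, random_state=RANDOM_SEED)
    clf.fit(X_train_all, y_train)
    surviving_feats = X_train_all.columns[clf.coef_[0] != 0].tolist()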
In [15]:
selected_feats, df_fs = run_feature_selection_steps(
    # use only training data to avoid bias in test results
    X=X_train_all,
    y=y_train,
    fs_steps=fs_steps
)
21:11:45 [INFO] --> Starting Feature Selection with 21 features
21:11:45 [INFO] 1. MANUAL FILTER
21:11:45 [INFO]  - Removing 1 (4.8%) feature(s) manually: ['severe_decelerations']
21:11:45 [INFO] 2. NULL_VARIANCE FILTER
21:11:45 [INFO]  - Removing 0 (0.0%) feature(s) with null variance (var == 0): []
21:11:45 [INFO] 3. CORRELATION FILTER
21:11:45 [INFO]   Running Correlation filter with threshold of 0.8
21:11:45 [INFO]  - Removing feature 'histogram_median' with correlation +0.9523 to 'histogram_mean'
21:11:45 [INFO]  - Removing feature 'histogram_mean' with correlation +0.8937 to 'histogram_mode'
21:11:45 [INFO]  - Removing feature 'histogram_width' with correlation -0.9006 to 'histogram_min'
21:11:45 [INFO]  - Removing 3 (15.0%) feature(s) with abs(correlation) > 0.8
21:11:45 [INFO] 4. VIF FILTER
21:11:45 [INFO] Computing the Variance Inflation Factor (VIF) for 17 features...
21:11:45 [INFO]   1. Removing feature: histogram_min .......................................... VIF: 4.77
21:11:45 [INFO]   2. Removing feature: histogram_mode ......................................... VIF: 4.14
21:11:45 [INFO]   3. Removing feature: histogram_max .......................................... VIF: 3.18
21:11:45 [INFO]   4. Removing feature: light_decelerations .................................... VIF: 2.57
21:11:45 [INFO]   5. Removing feature: mean_value_of_short_term_variability ................... VIF: 2.11
21:11:45 [INFO]   >> Stopping at feat: histogram_variance ..................................... VIF: 1.80  (threshold: 2)
21:11:45 [INFO]  - Removing 5 (29.4%) feature(s) with VIF >= 2
21:11:45 [INFO] 5. L1_REGULARIZATION FILTER
21:11:45 [INFO]  - Removing 7 (58.3%) feature(s) with null coefficient after L1 regularization: ['baseline_value', 'fetal_movement', 'mean_value_of_long_term_variability', 'histogram_number_of_peaks', 'histogram_number_of_zeroes', 'histogram_variance', 'histogram_tendency']
21:11:45 [INFO] --> Completed Feature Selection with 5 selected features (23.8% of the original 21 features): ['accelerations', 'uterine_contractions', 'prolongued_decelerations', 'abnormal_short_term_variability', 'percentage_of_time_with_abnormal_long_term_variability']
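
The VIF step logged above removes one feature at a time; a minimal sketch of the assumed loop:

# Minimal sketch of the iterative VIF filter (assumed to match the logged
# behaviour): repeatedly drop the feature with the highest VIF until every
# remaining feature falls below the threshold.
def vif_filter(X: pd.DataFrame, threshold: float = 2.0) -> list:
    cols = list(X.columns)
    while len(cols) > 1:
        vifs = pd.Series(
            [variance_inflation_factor(X[cols].values, i) for i in range(len(cols))],
            index=cols,
        )
        if vifs.max() < threshold:
            break
        cols.remove(vifs.idxmax())
    return cols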
In [16]:
# build model input datasets
X_train = X_train_all[selected_feats]
X_test = X_test_all[selected_feats]

Correlation check¶

In [17]:
display(
    plot_correlation_matrix(
        # use only training data to avoid bias in test results
        df=df_input_train[selected_feats + [target_col]], method="pearson", fig_height=5
    )
)
[Figure: Pearson correlation matrix of the selected features and the target]

Multicollinearity check¶

In [18]:
# compute the Variance Inflation Factor (VIF) for each feature
df_vif = pd.DataFrame(
    data=[variance_inflation_factor(X_train.values, i) for i in range(len(selected_feats))],
    index=selected_feats,
    columns=["VIF"]
).sort_values("VIF", ascending=False)

df_vif
Out[18]:
VIF
percentage_of_time_with_abnormal_long_term_variability 1.53
abnormal_short_term_variability 1.33
accelerations 1.22
uterine_contractions 1.12
prolongued_decelerations 1.07

5. Classifier Model¶

Select classifier: Logistic Regression or XGBoost¶

In [19]:
MODEL_SELECTION = "logistic_regression"
# MODEL_SELECTION = "xgboost"

model_selection_error = ValueError(
    "'MODEL_SELECTION' must be either 'logistic_regression' or 'xgboost'. "
    f"Got {MODEL_SELECTION} instead."
)

Hyperparameter tuning with K-Fold Cross Validation¶

  • Logistic Regression: in binary classification with imbalanced classes, avoid setting class_weight="balanced" if you want to interpret the model's predicted probabilities as estimates of the true class probabilities, i.e. as "the actual probability that the sample belongs to the class". In that case, do not use 50% as the classification threshold; instead, find the optimal threshold from the ROC Curve (detailed below) to maximize the model's performance. A sketch of what "balanced" weighting would do follows this list.

  • XGBoost: For a detailed explanation of XGBoost's parameters, refer to: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
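
For reference, this is how class_weight="balanced" would reweight the two classes here, following scikit-learn's documented formula w_c = n_samples / (n_classes * n_samples_c) (a sketch, not code from this notebook):

# Sketch of scikit-learn's "balanced" class weighting (not used here): the
# minority class gets the larger weight, which inflates its predicted
# probabilities; that is why class_weight is kept at None below.
class_counts = np.bincount(y_train)                   # [377, 1323] on this training split
balanced_weights = len(y_train) / (2 * class_counts)  # larger weight for class 0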

In [20]:
if MODEL_SELECTION == "logistic_regression":
    Estimator = LogisticRegression
    cv_search_space = {
        # NOTE: "l1" and "elasticnet" require solver="liblinear" or "saga";
        # with the default "lbfgs" solver those candidate fits fail and are
        # scored as NaN by GridSearchCV, so only the "l2" models compete here.
        "penalty": ["l1", "l2", "elasticnet"],
        "C": np.logspace(-3, 1, num=9, base=10.0),
        "class_weight": [None],
    }
elif MODEL_SELECTION == "xgboost":
    Estimator = XGBClassifier
    cv_search_space = {
        "objective": ["binary:logistic"],
        "n_estimators": [30, 40, 50],
        "learning_rate": [0.1],
        "max_depth": [3, 4, 6],
        "min_child_weight": [2, 4],
        "gamma": [0, 0.5],
        "alpha": [0, 0.3],
        "scale_pos_weight": [1],
        "lambda": [1],
        ## "subsample": [0.8, 1.0],
        ## "colsample_bytree": [0.8, 1.0],
        "verbosity": [0],
    }
else:
    raise model_selection_error

For the full list of scikit-learn's scoring string names, refer to: https://scikit-learn.org/stable/modules/model_evaluation.html#string-name-scorers

In [21]:
cv_scoring_metrics = {
    "roc_auc": "ROC AUC",
    "accuracy": "Accuracy",
    "precision": "Precision",
    "recall": "Recall",
    "f1": "F1 Score",
}
refit_metric = "f1"  # metric to optimize for the final model
In [22]:
%%time
# define evaluation (with n_repeats=1 this is equivalent to a single stratified 3-fold CV)
kfold_cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_SEED)
# define search
grid_search = GridSearchCV(
    estimator=Estimator(),
    param_grid=cv_search_space,
    scoring=list(cv_scoring_metrics.keys()),
    cv=kfold_cv,
    refit=refit_metric,
    verbose=1,
)
# execute search
result_cv = grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
CPU times: user 220 ms, sys: 15.6 ms, total: 236 ms
Wall time: 284 ms
In [23]:
print("Grid Search CV Best Model - Scoring Metrics:")
for i, (metric_key, metric_name) in enumerate(cv_scoring_metrics.items(), start=1):
    print(
        f"  {str(i) + ".":>2} {metric_name:.<10} "
        f"{result_cv.cv_results_[f"mean_test_{metric_key}"][result_cv.best_index_]:.3f}"
    )
print(f"\nBest Hyperparameters: {result_cv.best_params_}")
Grid Search CV Best Model - Scoring Metrics:
  1. ROC AUC... 0.950
  2. Accuracy.. 0.900
  3. Precision. 0.925
  4. Recall.... 0.949
  5. F1 Score.. 0.937

Best Hyperparameters: {'C': np.float64(1.0), 'class_weight': None, 'penalty': 'l2'}

Final Model¶

In [24]:
# instantiate model with best hyperparameters and additional kwargs
if MODEL_SELECTION == "logistic_regression":
    model_kwargs = dict()
    model_fit_kwargs = dict()
elif MODEL_SELECTION == "xgboost":
    eval_metrics = dict(
        logloss="Binary Cross-entropy Loss (Log-loss)",
        error="Binary Classification Error Rate",
        auc="ROC AUC",
    )
    model_kwargs = dict(eval_metric=list(eval_metrics.keys()))
    model_fit_kwargs = dict(
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False
    )
else:
    raise model_selection_error
    
model = Estimator(**result_cv.best_params_, **model_kwargs, random_state=RANDOM_SEED)
In [25]:
# Fit model and make predictions
model.fit(X_train, y_train, **model_fit_kwargs)
# Make predictions ([:, 1] returns the probability of the positive class)
y_pred_proba_train = pd.Series(data=model.predict_proba(X_train)[:, 1], index=X_train.index)
y_pred_proba = pd.Series(data=model.predict_proba(X_test)[:, 1], index=X_test.index)
In [26]:
if MODEL_SELECTION == "xgboost":
    display(plot_eval_metrics_xgb(model.evals_result(), eval_metrics))

Plot the target rate per group of predicted probability

A good model should show a monotonically increasing target rate across groups of predicted probability (e.g., quartiles or deciles).
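
The plot_target_rate helper builds this view; a minimal, assumed equivalent of its underlying table:

# Sketch of a target-rate-by-decile table (assumed equivalent of the helper):
# bin the predicted probabilities into deciles and compute the observed target
# rate within each bin.
proba_deciles = pd.qcut(y_pred_proba, q=10, duplicates="drop")
target_rate_per_decile = y_test.groupby(proba_deciles, observed=True).mean()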

In [27]:
display(plot_target_rate(y_test, y_pred_proba))
[Figure: observed target rate per group of predicted probability]

Define the optimal threshold for separating classes using the ROC Curve

The optimal threshold is the one that maximizes the difference between the True Positive Rate (TPR) and the False Positive Rate (FPR), i.e. Youden's J statistic.
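
A minimal sketch of this computation with scikit-learn, assumed to mirror what plot_roc_curve does internally when return_optimal_thresh=True:

# Sketch of the optimal-threshold search (Youden's J statistic): pick the
# threshold that maximizes TPR - FPR on the training data.
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba_train)
optimal_thresh_sketch = thresholds[np.argmax(tpr - fpr)]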

In [28]:
# use only training data to get optimal threshold to avoid bias in test results
fig_roc_curve, optimal_thresh = plot_roc_curve(
    y_true=y_train,
    y_pred_proba=y_pred_proba_train,
    title="ROC Curve on Training Data (for finding the Optimal Threshold)",
    return_optimal_thresh=True
)
display(fig_roc_curve)
[Figure: ROC Curve on training data, with the optimal threshold highlighted]
In [29]:
# compute binary predictions
print(f"Optimal Threshold for Classification: {100*optimal_thresh:.2f}%")
y_pred_train = (y_pred_proba_train > optimal_thresh).astype(int)
y_pred = (y_pred_proba > optimal_thresh).astype(int)
Optimal Threshold for Classification: 77.12%

Feature Importance¶

  • For Logistic Regression: coefficient values and statistical significance (an odds-ratio view is sketched after this list)
  • For XGBoost: SHAP analysis and the Gain metric
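
Because the inputs are standardized, each logistic regression coefficient is the change in log-odds per one-standard-deviation increase in that feature. An odds-ratio view (a sketch, not one of the project helpers):

# Odds-ratio sketch: exp(coef) is the multiplicative change in the odds of
# "Normal" per one-standard-deviation increase in the feature.
if MODEL_SELECTION == "logistic_regression":
    odds_ratios = pd.Series(np.exp(model.coef_[0]), index=X_train.columns).sort_values()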
In [30]:
if MODEL_SELECTION == "logistic_regression":
    df_coefficients = build_coefficients_table(
        coefficients=model.coef_[0],
        intercept=model.intercept_[0],
        X_train=X_train,
        y_pred_train=y_pred_proba_train,
        y_train=y_train,
        problem_type="classification",
    )
    display(
        plot_coefficients_values(df_coefficients),
        plot_coefficients_significance(
            df_coefficients,
            log_scale=bool(df_coefficients["p-values"].max() < 2e-4)
        )
    )
    
elif MODEL_SELECTION == "xgboost":
    # compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X_test)
    # shap plots
    display(
        plot_shap_importance(shap_values),
        plot_shap_beeswarm(shap_values),
        plot_gain_metric_xgb(model, X_test),
    )

else:
    raise model_selection_error
[Figure: logistic regression coefficient values]
[Figure: statistical significance (p-values) of the coefficients]

Performance Metrics¶

In [31]:
df_train_metrics = pd.Series(
    compute_binary_classification_metrics(y_train, y_pred_train, y_pred_proba_train)
).to_frame(name="Train Metrics")
df_test_metrics = pd.Series(
    compute_binary_classification_metrics(y_test, y_pred, y_pred_proba)
).to_frame(name="Test Metrics")

print("Final Model - Scoring Metrics on Train & Test Datasets:")
df_metrics = df_train_metrics.join(df_test_metrics)
display(df_metrics)
Final Model - Scoring Metrics on Train & Test Datasets:
           Train Metrics  Test Metrics
Accuracy            0.87          0.85
Precision           0.97          0.96
Recall              0.86          0.84
F1 Score            0.91          0.90
ROC AUC             0.95          0.94
GINI                0.90          0.88
KS Gain             0.75          0.71
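
The GINI coefficient above is a linear rescaling of ROC AUC (GINI = 2·AUC − 1, so a test AUC of 0.94 gives a GINI of 0.88). A quick check, assuming compute_binary_classification_metrics uses the standard scikit-learn metrics:

# Sanity-check sketch of the AUC/GINI relationship (assumes the helper wraps
# scikit-learn's roc_auc_score).
from sklearn.metrics import roc_auc_score

auc_test = roc_auc_score(y_test, y_pred_proba)
gini_test = 2 * auc_test - 1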

Confusion Matrix¶

In [32]:
# Confusion Matrix
display(
    plot_confusion_matrix(
        y_test,
        y_pred,
        estimator=model,
        target_classes_dict=target_classes_dict,
        normalize="true",
    )
)
[Figure: confusion matrix on test data, normalized by true class]
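
With normalize="true", each row (true class) is scaled to sum to 1, so the diagonal shows per-class recall. An assumed scikit-learn equivalent of the helper (a sketch):

# Sketch of an equivalent confusion matrix plot with scikit-learn directly:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=[target_classes_dict[c] for c in sorted(target_classes_dict)],
    normalize="true",
)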

ROC AUC¶

In [33]:
display(plot_roc_curve(y_test, y_pred_proba))
[Figure: ROC Curve on test data]

KS Gain¶

In [34]:
df_ks, ks_score = build_ks_table(y_test, y_pred_proba, return_ks=True)
print(f"KS score: {ks_score * 100:.2f} p.p.")
beautify_ks_table(df_ks)
KS score: 70.65 p.p.
Out[34]:
   min score  max score  n positives  n negatives  n all  positive rate  negative rate  cum positives  cum negatives  cumpct positives  cumpct negatives  diff KS Gain
0       0.00       0.19            3           40     43          6.98%         93.02%              3             40             0.90%            42.55%      41.65 pp
1       0.20       0.43           18           25     43         41.86%         58.14%             21             65             6.33%            69.15%      62.82 pp
2       0.44       0.74           27           15     42         64.29%         35.71%             48             80            14.46%            85.11%      70.65 pp  <--
3       0.74       0.90           35            8     43         81.40%         18.60%             83             88            25.00%            93.62%      68.62 pp
4       0.90       0.97           38            4     42         90.48%          9.52%            121             92            36.45%            97.87%      61.43 pp
5       0.97       0.99           42            1     43         97.67%          2.33%            163             93            49.10%            98.94%      49.84 pp
6       0.99       0.99           41            1     42         97.62%          2.38%            204             94            61.45%           100.00%      38.55 pp
7       0.99       1.00           43            0     43        100.00%          0.00%            247             94            74.40%           100.00%      25.60 pp
8       1.00       1.00           42            0     42        100.00%          0.00%            289             94            87.05%           100.00%      12.95 pp
9       1.00       1.00           43            0     43        100.00%          0.00%            332             94           100.00%           100.00%       0.00 pp
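
The KS statistic is the maximum gap between the cumulative score distributions of positives and negatives (the row marked "<--" above). A direct, unbinned two-sample check (a sketch; build_ks_table works on score bins, so values can differ slightly):

# Direct KS check via scipy (a sketch, not the project's binned build_ks_table):
from scipy.stats import ks_2samp

ks_direct = ks_2samp(y_pred_proba[y_test == 1], y_pred_proba[y_test == 0]).statistic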
In [35]:
display(plot_ks_table(df_ks))
[Figure: KS Gain plot]