Binary Classification¶

In [1]:
import os
import warnings
import logging

# configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# get the warning filter policy from the environment variable:
# set to "ignore" when rendering the HTML pages, or to "once" otherwise
WARNING_FILTER_POLICY = os.getenv("WARNING_FILTER_POLICY", "once")
logger.info(f"{WARNING_FILTER_POLICY = }")
warnings.filterwarnings(WARNING_FILTER_POLICY)
21:11:31 [INFO] WARNING_FILTER_POLICY = 'ignore'
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import shap

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier

pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format
In [3]:
from utils.constants import RANDOM_SEED
from utils.common import (
    get_data_folder_path,
    set_plotting_config,
    plot_boxplot_by_class,
    plot_correlation_matrix,
)
from utils.evals import (
    describe_input_features,
    plot_confusion_matrix,
    plot_target_rate,
    compute_binary_classification_metrics,
    build_coefficients_table,
    plot_coefficients_values,
    plot_coefficients_significance,
    plot_eval_metrics_xgb,
    plot_gain_metric_xgb,
    plot_shap_importance,
    plot_shap_beeswarm,
    build_ks_table,
    beautify_ks_table,
    plot_ks_table,
    plot_roc_curve,
)
from utils.feature_selection import run_feature_selection_steps
In [4]:
# plots configuration
sns.set_style("darkgrid")
sns.set_palette("colorblind")
set_plotting_config()
%matplotlib inline

1. Load Data¶

In this notebook, we will use the Fetal Health Dataset. This dataset comprises 2,126 records of features extracted from Cardiotocogram (CTG) exams, classified by experts into three classes (Normal, Suspect, and Pathological) to assess fetal health and help reduce child and maternal mortality.

Sources:

  1. Kaggle: https://www.kaggle.com/datasets/andrewmvd/fetal-health-classification
  2. Original article: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC68223152
In [5]:
data_path = get_data_folder_path()

df_input = pd.read_csv(os.path.join(data_path, "fetal_health.csv"))
df_input.columns = [col.replace(" ", "_") for col in df_input.columns]

2. Process Data¶

Target column¶

Fetal health (target column) can have the following values:

  • 1: Normal
  • 2: Suspect
  • 3: Pathological

For this notebook, we collapse the three classes into a Normal vs. Not Normal distinction for binary classification.

In [6]:
target_col = "is_normal"
target_classes_dict = {
    0: "Not Normal",
    1: "Normal"
}
test_size = 0.20
In [7]:
# create a new binary target column from the original multi-class target column
df_input[target_col] = (df_input["fetal_health"] == 1).astype(np.int8)
df_input.drop(columns=["fetal_health"], inplace=True)

Train test split¶

In [8]:
df_input_train, df_input_test = train_test_split(
    df_input,
    test_size=test_size,
    stratify=df_input[target_col],
    random_state=RANDOM_SEED,
)
In [9]:
pd.concat([
    pd.Series(target_classes_dict, name="label"),
    df_input_train[target_col].value_counts(dropna=False, normalize=False).rename("train_target_count"),
    df_input_train[target_col].value_counts(dropna=False, normalize=True).rename("train_target_pct"),
    df_input_test[target_col].value_counts(dropna=False, normalize=False).rename("test_target_count"),
    df_input_test[target_col].value_counts(dropna=False, normalize=True).rename("test_target_pct"),
], axis=1)
Out[9]:
       label  train_target_count  train_target_pct  test_target_count  test_target_pct
0 Not Normal                 377              0.22                 94             0.22
1     Normal                1323              0.78                332             0.78
In [10]:
describe_input_features(df_input, df_input_train, df_input_test)
Out[10]:
data_type count null_count min 25% 50% 75% max std mean mean_train mean_test train_test_pct_diff
baseline_value numeric 2126 0 106.00 126.00 133.00 140.00 160.00 9.84 133.30 133.23 133.60 0.00
accelerations numeric 2126 0 0.00 0.00 0.00 0.01 0.02 0.00 0.00 0.00 0.00 -0.01
fetal_movement numeric 2126 0 0.00 0.00 0.00 0.00 0.48 0.05 0.01 0.01 0.01 0.11
uterine_contractions numeric 2126 0 0.00 0.00 0.00 0.01 0.01 0.00 0.00 0.00 0.00 0.01
light_decelerations numeric 2126 0 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 -0.05
severe_decelerations numeric 2126 0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.99
prolongued_decelerations numeric 2126 0 0.00 0.00 0.00 0.00 0.01 0.00 0.00 0.00 0.00 0.03
abnormal_short_term_variability numeric 2126 0 12.00 32.00 49.00 61.00 87.00 17.19 46.99 47.06 46.70 -0.01
mean_value_of_short_term_variability numeric 2126 0 0.20 0.70 1.20 1.70 7.00 0.88 1.33 1.34 1.30 -0.03
percentage_of_time_with_abnormal_long_term_variability numeric 2126 0 0.00 0.00 0.00 11.00 91.00 18.40 9.85 9.72 10.34 0.06
mean_value_of_long_term_variability numeric 2126 0 0.00 4.60 7.40 10.80 50.70 5.63 8.19 8.31 7.71 -0.07
histogram_width numeric 2126 0 3.00 37.00 67.50 100.00 180.00 38.96 70.45 71.20 67.46 -0.05
histogram_min numeric 2126 0 50.00 67.00 93.00 120.00 159.00 29.56 93.58 92.97 96.02 0.03
histogram_max numeric 2126 0 122.00 152.00 162.00 174.00 238.00 17.94 164.03 164.16 163.48 -0.00
histogram_number_of_peaks numeric 2126 0 0.00 2.00 3.00 6.00 18.00 2.95 4.07 4.11 3.90 -0.05
histogram_number_of_zeroes numeric 2126 0 0.00 0.00 0.00 0.00 10.00 0.71 0.32 0.33 0.31 -0.04
histogram_mode numeric 2126 0 60.00 129.00 139.00 148.00 187.00 16.38 137.45 137.50 137.25 -0.00
histogram_mean numeric 2126 0 73.00 125.00 136.00 145.00 182.00 15.59 134.61 134.61 134.61 -0.00
histogram_median numeric 2126 0 77.00 129.00 139.00 148.00 186.00 14.47 138.09 138.05 138.24 0.00
histogram_variance numeric 2126 0 0.00 2.00 7.00 24.00 269.00 28.98 18.81 19.45 16.24 -0.16
histogram_tendency numeric 2126 0 -1.00 0.00 0.00 1.00 1.00 0.61 0.32 0.32 0.30 -0.07
is_normal numeric 2126 0 0.00 1.00 1.00 1.00 1.00 0.42 0.78 0.78 0.78 0.00

Scaling (Standardization)¶

In [11]:
# Standardize training and test data
stdscaler = StandardScaler()

# training data
y_train = df_input_train[target_col]
X_train_all = (
    pd.DataFrame(
        # fit scaler on training data (and then transform training data)
        data=stdscaler.fit_transform(df_input_train),
        columns=df_input_train.columns,
        index=df_input_train.index
    )
    # remove target from the model input features table
    .drop(columns=[target_col])
)

# test data
y_test = df_input_test[target_col]
X_test_all = (
    pd.DataFrame(
        # use scaler fitted on training data to transform test data
        data=stdscaler.transform(df_input_test),
        columns=df_input_test.columns,
        index=df_input_test.index
    )
    # remove target from the model input features table
    .drop(columns=[target_col])
)
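
As a quick sanity check (a sketch, not part of the original pipeline): StandardScaler fits z = (x − μ_train) / σ_train, so the standardized training features should have column-wise mean ≈ 0 and population standard deviation ≈ 1.

# Sanity-check sketch (not in the original notebook): StandardScaler uses the
# population standard deviation (ddof=0), so these hold to float precision.
assert np.allclose(X_train_all.mean(), 0.0)
assert np.allclose(X_train_all.std(ddof=0), 1.0)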

3. Exploratory Data Analysis (EDA)¶

Boxplots by Target Class¶

In [12]:
display(
    plot_boxplot_by_class(
        df_input=df_input_train,  # use only training data to avoid bias in test results
        class_col=target_col,
        class_mapping=target_classes_dict,
        plots_per_line=6,
        title="Features in input dataset",
    )
)
[Figure: boxplots of each input feature, split by target class]

Pearson's Correlation¶

In [13]:
display(
    plot_correlation_matrix(
        # use only training data to avoid bias in test results
        df=df_input_train, method="pearson", fig_height=10
    )
)
[Figure: Pearson correlation matrix of the training data]
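
The plot_correlation_matrix helper is part of this project's utils; a minimal, assumed equivalent using pandas and seaborn directly (a sketch, not the helper's actual implementation):

# Sketch of an equivalent correlation heatmap using pandas + seaborn directly
import matplotlib.pyplot as plt

corr = df_input_train.corr(method="pearson")
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", center=0, ax=ax)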

4. Feature Selection¶

In [14]:
fs_steps = {
    "manual": {
        "cols_to_exclude": [
            "severe_decelerations",
        ]
    },
    "null_variance": None,
    "correlation": {"threshold": 0.8},
    "vif": {"threshold": 2},
    "l1_regularization": {
        "problem_type": "classification",
        "train_test_split_params": {"test_size": test_size},
        "logspace_search": {"start": -5, "stop": 1, "num": 20, "base": 10},
        # tolerance over minimum error with which to search for the best model
        "error_tolerance_pct": 0.05,
        # minimum features to keep in final selection
        "min_feats_to_keep": 4,
        "random_seed": RANDOM_SEED,
    },
}
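
The l1_regularization step runs inside run_feature_selection_steps; a hedged sketch of the sweep it is configured to perform, assuming it keeps the features whose coefficients remain non-zero:

# Hedged sketch of the configured L1 sweep (assumed behaviour of
# run_feature_selection_steps, not its actual code): fit an L1-penalized
# logistic regression for each C on the logspace grid and record which
# features survive with non-zero coefficients.
for C in np.logspace(start=-5, stop=1, num=20, base=10):
    clf = LogisticRegression(penalty="l1", solver="liblinear", C=C, random_state=RANDOM_SEED)
    clf.fit(X_train_all, y_train)
    surviving_feats = X_train_all.columns[clf.coef_[0] != 0].tolist()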
In [15]:
selected_feats, df_fs = run_feature_selection_steps(
    # use only training data to avoid bias in test results
    X=X_train_all,
    y=y_train,
    fs_steps=fs_steps
)
21:11:45 [INFO] --> Starting Feature Selection with 21 features
21:11:45 [INFO] 1. MANUAL FILTER
21:11:45 [INFO]  - Removing 1 (4.8%) feature(s) manually: ['severe_decelerations']
21:11:45 [INFO] 2. NULL_VARIANCE FILTER
21:11:45 [INFO]  - Removing 0 (0.0%) feature(s) with null variance (var == 0): []
21:11:45 [INFO] 3. CORRELATION FILTER
21:11:45 [INFO]   Running Correlation filter with threshold of 0.8
21:11:45 [INFO]  - Removing feature 'histogram_median' with correlation +0.9523 to 'histogram_mean'
21:11:45 [INFO]  - Removing feature 'histogram_mean' with correlation +0.8937 to 'histogram_mode'
21:11:45 [INFO]  - Removing feature 'histogram_width' with correlation -0.9006 to 'histogram_min'
21:11:45 [INFO]  - Removing 3 (15.0%) feature(s) with abs(correlation) > 0.8
21:11:45 [INFO] 4. VIF FILTER
21:11:45 [INFO] Computing the Variance Inflation Factor (VIF) for 17 features...
21:11:45 [INFO]   1. Removing feature: histogram_min .......................................... VIF: 4.77
21:11:45 [INFO]   2. Removing feature: histogram_mode ......................................... VIF: 4.14
21:11:45 [INFO]   3. Removing feature: histogram_max .......................................... VIF: 3.18
21:11:45 [INFO]   4. Removing feature: light_decelerations .................................... VIF: 2.57
21:11:45 [INFO]   5. Removing feature: mean_value_of_short_term_variability ................... VIF: 2.11
21:11:45 [INFO]   >> Stopping at feat: histogram_variance ..................................... VIF: 1.80  (threshold: 2)
21:11:45 [INFO]  - Removing 5 (29.4%) feature(s) with VIF >= 2
21:11:45 [INFO] 5. L1_REGULARIZATION FILTER
21:11:45 [INFO]  - Removing 7 (58.3%) feature(s) with null coefficient after L1 regularization: ['baseline_value', 'fetal_movement', 'mean_value_of_long_term_variability', 'histogram_number_of_peaks', 'histogram_number_of_zeroes', 'histogram_variance', 'histogram_tendency']
21:11:45 [INFO] --> Completed Feature Selection with 5 selected features (23.8% of the original 21 features): ['accelerations', 'uterine_contractions', 'prolongued_decelerations', 'abnormal_short_term_variability', 'percentage_of_time_with_abnormal_long_term_variability']
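
The VIF step logged above removes one feature at a time; a minimal sketch of the assumed loop:

# Minimal sketch of the iterative VIF filter (assumed to match the logged
# behaviour): repeatedly drop the feature with the highest VIF until every
# remaining feature falls below the threshold.
def vif_filter(X: pd.DataFrame, threshold: float = 2.0) -> list:
    cols = list(X.columns)
    while len(cols) > 1:
        vifs = pd.Series(
            [variance_inflation_factor(X[cols].values, i) for i in range(len(cols))],
            index=cols,
        )
        if vifs.max() < threshold:
            break
        cols.remove(vifs.idxmax())
    return cols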
In [16]:
# build model input datasets
X_train = X_train_all[selected_feats]
X_test = X_test_all[selected_feats]

Correlation check¶

In [17]:
display(
    plot_correlation_matrix(
        # use only training data to avoid bias in test results
        df=df_input_train[selected_feats + [target_col]], method="pearson", fig_height=5
    )
)
[Figure: Pearson correlation matrix of the selected features and the target]

Multicollinearity check¶

In [18]:
# compute the Variance Inflation Factor (VIF) for each feature
df_vif = pd.DataFrame(
    data=[variance_inflation_factor(X_train.values, i) for i in range(len(selected_feats))],
    index=selected_feats,
    columns=["VIF"]
).sort_values("VIF", ascending=False)

df_vif
Out[18]:
VIF
percentage_of_time_with_abnormal_long_term_variability 1.53
abnormal_short_term_variability 1.33
accelerations 1.22
uterine_contractions 1.12
prolongued_decelerations 1.07

5. Classifier Model¶

Select classifier: Logistic Regression or XGBoost¶

In [19]:
MODEL_SELECTION = "logistic_regression"
# MODEL_SELECTION = "xgboost"

model_selection_error = ValueError(
    "'MODEL_SELECTION' must be either 'logistic_regression' or 'xgboost'. "
    f"Got {MODEL_SELECTION} instead."
)

Hyperparameter tuning with K-Fold Cross Validation¶

  • Logistic Regression: in binary classification with imbalanced classes, avoid setting class_weight="balanced" if you want to interpret the model's predicted probabilities as estimates of the true class probabilities, i.e. as "the actual probability that the sample belongs to the class". In that case, do not use 50% as the classification threshold; instead, find the optimal threshold from the ROC Curve (detailed below) to maximize the model's performance. A sketch of what "balanced" weighting would do follows this list.

  • XGBoost: For a detailed explanation of XGBoost's parameters, refer to: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
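
For reference, this is how class_weight="balanced" would reweight the two classes here, following scikit-learn's documented formula w_c = n_samples / (n_classes * n_samples_c) (a sketch, not code from this notebook):

# Sketch of scikit-learn's "balanced" class weighting (not used here): the
# minority class gets the larger weight, which inflates its predicted
# probabilities; that is why class_weight is kept at None below.
class_counts = np.bincount(y_train)                   # [377, 1323] on this training split
balanced_weights = len(y_train) / (2 * class_counts)  # larger weight for class 0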

In [20]:
if MODEL_SELECTION == "logistic_regression":
    Estimator = LogisticRegression
    cv_search_space = {
        # NOTE: "l1" and "elasticnet" require solver="liblinear" or "saga";
        # with the default "lbfgs" solver those candidate fits fail and are
        # scored as NaN by GridSearchCV, so only the "l2" models compete here.
        "penalty": ["l1", "l2", "elasticnet"],
        "C": np.logspace(-3, 1, num=9, base=10.0),
        "class_weight": [None],
    }
elif MODEL_SELECTION == "xgboost":
    Estimator = XGBClassifier
    cv_search_space = {
        "objective": ["binary:logistic"],
        "n_estimators": [30, 40, 50],
        "learning_rate": [0.1],
        "max_depth": [3, 4, 6],
        "min_child_weight": [2, 4],
        "gamma": [0, 0.5],
        "alpha": [0, 0.3],
        "scale_pos_weight": [1],
        "lambda": [1],
        ## "subsample": [0.8, 1.0],
        ## "colsample_bytree": [0.8, 1.0],
        "verbosity": [0],
    }
else:
    raise model_selection_error

For the full list of scikit-learn's scoring string names, refer to: https://scikit-learn.org/stable/modules/model_evaluation.html#string-name-scorers

In [21]:
cv_scoring_metrics = {
    "roc_auc": "ROC AUC",
    "accuracy": "Accuracy",
    "precision": "Precision",
    "recall": "Recall",
    "f1": "F1 Score",
}
refit_metric = "f1"  # metric to optimize for the final model
In [22]:
%%time
# define evaluation (with n_repeats=1 this is equivalent to a single stratified 3-fold CV)
kfold_cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_SEED)
# define search
grid_search = GridSearchCV(
    estimator=Estimator(),
    param_grid=cv_search_space,
    scoring=list(cv_scoring_metrics.keys()),
    cv=kfold_cv,
    refit=refit_metric,
    verbose=1,
)
# execute search
result_cv = grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 27 candidates, totalling 81 fits
CPU times: user 220 ms, sys: 15.6 ms, total: 236 ms
Wall time: 284 ms
In [23]:
print("Grid Search CV Best Model - Scoring Metrics:")
for i, (metric_key, metric_name) in enumerate(cv_scoring_metrics.items(), start=1):
    print(
        f"  {str(i) + ".":>2} {metric_name:.<10} "
        f"{result_cv.cv_results_[f"mean_test_{metric_key}"][result_cv.best_index_]:.3f}"
    )
print(f"\nBest Hyperparameters: {result_cv.best_params_}")
Grid Search CV Best Model - Scoring Metrics:
  1. ROC AUC... 0.950
  2. Accuracy.. 0.900
  3. Precision. 0.925
  4. Recall.... 0.949
  5. F1 Score.. 0.937

Best Hyperparameters: {'C': np.float64(1.0), 'class_weight': None, 'penalty': 'l2'}

Final Model¶

In [24]:
# instantiate model with best hyperparameters and additional kwargs
if MODEL_SELECTION == "logistic_regression":
    model_kwargs = dict()
    model_fit_kwargs = dict()
elif MODEL_SELECTION == "xgboost":
    eval_metrics = dict(
        logloss="Binary Cross-entropy Loss (Log-loss)",
        error="Binary Classification Error Rate",
        auc="ROC AUC",
    )
    model_kwargs = dict(eval_metric=list(eval_metrics.keys()))
    model_fit_kwargs = dict(
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=False
    )
else:
    raise model_selection_error
    
model = Estimator(**result_cv.best_params_, **model_kwargs, random_state=RANDOM_SEED)
In [25]:
# Fit model and make predictions
model.fit(X_train, y_train, **model_fit_kwargs)
# Make predictions ([:, 1] returns the probability of the positive class)
y_pred_proba_train = pd.Series(data=model.predict_proba(X_train)[:, 1], index=X_train.index)
y_pred_proba = pd.Series(data=model.predict_proba(X_test)[:, 1], index=X_test.index)
In [26]:
if MODEL_SELECTION == "xgboost":
    display(plot_eval_metrics_xgb(model.evals_result(), eval_metrics))

Plot the target rate per group of predicted probability

A good model should show a monotonically increasing target rate across groups of predicted probability (e.g., quartiles or deciles).
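
The plot_target_rate helper builds this view; a minimal, assumed equivalent of its underlying table:

# Sketch of a target-rate-by-decile table (assumed equivalent of the helper):
# bin the predicted probabilities into deciles and compute the observed target
# rate within each bin.
proba_deciles = pd.qcut(y_pred_proba, q=10, duplicates="drop")
target_rate_per_decile = y_test.groupby(proba_deciles, observed=True).mean()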

In [27]:
display(plot_target_rate(y_test, y_pred_proba))
[Figure: observed target rate per group of predicted probability]

Define the optimal threshold for separating classes using the ROC Curve

The optimal threshold is the one that maximizes the difference between the True Positive Rate (TPR) and the False Positive Rate (FPR), i.e. Youden's J statistic.
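
A minimal sketch of this computation with scikit-learn, assumed to mirror what plot_roc_curve does internally when return_optimal_thresh=True:

# Sketch of the optimal-threshold search (Youden's J statistic): pick the
# threshold that maximizes TPR - FPR on the training data.
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train, y_pred_proba_train)
optimal_thresh_sketch = thresholds[np.argmax(tpr - fpr)]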

In [28]:
# use only training data to get optimal threshold to avoid bias in test results
fig_roc_curve, optimal_thresh = plot_roc_curve(
    y_true=y_train,
    y_pred_proba=y_pred_proba_train,
    title="ROC Curve on Training Data (for finding the Optimal Threshold)",
    return_optimal_thresh=True
)
display(fig_roc_curve)
[Figure: ROC Curve on training data, with the optimal threshold highlighted]
In [29]:
# compute binary predictions
print(f"Optimal Threshold for Classification: {100*optimal_thresh:.2f}%")
y_pred_train = (y_pred_proba_train > optimal_thresh).astype(int)
y_pred = (y_pred_proba > optimal_thresh).astype(int)
Optimal Threshold for Classification: 77.12%

Feature Importance¶

  • For Logistic Regression: coefficient values and statistical significance (an odds-ratio view is sketched after this list)
  • For XGBoost: SHAP analysis and the Gain metric
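
Because the inputs are standardized, each logistic regression coefficient is the change in log-odds per one-standard-deviation increase in that feature. An odds-ratio view (a sketch, not one of the project helpers):

# Odds-ratio sketch: exp(coef) is the multiplicative change in the odds of
# "Normal" per one-standard-deviation increase in the feature.
if MODEL_SELECTION == "logistic_regression":
    odds_ratios = pd.Series(np.exp(model.coef_[0]), index=X_train.columns).sort_values()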
In [30]:
if MODEL_SELECTION == "logistic_regression":
    df_coefficients = build_coefficients_table(
        coefficients=model.coef_[0],
        intercept=model.intercept_[0],
        X_train=X_train,
        y_pred_train=y_pred_proba_train,
        y_train=y_train,
        problem_type="classification",
    )
    display(
        plot_coefficients_values(df_coefficients),
        plot_coefficients_significance(
            df_coefficients,
            log_scale=bool(df_coefficients["p-values"].max() < 2e-4)
        )
    )
    
elif MODEL_SELECTION == "xgboost":
    # compute SHAP values
    explainer = shap.Explainer(model)
    shap_values = explainer(X_test)
    # shap plots
    display(
        plot_shap_importance(shap_values),
        plot_shap_beeswarm(shap_values),
        plot_gain_metric_xgb(model, X_test),
    )

else:
    raise model_selection_error
[Figure: logistic regression coefficient values]
[Figure: statistical significance (p-values) of the coefficients]

Performance Metrics¶

In [31]:
df_train_metrics = pd.Series(
    compute_binary_classification_metrics(y_train, y_pred_train, y_pred_proba_train)
).to_frame(name="Train Metrics")
df_test_metrics = pd.Series(
    compute_binary_classification_metrics(y_test, y_pred, y_pred_proba)
).to_frame(name="Test Metrics")

print("Final Model - Scoring Metrics on Train & Test Datasets:")
df_metrics = df_train_metrics.join(df_test_metrics)
display(df_metrics)
Final Model - Scoring Metrics on Train & Test Datasets:
           Train Metrics  Test Metrics
Accuracy            0.87          0.85
Precision           0.97          0.96
Recall              0.86          0.84
F1 Score            0.91          0.90
ROC AUC             0.95          0.94
GINI                0.90          0.88
KS Gain             0.75          0.71
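
The GINI coefficient above is a linear rescaling of ROC AUC (GINI = 2·AUC − 1, so a test AUC of 0.94 gives a GINI of 0.88). A quick check, assuming compute_binary_classification_metrics uses the standard scikit-learn metrics:

# Sanity-check sketch of the AUC/GINI relationship (assumes the helper wraps
# scikit-learn's roc_auc_score).
from sklearn.metrics import roc_auc_score

auc_test = roc_auc_score(y_test, y_pred_proba)
gini_test = 2 * auc_test - 1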

Confusion Matrix¶

In [32]:
# Confusion Matrix
display(
    plot_confusion_matrix(
        y_test,
        y_pred,
        estimator=model,
        target_classes_dict=target_classes_dict,
        normalize="true",
    )
)
[Figure: confusion matrix on test data, normalized by true class]
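
With normalize="true", each row (true class) is scaled to sum to 1, so the diagonal shows per-class recall. An assumed scikit-learn equivalent of the helper (a sketch):

# Sketch of an equivalent confusion matrix plot with scikit-learn directly:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=[target_classes_dict[c] for c in sorted(target_classes_dict)],
    normalize="true",
)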

ROC AUC¶

In [33]:
display(plot_roc_curve(y_test, y_pred_proba))
[Figure: ROC Curve on test data]

KS Gain¶

In [34]:
df_ks, ks_score = build_ks_table(y_test, y_pred_proba, return_ks=True)
print(f"KS score: {ks_score * 100:.2f} p.p.")
beautify_ks_table(df_ks)
KS score: 70.65 p.p.
Out[34]:
   min score  max score  n positives  n negatives  n all  positive rate  negative rate  cum positives  cum negatives  cumpct positives  cumpct negatives  diff KS Gain
0       0.00       0.19            3           40     43          6.98%         93.02%              3             40             0.90%            42.55%      41.65 pp
1       0.20       0.43           18           25     43         41.86%         58.14%             21             65             6.33%            69.15%      62.82 pp
2       0.44       0.74           27           15     42         64.29%         35.71%             48             80            14.46%            85.11%      70.65 pp  <--
3       0.74       0.90           35            8     43         81.40%         18.60%             83             88            25.00%            93.62%      68.62 pp
4       0.90       0.97           38            4     42         90.48%          9.52%            121             92            36.45%            97.87%      61.43 pp
5       0.97       0.99           42            1     43         97.67%          2.33%            163             93            49.10%            98.94%      49.84 pp
6       0.99       0.99           41            1     42         97.62%          2.38%            204             94            61.45%           100.00%      38.55 pp
7       0.99       1.00           43            0     43        100.00%          0.00%            247             94            74.40%           100.00%      25.60 pp
8       1.00       1.00           42            0     42        100.00%          0.00%            289             94            87.05%           100.00%      12.95 pp
9       1.00       1.00           43            0     43        100.00%          0.00%            332             94           100.00%           100.00%       0.00 pp
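
The KS statistic is the maximum gap between the cumulative score distributions of positives and negatives (the row marked "<--" above). A direct, unbinned two-sample check (a sketch; build_ks_table works on score bins, so values can differ slightly):

# Direct KS check via scipy (a sketch, not the project's binned build_ks_table):
from scipy.stats import ks_2samp

ks_direct = ks_2samp(y_pred_proba[y_test == 1], y_pred_proba[y_test == 0]).statistic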
In [35]:
display(plot_ks_table(df_ks))
[Figure: KS Gain plot]