Regression¶
import os
import warnings
import logging
# configure logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
# get warning filter policy from the environment variables
# set to "ignore" for rendering the HTMLs, or to "once" otherwise
WARNING_FILTER_POLICY = os.getenv("WARNING_FILTER_POLICY", "once")
logger.info(f"{WARNING_FILTER_POLICY = }")
warnings.filterwarnings(WARNING_FILTER_POLICY)
21:12:52 [INFO] WARNING_FILTER_POLICY = 'ignore'
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
pd.set_option("display.max_columns", None)
pd.options.display.float_format = "{:,.2f}".format
from utils.constants import RANDOM_SEED
from utils.common import (
get_data_folder_path,
set_plotting_config,
plot_boxplot_by_class,
plot_correlation_matrix,
)
from utils.evals import (
describe_input_features,
compute_regression_metrics,
build_coefficients_table,
plot_coefficients_values,
plot_coefficients_significance,
plot_eval_metrics_xgb,
plot_gain_metric_xgb,
plot_shap_importance,
plot_shap_beeswarm,
)
from utils.feature_selection import run_feature_selection_steps
# plots configuration
sns.set_style("darkgrid")
sns.set_palette("colorblind")
set_plotting_config()
%matplotlib inline
1. Load Data¶
In this notebook, we will use the Medical Insurance Payout Dataset. This dataset contains historical data for over 1300 insurance customers (age, sex, BMI, number of children, smoking habits, and region) along with their actual medical charges. i.e., the expenditure for the customer (target variable).
Sources:
data_path = get_data_folder_path()
df_input = pd.read_csv(os.path.join(data_path, "expenses.csv"))
# convert categorical columns into numerical
df_input["is_male"] = (df_input["sex"] == "male").astype(np.int8)
df_input["is_smoker"] = (df_input["smoker"] == "yes").astype(np.int8)
df_input = (
pd.concat([
df_input.drop(columns=["sex", "smoker", "region"]),
pd.get_dummies(df_input["region"], prefix="region", dtype=np.int8)
], axis=1)
)
2. Process Data¶
Target column¶
The target column is the medical charges for each customer. We want to build a model capable of predicting medical charges for new customers in order to help the insurance company to determine their pricing strategy.
target_col = "charges"
test_size = 0.20
Train test split¶
df_input_train, df_input_test = train_test_split(
df_input,
test_size=test_size,
random_state=RANDOM_SEED,
)
describe_input_features(df_input, df_input_train, df_input_test)
data_type | count | null_count | min | 25% | 50% | 75% | max | std | mean | mean_train | mean_test | train_test_pct_diff | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
age | numeric | 1338 | 0 | 18.00 | 27.00 | 39.00 | 51.00 | 64.00 | 14.05 | 39.21 | 39.36 | 38.61 | -0.02 |
bmi | numeric | 1338 | 0 | 15.96 | 26.30 | 30.40 | 34.69 | 53.13 | 6.10 | 30.66 | 30.56 | 31.07 | 0.02 |
children | numeric | 1338 | 0 | 0.00 | 0.00 | 1.00 | 2.00 | 5.00 | 1.21 | 1.09 | 1.11 | 1.04 | -0.06 |
charges | numeric | 1338 | 0 | 1,121.87 | 4,740.29 | 9,382.03 | 16,639.91 | 63,770.43 | 12,110.01 | 13,270.42 | 13,346.09 | 12,968.32 | -0.03 |
is_male | numeric | 1338 | 0 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | 0.50 | 0.51 | 0.51 | 0.48 | -0.07 |
is_smoker | numeric | 1338 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.40 | 0.20 | 0.21 | 0.20 | -0.02 |
region_northeast | numeric | 1338 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.43 | 0.24 | 0.25 | 0.21 | -0.15 |
region_northwest | numeric | 1338 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.43 | 0.24 | 0.24 | 0.26 | 0.08 |
region_southeast | numeric | 1338 | 0 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | 0.45 | 0.27 | 0.26 | 0.30 | 0.14 |
region_southwest | numeric | 1338 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.43 | 0.24 | 0.25 | 0.23 | -0.08 |
Scaling (Standardization)¶
# Standardize training and test data
stdscaler = StandardScaler()
# training data
y_train = df_input_train[target_col]
X_train_all = (
pd.DataFrame(
# fit scaler on training data (and then transform training data)
data=stdscaler.fit_transform(df_input_train),
columns=df_input_train.columns,
index=df_input_train.index
)
# remove target from the model input features table
.drop(columns=[target_col])
)
# test data
y_test = df_input_test[target_col]
X_test_all = (
pd.DataFrame(
# use scaler fitted on training data to transform test data
data=stdscaler.transform(df_input_test),
columns=df_input_test.columns,
index=df_input_test.index
)
# remove target from the model input features table
.drop(columns=[target_col])
)
3. Exploratory Data Analysis (EDA)¶
Boxplots by Target Class¶
# use only training data to avoid bias in test results
df_boxplot = df_input_train.copy()
# get target quartiles
df_boxplot["charges_quartiles"] = pd.qcut(
df_boxplot["charges"], q=4, labels=[f"Q{i}" for i in range(1, 5)],
)
display(
plot_boxplot_by_class(
df_input=df_boxplot,
class_col="charges_quartiles",
plots_per_line=4,
title="Features in input dataset by medical charges quartiles",
)
)
Pearson's Correlation¶
display(
plot_correlation_matrix(
# use only training data to avoid bias in test results
df=df_input_train, method="pearson", fig_height=10
)
)
4. Feature Selection¶
fs_steps = {
"manual": {
"cols_to_exclude": []
},
"null_variance": None,
"correlation": {"threshold": 0.8},
"vif": {"threshold": 2},
"l1_regularization": {
"problem_type": "regression",
"train_test_split_params": {"test_size": test_size},
"logspace_search": {"start": -3, "stop": 3, "num": 20, "base": 10},
# tolerance over minimum error with which to search for the best model
"error_tolerance_pct": 0.05,
# minimum features to keep in final selection
"min_feats_to_keep": 4,
"random_seed": RANDOM_SEED,
},
}
selected_feats, df_fs = run_feature_selection_steps(
# use only training data to avoid bias in test results
X=X_train_all,
y=y_train,
fs_steps=fs_steps
)
21:12:56 [INFO] --> Starting Feature Selection with 9 features
21:12:56 [INFO] 1. MANUAL FILTER
21:12:56 [INFO] - Removing 0 (0.0%) feature(s) manually: []
21:12:56 [INFO] 2. NULL_VARIANCE FILTER
21:12:56 [INFO] - Removing 0 (0.0%) feature(s) with null variance (var == 0): []
21:12:56 [INFO] 3. CORRELATION FILTER
21:12:56 [INFO] Running Correlation filter with threshold of 0.8
21:12:56 [INFO] - Removing 0 (0.0%) feature(s) with abs(correlation) > 0.8
21:12:56 [INFO] 4. VIF FILTER
21:12:56 [INFO] Computing the Variance Inflation Factor (VIF) for 9 features...
21:12:56 [INFO] 1. Removing feature: region_northeast . VIF: inf
21:12:56 [INFO] >> Stopping at feat: region_southeast . VIF: 1.61 (threshold: 2)
21:12:56 [INFO] - Removing 1 (11.1%) feature(s) with VIF >= 2
21:12:56 [INFO] 5. L1_REGULARIZATION FILTER
21:12:56 [INFO] - Removing 4 (50.0%) feature(s) with null coefficient after L1 regularization: ['is_male', 'region_northwest', 'region_southeast', 'region_southwest']
21:12:56 [INFO] --> Completed Feature Selection with 4 selected features (44.4% of the original 9 features): ['age', 'bmi', 'children', 'is_smoker']
# build model input datasets
X_train = X_train_all[selected_feats]
X_test = X_test_all[selected_feats]
Correlation check¶
display(
plot_correlation_matrix(
# use only training data to avoid bias in test results
df=df_input_train[selected_feats + [target_col]], method="pearson", fig_height=5
)
)
Multicollinearity check¶
# compute the Variance Inflation Factor (VIF) for each feature
df_vif = pd.DataFrame(
data=[variance_inflation_factor(X_train.values, i) for i in range(len(selected_feats))],
index=selected_feats,
columns=["VIF"]
).sort_values("VIF", ascending=False)
df_vif
VIF | |
---|---|
age | 1.02 |
bmi | 1.01 |
children | 1.00 |
is_smoker | 1.00 |
5. Regression Model¶
Select regressor: Linear Regression or XGBoost¶
MODEL_SELECTION = "linear_regression"
# MODEL_SELECTION = "xgboost"
model_selection_error = ValueError(
"'MODEL_SELECTION' must be either 'linear_regression' or 'xgboost'. "
f"Got {MODEL_SELECTION} instead."
)
Hyperparameter tuning with K-Fold Cross Validation¶
For a detailed explanation of XGBoost's parameters, refer to: https://www.kaggle.com/code/prashant111/a-guide-on-xgboost-hyperparameters-tuning/notebook
if MODEL_SELECTION == "linear_regression":
# ElasticNet is a linear regression model with combined L1 (Lasso)
# and L2 (Ridge) priors as regularizer
Estimator = ElasticNet
cv_search_space = {
"alpha": np.logspace(-4, 1, num=11, base=10.0), # 10e-4 to 10 in 11 steps
"l1_ratio": np.linspace(0,1,9), # 0%, 12.5%, 25%, ... 100%
"fit_intercept": [True],
"max_iter": [2000], # use 2000 instead of defalult 1000
}
elif MODEL_SELECTION == "xgboost":
Estimator = XGBRegressor
cv_search_space = {
"objective": ["reg:squarederror"],
"n_estimators": [20, 35, 50],
"learning_rate": [0.1],
"max_depth": [3, 4, 6],
"min_child_weight": [2, 4],
"gamma": [0, 0.5],
"alpha": [0, 0.3],
"scale_pos_weight": [1],
"lambda": [1],
## "subsample": [0.8, 1.0],
## "colsample_bytree": [0.8, 1.0],
"verbosity": [0],
}
else:
raise model_selection_error
For the full list of scikit-learn's scoring string names, refer to: https://scikit-learn.org/stable/modules/model_evaluation.html#string-name-scorers
cv_scoring_metrics = {
"neg_mean_absolute_error": "Mean Absolute Error",
"neg_median_absolute_error": "Median Absolute Error",
"neg_mean_squared_error": "Mean Squared Error",
"neg_root_mean_squared_error": "Root Mean Squared Error",
"neg_max_error": "Maximum Residual Error",
"r2": "R-squared (Coefficient of Determination)",
}
refit_metric = "neg_root_mean_squared_error" # metric to optimize for the final model
%%time
# define evaluation
kfold_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_SEED)
# define search
grid_search = GridSearchCV(
estimator=Estimator(),
param_grid=cv_search_space,
scoring=list(cv_scoring_metrics.keys()),
cv=kfold_cv,
refit=refit_metric,
verbose=1,
)
# execute search
result_cv = grid_search.fit(X_train, y_train)
Fitting 3 folds for each of 99 candidates, totalling 297 fits
CPU times: user 768 ms, sys: 4.15 ms, total: 772 ms Wall time: 773 ms
print("Grid Search CV Best Model - Scoring Metrics:")
for i, (metric_key, metric_name) in enumerate(cv_scoring_metrics.items(), start=1):
print(
f" {str(i) + ".":>2} {metric_name:.<42} "
f"{abs(result_cv.cv_results_[f"mean_test_{metric_key}"][result_cv.best_index_]):,.3f}"
)
print(f"\nBest Hyperparameters: {result_cv.best_params_}")
Grid Search CV Best Model - Scoring Metrics: 1. Mean Absolute Error....................... 4,224.524 2. Median Absolute Error..................... 2,523.168 3. Mean Squared Error........................ 37,706,718.713 4. Root Mean Squared Error................... 6,128.426 5. Maximum Residual Error.................... 26,727.877 6. R-squared (Coefficient of Determination).. 0.738 Best Hyperparameters: {'alpha': np.float64(0.0001), 'fit_intercept': True, 'l1_ratio': np.float64(0.0), 'max_iter': 2000}
Final Model¶
# instantiate model with best hyperparameters and additional kwargs
if MODEL_SELECTION == "linear_regression":
model_kwargs = dict()
model_fit_kwargs = dict()
elif MODEL_SELECTION == "xgboost":
eval_metrics = dict(
rmse="Root Mean Squared Error",
mae="Mean Absolute Error",
mape="Mean Absolute Percentage Error",
)
model_kwargs = dict(eval_metric=list(eval_metrics.keys()))
model_fit_kwargs = dict(
eval_set=[(X_train, y_train), (X_test, y_test)],
verbose=False
)
else:
raise model_selection_error
model = Estimator(**result_cv.best_params_, **model_kwargs, random_state=RANDOM_SEED)
# Fit model and make predictions
model.fit(X_train, y_train, **model_fit_kwargs)
# Make predictions
y_pred_train = pd.Series(data=model.predict(X_train), index=X_train.index)
y_pred = pd.Series(data=model.predict(X_test), index=X_test.index)
if MODEL_SELECTION == "xgboost":
display(plot_eval_metrics_xgb(model.evals_result(), eval_metrics))
Feature Importance¶
- For Linear Regression: coefficients values and statistical significance
- For XGBoost: SHAP analysis and Gain Metric
if MODEL_SELECTION == "linear_regression":
df_coefficients = build_coefficients_table(
coefficients=model.coef_,
intercept=model.intercept_,
X_train=X_train,
y_pred_train=y_pred_train,
y_train=y_train,
problem_type="regression",
)
display(
plot_coefficients_values(df_coefficients),
plot_coefficients_significance(
df_coefficients,
log_scale=bool(df_coefficients["p-values"].max() < 2e-4)
)
)
elif MODEL_SELECTION == "xgboost":
# compute SHAP values
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# shap plots
display(
plot_shap_importance(shap_values),
plot_shap_beeswarm(shap_values),
plot_gain_metric_xgb(model, X_test),
)
else:
raise model_selection_error
Performance Metrics¶
df_train_metrics = pd.Series(
compute_regression_metrics(y_train, y_pred_train)
).to_frame(name="Train Metrics")
df_test_metrics = pd.Series(
compute_regression_metrics(y_test, y_pred)
).to_frame(name="Test Metrics")
print("Final Model - Scoring Metrics on Train & Test Datasets:")
df_metrics = df_train_metrics.join(df_test_metrics)
display(df_metrics)
Final Model - Scoring Metrics on Train & Test Datasets:
Train Metrics | Test Metrics | |
---|---|---|
Mean Absolute Error | 4,210.78 | 4,213.97 |
Median Absolute Error | 2,482.14 | 2,752.37 |
Mean Squared Error | 37,369,599.55 | 33,982,267.56 |
Root Mean Squared Error | 6,113.07 | 5,829.43 |
Maximum Residual Error | 29,547.01 | 22,827.35 |
R-squared (Coefficient of Determination) | 0.74 | 0.78 |