Clustering¶

In [1]:
import os
import warnings
import logging

# configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# get warning filter policy from the environment variables
# set to "ignore" for rendering the HTMLs, or to "once" otherwise
# (the value is normalized so that e.g. " Ignore " is accepted as well)
WARNING_FILTER_POLICY = os.getenv("WARNING_FILTER_POLICY", "once").strip().lower()
logger.info(f"{WARNING_FILTER_POLICY = }")
try:
    warnings.filterwarnings(WARNING_FILTER_POLICY)
except (AssertionError, ValueError):
    # warnings.filterwarnings rejects unknown actions (with an AssertionError or a
    # ValueError, depending on the Python version); instead of failing in the very
    # first cell, fall back to the default policy and say so in the log
    logger.warning(f"invalid warning filter policy {WARNING_FILTER_POLICY!r}, falling back to 'once'")
    WARNING_FILTER_POLICY = "once"
    warnings.filterwarnings(WARNING_FILTER_POLICY)
21:12:36 [INFO] WARNING_FILTER_POLICY = 'ignore'
In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from kneed import KneeLocator
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# pandas display settings: never truncate the column list and
# render floats with thousands separators and two decimals (e.g. 1,234.57)
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.2f}".format)
In [3]:
from utils.constants import RANDOM_SEED
from utils.common import get_data_folder_path, set_plotting_config, plot_boxplot_by_class
from utils.clustering import search_kmeans, plot_kmeans_search
In [4]:
# plots configuration
sns.set_style("darkgrid")
sns.set_palette("colorblind")
set_plotting_config()
%matplotlib inline

1. Preprocessing¶

Load data¶

In this notebook, we will use the Medical Insurance Payout Dataset. This dataset contains historical data for over 1300 insurance customers (age, sex, BMI, number of children, smoking habits, and region) along with their actual medical charges, i.e., the expenditure for each customer.

Sources:

  1. Kaggle: https://www.kaggle.com/datasets/harshsingh2209/medical-insurance-payout
  2. Original source: https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv
In [5]:
# read the expenses CSV from the folder returned by get_data_folder_path()
data_path = get_data_folder_path()
expenses_csv_path = os.path.join(data_path, "expenses.csv")
df_input = pd.read_csv(expenses_csv_path)
In [6]:
# encode the categorical columns numerically:
#   - "sex" and "smoker" become the binary int8 flags "is_male" and "is_smoker"
#   - "region" is one-hot encoded into int8 "region_*" columns
# the original string columns are dropped from the result
df_input = pd.concat(
    [
        df_input
        .assign(
            is_male=df_input["sex"].eq("male").astype(np.int8),
            is_smoker=df_input["smoker"].eq("yes").astype(np.int8),
        )
        .drop(columns=["sex", "smoker", "region"]),
        pd.get_dummies(df_input["region"], prefix="region", dtype=np.int8),
    ],
    axis=1,
)
In [7]:
# define columns for clustering:
#   - drop the target column ("charges") to simulate an unsupervised problem
#   - drop the one-hot-encoded region columns to simplify the clustering process
cluster_cols = [
    col for col in df_input.columns
    if col != "charges" and not col.startswith("region_")
]
# take an explicit copy: df_cl gets a new column assigned later in the notebook,
# and writing into a plain column slice of df_input triggers pandas'
# SettingWithCopyWarning
df_cl = df_input[cluster_cols].copy()

Scale data (if necessary)¶

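If all features used for clustering have the same range (e.g. scores from 0 to 100) or the same unit (e.g. distances), there is no need to standardize the data. Here the features mix different units and ranges (age, BMI, number of children, and binary flags), so we standardize them before clustering.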

In [8]:
# standardize the clustering features with StandardScaler,
# keeping the original column names and row index
stdscaler = StandardScaler()
df_cl_std = pd.DataFrame(
    data=stdscaler.fit_transform(df_cl),
    index=df_cl.index,
    columns=df_cl.columns,
)

2. K-means Clustering¶

Find best number of clusters¶

In [9]:
# run the project's K-means search helper on the standardized features;
# the resulting frame is read below through its "n_clusters" and "wcss" columns
# NOTE(review): presumably fits one K-means model per candidate number of clusters
# up to max_n_clusters — confirm against utils.clustering.search_kmeans
df_kmeans = search_kmeans(df_cl_std, max_n_clusters=15)

Elbow Method implementation:

  • Kneedle algorithm original paper: https://www1.icsi.berkeley.edu/~barath/papers/kneedle-simplex11.pdf
  • kneed python package: https://github.com/arvkevi/kneed
In [10]:
# determine the ideal number of clusters with the "Elbow Method":
# the kneed package (Kneedle algorithm) looks for the elbow of the
# convex, decreasing WCSS-vs-number-of-clusters curve
kl = KneeLocator(
    df_kmeans["n_clusters"].values,
    df_kmeans["wcss"].values,
    curve="convex",
    direction="decreasing",
)
print(f"Elbow Method: best number of clusters is {kl.elbow}")
Elbow Method: best number of clusters is 7
In [11]:
# show the K-means search results together with the elbow selected above
# NOTE(review): plot_kmeans_search presumably returns a figure object — confirm
display(plot_kmeans_search(df_kmeans=df_kmeans, elbow=kl.elbow))
No description has been provided for this image

Fit final model¶

In [12]:
# fit K-means with the number of clusters selected by the Elbow Method
if kl.elbow is None:
    # KneeLocator leaves `elbow` as None when it cannot find a knee; fail with a
    # clear message instead of an obscure parameter-validation error from KMeans
    raise ValueError(
        "Elbow Method did not find a best number of clusters; "
        "inspect the K-means search plot and choose n_clusters manually"
    )

# cast to a plain int so the estimator repr shows e.g. n_clusters=7 rather than a numpy scalar
kmeans_model = KMeans(n_clusters=int(kl.elbow), verbose=0, random_state=RANDOM_SEED)
# fit on the feature columns only: df_cl_std gains a "cluster" column later in the
# notebook, and re-running this cell must not use previous labels as a feature
kmeans_model.fit(df_cl_std[cluster_cols])
Out[12]:
KMeans(n_clusters=np.int64(7), random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=np.int64(7), random_state=42)
In [13]:
# cluster labels, shifted so that the first cluster is 1 instead of 0;
# "labels_ + 1" builds a new array, so the fitted model's own labels_ stay 0-based
# (an in-place "+=" on a Series wrapping labels_ can write through to the model's
# array on pandas versions without copy-on-write)
s_clusters = pd.Series(data=kmeans_model.labels_ + 1, name="cluster", index=df_cl_std.index)

# attach the labels by building new frames (aligned on the index) instead of
# writing into the existing ones, so no warnings have to be silenced
df_cl_std = df_cl_std.assign(cluster=s_clusters)
df_cl = df_cl.assign(cluster=s_clusters)
df_input = df_input.assign(cluster=s_clusters)

Describe clusters¶

In [14]:
# describe the clusters using only the features that went into K-means:
# df_cl holds the clustering columns plus the "cluster" label column
# NOTE(review): plot_boxplot_by_class presumably draws one boxplot per column,
# grouped by class_col, with plots_per_line panels per row — confirm in utils.common
display(
    plot_boxplot_by_class(
        df_input=df_cl,
        class_col="cluster",
        plots_per_line=2,
        title="Features used in K-means Clustering",
    )
)
No description has been provided for this image
In [15]:
# describe the clusters using every column of the input dataset, i.e. including
# the columns left out of the clustering ("charges" and the "region_*" dummies)
# NOTE(review): plot_boxplot_by_class presumably draws one boxplot per column,
# grouped by class_col — confirm in utils.common
display(
    plot_boxplot_by_class(
        df_input=df_input,
        class_col="cluster",
        plots_per_line=2,
        title="All features from input dataset",
    )
)
No description has been provided for this image
In [ ]:
 
In [ ]: