import os
import warnings
import logging

# set up INFO-level logging with a short time stamp and the level name on every record
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The warning filter action is read from the environment:
# "ignore" is intended for rendering the HTMLs, "once" (the default) for everything else.
WARNING_FILTER_POLICY = os.environ.get("WARNING_FILTER_POLICY", "once")
logger.info(f"{WARNING_FILTER_POLICY = }")
warnings.filterwarnings(action=WARNING_FILTER_POLICY)

21:12:36 [INFO] WARNING_FILTER_POLICY = 'ignore'

import numpy as np
import pandas as pd
import seaborn as sns

from kneed import KneeLocator
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# never truncate the column list when printing a DataFrame, and render floats
# with thousands separators and two decimals
pd.options.display.max_columns = None
pd.set_option("display.float_format", "{:,.2f}".format)

from utils.constants import RANDOM_SEED
from utils.common import get_data_folder_path, set_plotting_config, plot_boxplot_by_class
from utils.clustering import search_kmeans, plot_kmeans_search

# plots configuration
# seaborn theme: "darkgrid" style combined with the "colorblind" palette
sns.set_style("darkgrid")
sns.set_palette("colorblind")
# project helper imported from utils.common (its body is not visible here);
# presumably applies further plotting defaults on top of the seaborn theme -- TODO confirm
set_plotting_config()
# IPython magic, not plain Python: this code is meant to run in a notebook / IPython session
%matplotlib inline

# resolve the data folder through the project helper and read the expenses CSV from it
data_path = get_data_folder_path()
expenses_csv_path = os.path.join(data_path, "expenses.csv")
df_input = pd.read_csv(expenses_csv_path)

# encode the categorical columns as numbers:
# 0/1 flags for "sex" and "smoker", one-hot columns (prefixed "region_") for "region"
df_input["is_male"] = df_input["sex"].eq("male").astype(np.int8)
df_input["is_smoker"] = df_input["smoker"].eq("yes").astype(np.int8)

region_dummies = pd.get_dummies(df_input["region"], prefix="region", dtype=np.int8)
df_without_categoricals = df_input.drop(columns=["sex", "smoker", "region"])

# the original text columns are dropped; the one-hot columns are appended on the right
df_input = pd.concat([df_without_categoricals, region_dummies], axis=1)

# define columns for clustering
cluster_cols = [
    col for col in df_input.columns
    # remove the target column to simulate an unsupervised problem
    if col != "charges"
    # remove one-hot-encoded region columns to simplify the clustering process
    and not col.startswith("region_")
]
# Take an explicit copy: df_cl gets a "cluster" column written into it further
# down, and writing into a frame that was merely selected out of df_input is
# what makes pandas emit SettingWithCopyWarning. An independent copy makes the
# ownership unambiguous and keeps df_input unaffected by writes to df_cl.
df_cl = df_input[cluster_cols].copy()

# standardize the clustering features; the scaler is fitted on the whole
# clustering frame (there is no train/test split in this workflow)
stdscaler = StandardScaler()
scaled_values = stdscaler.fit_transform(df_cl)
# wrap the scaled array back into a DataFrame carrying the original labels
df_cl_std = pd.DataFrame(scaled_values, index=df_cl.index, columns=df_cl.columns)

# run the project's K-means search helper on the standardized features (up to 15 clusters)
df_kmeans = search_kmeans(df_cl_std, max_n_clusters=15)

# determine the ideal number of clusters with the "Elbow Method",
# via the kneed package, which implements the Kneedle algorithm
n_clusters_tried = df_kmeans["n_clusters"].values
wcss_curve = df_kmeans["wcss"].values
kl = KneeLocator(x=n_clusters_tried, y=wcss_curve, curve="convex", direction="decreasing")
print(f'Elbow Method: best number of clusters is {kl.elbow}')

Elbow Method: best number of clusters is 7

# render the K-means search results, passing the detected elbow to the plot helper
kmeans_search_plot = plot_kmeans_search(df_kmeans=df_kmeans, elbow=kl.elbow)
display(kmeans_search_plot)

# fit K-means with selected number of clusters
# KneeLocator leaves `elbow` as None when it cannot detect one; stop here with a
# clear message instead of failing inside scikit-learn's parameter validation
if kl.elbow is None:
    raise ValueError(
        "Elbow Method found no elbow in the WCSS curve; cannot choose the number of clusters"
    )
# random_state is fixed to the project-wide seed for reproducibility
kmeans_model = KMeans(n_clusters=kl.elbow, verbose=0, random_state=RANDOM_SEED)
kmeans_model.fit(df_cl_std)

KMeans(n_clusters=np.int64(7), random_state=42)

KMeans(n_clusters=np.int64(7), random_state=42)

# label of each row as assigned by the fitted model, indexed like the clustered frame
s_clusters = pd.Series(data=kmeans_model.labels_, name="cluster", index=df_cl_std.index)
s_clusters += 1  # set first cluster as 1 instead of 0

# Attach the labels to every working frame (assignment aligns on the row index).
# No blanket warning suppression is needed: df_cl_std and df_input are plain
# column additions, and df_cl -- which was selected out of df_input, the case
# that triggers SettingWithCopyWarning on in-place writes -- is rebuilt with
# assign(), which returns a new frame instead of writing into the selection.
df_cl_std["cluster"] = s_clusters
df_cl = df_cl.assign(cluster=s_clusters)
df_input["cluster"] = s_clusters

# one boxplot grid per frame, split by cluster:
# first the features used for clustering, then every column of the input dataset
boxplot_jobs = (
    (df_cl, "Features used in K-means Clustering"),
    (df_input, "All features from input dataset"),
)
for frame, plot_title in boxplot_jobs:
    boxplot_figure = plot_boxplot_by_class(
        df_input=frame,
        class_col="cluster",
        plots_per_line=2,
        title=plot_title,
    )
    display(boxplot_figure)

Clustering

1. Preprocessing

Load data

Scale data (if necessary)

2. K-means Clustering

Find best number of clusters

Fit final model

Describe clusters