import os
import warnings
import logging

# set up root logging: INFO level, records rendered as "HH:MM:SS [LEVEL] message"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# the warning filter action comes from the WARNING_FILTER_POLICY environment
# variable: export it as "ignore" when rendering the HTMLs; when it is unset
# the action falls back to "once"
WARNING_FILTER_POLICY = os.environ.get("WARNING_FILTER_POLICY", "once")
logger.info(f"{WARNING_FILTER_POLICY = }")
warnings.filterwarnings(WARNING_FILTER_POLICY)

21:12:45 [INFO] WARNING_FILTER_POLICY = 'ignore'

import numpy as np
import pandas as pd
import seaborn as sns

# pandas display options: never truncate the column list, and render floats
# with a thousands separator and two decimal places
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:,.2f}".format)

from utils.constants import RANDOM_SEED
from utils.common import (
    get_data_folder_path,
    set_plotting_config,
    plot_histogram,
    plot_comparison_histograms
)

# plots configuration
# seaborn theme: "darkgrid" axes style and the "colorblind" colour palette
sns.set_style("darkgrid")
sns.set_palette("colorblind")
# project helper imported from utils.common; its settings are not visible here
# NOTE(review): it runs after the seaborn calls, so it may rely on or override
# them -- confirm before reordering
set_plotting_config()
# IPython magic (notebook only, not valid plain Python): selects matplotlib's
# inline backend so figures render in the cell output
%matplotlib inline

# locate the data folder through the project helper, then read the expenses
# dataset stored there as "expenses.csv"
data_path = get_data_folder_path()
expenses_csv_path = os.path.join(data_path, "expenses.csv")

df_input = pd.read_csv(expenses_csv_path)

# histogram of the "charges" column over the full dataset
fig1 = plot_histogram(
    # data
    df=df_input,
    plot_col="charges",
    # titles and labels (the subtitle reports the number of rows)
    title="Distribution of Medical Charges",
    histogram_title=f"(n = {len(df_input)})",
    display_name="Medical Charges",
    display_unit="USD",
    # geometry
    bin_size=2000,
    linewidth=1.5,
    figsize=(8, 6),
    # display toggles
    show_legend=True,
    show_percentage=True,
    show_mean=True,
    show_median=True,
    show_zero_line=False,
    show_kde=True,
)
display(fig1)

# histogram of the "charges" column, stratified by the "smoker" column
fig2 = plot_histogram(
    # data
    df=df_input,
    plot_col="charges",
    stratify_col="smoker",
    # titles and labels (the subtitle reports the number of rows)
    title="Distribution of Medical Charges by Smoking Status",
    histogram_title=f"(n = {len(df_input)})",
    display_name="Medical Charges",
    display_unit="USD",
    # geometry (no explicit figsize is passed for this figure)
    bin_size=2000,
    linewidth=1.5,
    # display toggles
    show_legend=True,
    show_percentage=False,
    show_mean=True,
    show_median=True,
    show_zero_line=False,
    show_kde=False,
)
display(fig2)

# simulate COVID's impact on the charges: every row is scaled by a factor drawn
# from a normal distribution centred on 1.1 (+10%) with standard deviation 0.1
np.random.seed(RANDOM_SEED)  # seed the global generator so the draw is repeatable
random_change = np.random.normal(loc=1.1, scale=0.1, size=df_input.shape[0])
df_input["charges_new"] = df_input["charges"].mul(random_change)
# per-row difference: simulated charges minus the original charges
df_input["charges_diff"] = df_input["charges_new"].sub(df_input["charges"])

# compare the original and simulated charges, alongside their differences
plot_comparison_histograms(
    # data: original, simulated and difference columns
    df=df_input,
    plot_col_before="charges",
    plot_col_after="charges_new",
    plot_col_diff="charges_diff",
    # titles and labels
    title="COVID-19 impact on Medical Charges",
    left_title="Distribution of Medical Charges Before and After COVID-19",
    right_title="Distribution of Medical Charges Differences",
    display_name="Medical Charges",
    display_unit="USD",
    # geometry
    bin_size=2000,
    figsize=(14, 6),
    # display toggles
    show_percentage=True,
    show_mean=True,
    show_median=True,
)

# Histogram Analysis

## 1. Load Data

## 2. Plot Histograms

### Distribution of Medical Charges

### Distribution of Medical Charges by Smoking Status

### COVID-19 impact on Medical Charges