The aim is to extract insights from using a statistical learning model to better understand who does and does not have healthcare coverage.
# Data Management/Investigation
import pandas as pd
from pandas.api.types import CategoricalDtype # Ordering categories
import numpy as np
import missingno as miss
# Plotting libraries
from plotnine import *
import matplotlib.pyplot as plt
# For pre-processing data
from sklearn import preprocessing as pp
from sklearn.compose import ColumnTransformer
# For splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold # Cross validation
from sklearn.model_selection import cross_validate # Cross validation
from sklearn.model_selection import GridSearchCV # Cross validation + param. tuning.
# Machine learning methods
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import DecisionTreeRegressor as DT_reg
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import tree # For plotting the decision tree rules
# For evaluating our model's performance
import sklearn.metrics as m
# Pipeline to combine modeling elements
from sklearn.pipeline import Pipeline
# For model interpretation
from sklearn.inspection import (
permutation_importance,
partial_dependence,
PartialDependenceDisplay,
plot_partial_dependence
)
# Misc
import warnings
warnings.filterwarnings("ignore")
The following data contains information regarding whether or not someone has health coverage (coverage
). The available predictive features capture socio-economic and descriptive factors.
Note: We'll only take a subset of this data so that the models will run faster, but feel free to explore on the entire data set.
dat = pd.read_csv("health-coverage.csv").sample(n=5000,random_state=123)
dat.head()
coverage | age | wage | cit | mar | educ | race | |
---|---|---|---|---|---|---|---|
2431 | No_Coverage | 29 | 12000 | Citizen | Never Married | HS Degree | White |
611 | No_Coverage | 56 | 120000 | Citizen | Divorced | HS Degree | White |
1779 | No_Coverage | 28 | 18200 | Citizen | Divorced | HS Degree | White |
15518 | Coverage | 29 | 0 | Citizen | Never Married | HS Degree | Black |
10878 | No_Coverage | 51 | 0 | Citizen | Widowed | HS Degree | White |
dat.dtypes
coverage object age int64 wage int64 cit object mar object educ object race object dtype: object
Convert categorical variables to categories.
for col in ['cit', 'mar', 'educ', 'race']:
dat[col] = dat[col].astype('category')
miss.matrix(dat) # None
<AxesSubplot:>
y = dat[['coverage']]
X = dat.drop(columns=['coverage'])
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size = .25,random_state=123)
print(train_X.shape[0]/dat.shape[0])
print(test_X.shape[0]/dat.shape[0])
0.75 0.25
# Plot the continuous Variables
d = train_X.select_dtypes(include="int").melt()
(
ggplot(d,aes(x="value")) +
geom_histogram(bins=25) +
facet_wrap("variable",scales='free') +
theme(figure_size=(10,3),
subplots_adjust={'wspace':0.25})
)
<ggplot: (325970857)>
Looks like there is a right skew in wage. Consider log transforming, but also notice another skew once we log transform.
d = train_X.copy()
d['ln_wage'] = np.log(d['wage'] + 1)
(
ggplot(d,aes(x="ln_wage")) +
geom_histogram(bins=25) +
theme(figure_size=(10,3),
subplots_adjust={'wspace':0.25})
)
<ggplot: (325995280)>
To tackle this, let's consider converting wage into a ordinal measure where the base category is "unemployed".
# Break wage up into categories: unemployed, below the median, above the median.
median_wage = d.loc[d['wage'] > 0,'wage'].median()
d['wage_cat'] = np.where(d['wage']==0,0,np.where(d['wage'] <= median_wage,1,2))
(
ggplot(d,aes(x="wage_cat")) +
geom_histogram(bins=25) +
theme(figure_size=(10,3),
subplots_adjust={'wspace':0.25})
)
<ggplot: (326048407)>
Now, let's look at the categorical predictors
d = train_X.select_dtypes(include="category").melt()
(
ggplot(d,aes(x="value")) +
geom_bar() +
facet_wrap("variable",scales='free') +
theme(figure_size=(15,7),
subplots_adjust={'wspace':0.25,
'hspace':0.75},
axis_text_x=element_text(rotation=45, hjust=1))
)
<ggplot: (326145392)>
Things to note:
educ
)mar
/race
)age
)There are high-level transformation we want to impose on training and test data. These transformations don't require that use any information from the test data (i.e. a mean, a min, etc). Rather there are some formatting changes/transformations that will make our lives easier downstream.
educ
¶Impose an ordering on the education levels.
ed_cats = ['Less than HS','HS Degree','Undergraduate Degree','Graduate Degree']
cat_types = CategoricalDtype(categories=ed_cats,ordered=True)
dat['educ'] = dat['educ'].astype(cat_types)
dat['educ'].value_counts()
HS Degree 2932 Less than HS 1042 Undergraduate Degree 647 Graduate Degree 379 Name: educ, dtype: int64
Let's convert this category into a numeric variable.
dat['educ'] = dat['educ'].cat.codes
dat['educ'].value_counts()
1 2932 0 1042 2 647 3 379 Name: educ, dtype: int64
This is the only way we can ensure the correct ordering. Don't trust an encoder (i.e.. OrdinalEncoder
to get it right).
wage
¶Next, let's log wage
. Note that we add an offset because we have zeros in the data. (log of zero is negative infinity)
median_wage = dat.loc[dat['wage'] > 0,'wage'].median()
dat['wage'] = np.where(dat['wage']==0,0,np.where(dat['wage'] <= median_wage,1,2))
dat['wage'].value_counts()
0 2152 1 1434 2 1414 Name: wage, dtype: int64
race
¶Let's collapse the racial categories. For some of the categories, there is very little representation in the data. By collapsing, we increase these bin sizes.
dat.race.value_counts()
White 3060 Black 1467 Other 177 Asian 171 Two or More 106 Amer. Ind. 7 Tribes Spec. 7 Nat. Hawaiian/Pac. Isl. 5 Name: race, dtype: int64
dat['white'] = 1*(dat['race'] == "White")
dat['black'] = 1*(dat['race'] == "Black")
# Everyone else ("other") is the baseline.
# Drop race
dat = dat.drop(columns=["race"])
mar
¶Convert categories to dummies
mar_dummies = pd.get_dummies(dat.mar)
mar_dummies.columns = [c.lower().replace(" ","_") for c in mar_dummies.columns]
mar_dummies = mar_dummies.drop(['never_married'],axis=1) # Baseline
mar_dummies.head(5)
divorced | married | separated | widowed | |
---|---|---|---|---|
2431 | 0 | 0 | 0 | 0 |
611 | 1 | 0 | 0 | 0 |
1779 | 1 | 0 | 0 | 0 |
15518 | 0 | 0 | 0 | 0 |
10878 | 0 | 0 | 0 | 1 |
dat = pd.concat([dat.drop(['mar'],axis=1),mar_dummies],axis=1)
dat.head()
coverage | age | wage | cit | educ | white | black | divorced | married | separated | widowed | |
---|---|---|---|---|---|---|---|---|---|---|---|
2431 | No_Coverage | 29 | 1 | Citizen | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
611 | No_Coverage | 56 | 2 | Citizen | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
1779 | No_Coverage | 28 | 1 | Citizen | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
15518 | Coverage | 29 | 0 | Citizen | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
10878 | No_Coverage | 51 | 0 | Citizen | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
cit
¶Convert to a dummy variable.
dat['cit'].value_counts()
Citizen 4499 Non-citizen 501 Name: cit, dtype: int64
dat['cit'] = 1*(dat['cit'] == "Citizen")
dat['cit'].sum()
4499
coverage
¶Finally, our outcome coverage
needs to be numeric (0/1)
dat['coverage'] = 1*(dat['coverage']=='Coverage')
We learned that we had to make these changes from the training data. Now, let's re-split using the changed data.
y = dat[['coverage']]
X = dat.drop(columns=['coverage'])
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size = .25,random_state=123)
# Set the folds index to ensure comparable samples
fold_generator = KFold(n_splits=10, shuffle=True,random_state=1234)
Note that we still want to scale values, but we want to do this in the pipeline since it utilizes information we'll only learn from the training data (i.e. min/max, mean, std, etc.)
pipe = Pipeline(steps=[('pre_process', pp.MinMaxScaler()),('model',None)])
As before, the grid search to tune the models is pretty tame here so that the code will run quickly. Often a good idea to explore more of the tuning parameter space when running your own code.
search_space = [
# NaiveBayes
{'model': [NB()]},
# KNN with K tuning param
{'model' : [KNN()],
'model__n_neighbors':[5,10,25,50]},
# Decision Tree with the Max Depth Param
{'model': [DT()],
'model__max_depth':[2,3,4]},
# Random forest with the N Estimators tuning param
{'model' : [RF()],
'model__max_depth':[2,3,4],
'model__n_estimators':[500,1000,1500]}
]
Put it all together in a GridSearch
search = GridSearchCV(pipe, search_space,
cv = fold_generator,
scoring='roc_auc',
n_jobs=4)
And Run
search.fit(train_X,train_y.coverage)
GridSearchCV(cv=KFold(n_splits=10, random_state=1234, shuffle=True), estimator=Pipeline(steps=[('pre_process', MinMaxScaler()), ('model', None)]), n_jobs=4, param_grid=[{'model': [GaussianNB()]}, {'model': [KNeighborsClassifier()], 'model__n_neighbors': [5, 10, 25, 50]}, {'model': [DecisionTreeClassifier()], 'model__max_depth': [2, 3, 4]}, {'model': [RandomForestClassifier(max_depth=4, n_estimators=1000)], 'model__max_depth': [2, 3, 4], 'model__n_estimators': [500, 1000, 1500]}], scoring='roc_auc')
Best ROC AUC.
search.best_score_
0.8118115138376221
search.best_params_
{'model': RandomForestClassifier(max_depth=4, n_estimators=1000), 'model__max_depth': 4, 'model__n_estimators': 1000}
rf_mod = search.best_estimator_
m.roc_auc_score(train_y,rf_mod.predict_proba(train_X)[:,1])
0.8195140444842769
m.accuracy_score(train_y,rf_mod.predict(train_X))
0.7354666666666667
Permute the features to determine importance. Note here that I only do this 5 times for the sake of runtime, but you'd want to do this more (e.g. 25 times)
vi = permutation_importance(rf_mod,train_X,train_y,n_repeats=5)
Organize the output as a data frame.
# Organize as a data frame
vi_dat = pd.DataFrame(dict(variable=train_X.columns,
vi = vi['importances_mean'],
std = vi['importances_std']))
# Generate intervals
vi_dat['low'] = vi_dat['vi'] - 2*vi_dat['std']
vi_dat['high'] = vi_dat['vi'] + 2*vi_dat['std']
# But in order from most to least important
vi_dat = vi_dat.sort_values(by="vi",ascending=False).reset_index(drop=True)
vi_dat
variable | vi | std | low | high | |
---|---|---|---|---|---|
0 | age | 0.104693 | 0.003833 | 0.097027 | 0.112360 |
1 | wage | 0.048747 | 0.002189 | 0.044369 | 0.053124 |
2 | educ | 0.048107 | 0.004822 | 0.038462 | 0.057752 |
3 | cit | 0.010240 | 0.001813 | 0.006613 | 0.013867 |
4 | married | 0.007360 | 0.002885 | 0.001590 | 0.013130 |
5 | white | 0.003893 | 0.001476 | 0.000941 | 0.006846 |
6 | black | 0.002507 | 0.001200 | 0.000107 | 0.004906 |
7 | widowed | 0.001973 | 0.000802 | 0.000370 | 0.003577 |
8 | divorced | -0.000267 | 0.000239 | -0.000744 | 0.000210 |
9 | separated | -0.000480 | 0.000311 | -0.001102 | 0.000142 |
Visualize
# Plot
(
ggplot(vi_dat,
aes(x="variable",y="vi")) +
geom_col(alpha=.5) +
geom_point() +
geom_errorbar(aes(ymin="low",ymax="high"),width=.2) +
theme_bw() +
scale_x_discrete(limits=vi_dat.variable.tolist()) +
coord_flip() +
labs(y="Reduction in AUC ROC",x="")
)
<ggplot: (326147615)>
# Target specific features
features = ['age','educ','wage','married','cit']
# Calculate the partial dependency
fig, ax = plt.subplots(figsize=(15, 4))
display = plot_partial_dependence(
rf_mod, train_X, features,n_cols=5,
n_jobs=4, grid_resolution=30,ax=ax
)
# display.figure_.set_figwidth(15)
# display.figure_.set_figheight(4)
Interaction Partial Dependency Plots (2D)
# Feed in the ineraction as a nested list
interacted_features = [['age','educ'],['age','wage'],['educ','wage']]
# Then business as usual when plotting
fig, ax = plt.subplots(figsize=(12, 4))
display = plot_partial_dependence(
rf_mod, train_X, interacted_features,
n_cols=3,n_jobs=4, grid_resolution=20,ax=ax
)
fig.tight_layout()
features = ['age','educ','wage','married','cit']
fig, ax = plt.subplots(figsize=(15, 4))
display = PartialDependenceDisplay.from_estimator(
rf_mod,
train_X,
features,
kind="both", # "average" = just PDP, "individual" = just ICE
subsample=50,
n_jobs=3,
grid_resolution=20,
random_state=0,
ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5},
pd_line_kw={"color": "tab:orange", "linestyle": "--"},
n_cols=len(features),
ax = ax
)
display.figure_.subplots_adjust(hspace=0.3)
(1) Generate a vector of predictions (specifically, predicted probabilities since this is a classification problem).
pr_y = rf_mod.predict_proba(train_X)[:,rf_mod.classes_ == 1]
pr_y
array([[0.33923212], [0.82221132], [0.35483412], ..., [0.35561205], [0.57929831], [0.44145906]])
(2) Fit the surrogate model on the predictions
surrogate_model = DT_reg(max_depth=3)
surrogate_model.fit(train_X,pr_y)
DecisionTreeRegressor(max_depth=3)
(3) Examine the model fit ($R^2$)
m.r2_score(pr_y,surrogate_model.predict(train_X)).round(2)
0.87
(4) Plot the tree and interpret
# Plot the tree
plt.figure(figsize=(10,8),dpi=200)
rules = tree.plot_tree(surrogate_model,feature_names=train_X.columns,fontsize=7)