The aim is to extract insights from using a statistical learning model to better understand who does and does not have healthcare coverage.
# Data Management/Investigation
import pandas as pd
from pandas.api.types import CategoricalDtype # Ordering categories
import numpy as np
import missingno as miss
# Plotting libraries
from plotnine import *
import matplotlib.pyplot as plt
# For pre-processing data
from sklearn import preprocessing as pp
from sklearn.compose import ColumnTransformer
# For splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold # Cross validation
from sklearn.model_selection import cross_validate # Cross validation
from sklearn.model_selection import GridSearchCV # Cross validation + param. tuning.
# Machine learning methods
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import DecisionTreeRegressor as DT_reg
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import tree # For plotting the decision tree rules
# For evaluating our model's performance
import sklearn.metrics as m
# Pipeline to combine modeling elements
from sklearn.pipeline import Pipeline
# For model interpretation
from sklearn.inspection import (
permutation_importance,
partial_dependence,
PartialDependenceDisplay,
plot_partial_dependence
)
# Misc
import warnings
warnings.filterwarnings("ignore")
The following data contains information regarding whether or not someone has health coverage (coverage). The available predictive features capture socio-economic and descriptive factors.
Note: We'll only take a subset of this data so that the models will run faster, but feel free to explore on the entire data set.
dat = pd.read_csv("health-coverage.csv").sample(n=5000,random_state=123)
dat.head()
| coverage | age | wage | cit | mar | educ | race | |
|---|---|---|---|---|---|---|---|
| 2431 | No_Coverage | 29 | 12000 | Citizen | Never Married | HS Degree | White |
| 611 | No_Coverage | 56 | 120000 | Citizen | Divorced | HS Degree | White |
| 1779 | No_Coverage | 28 | 18200 | Citizen | Divorced | HS Degree | White |
| 15518 | Coverage | 29 | 0 | Citizen | Never Married | HS Degree | Black |
| 10878 | No_Coverage | 51 | 0 | Citizen | Widowed | HS Degree | White |
dat.dtypes
coverage object age int64 wage int64 cit object mar object educ object race object dtype: object
Convert categorical variables to categories.
for col in ['cit', 'mar', 'educ', 'race']:
dat[col] = dat[col].astype('category')
miss.matrix(dat) # None
<AxesSubplot:>
y = dat[['coverage']]
X = dat.drop(columns=['coverage'])
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size = .25,random_state=123)
print(train_X.shape[0]/dat.shape[0])
print(test_X.shape[0]/dat.shape[0])
0.75 0.25
# Plot the continuous Variables
d = train_X.select_dtypes(include="int").melt()
(
ggplot(d,aes(x="value")) +
geom_histogram(bins=25) +
facet_wrap("variable",scales='free') +
theme(figure_size=(10,3),
subplots_adjust={'wspace':0.25})
)
<ggplot: (325970857)>
Looks like there is a right skew in wage. Consider log transforming, but also notice another skew once we log transform.
d = train_X.copy()
d['ln_wage'] = np.log(d['wage'] + 1)
(
ggplot(d,aes(x="ln_wage")) +
geom_histogram(bins=25) +
theme(figure_size=(10,3),
subplots_adjust={'wspace':0.25})
)
<ggplot: (325995280)>
To tackle this, let's consider converting wage into a ordinal measure where the base category is "unemployed".
# Break wage up into categories: unemployed, below the median, above the median.
median_wage = d.loc[d['wage'] > 0,'wage'].median()
d['wage_cat'] = np.where(d['wage']==0,0,np.where(d['wage'] <= median_wage,1,2))
(
ggplot(d,aes(x="wage_cat")) +
geom_histogram(bins=25) +
theme(figure_size=(10,3),
subplots_adjust={'wspace':0.25})
)
<ggplot: (326048407)>
Now, let's look at the categorical predictors
d = train_X.select_dtypes(include="category").melt()
(
ggplot(d,aes(x="value")) +
geom_bar() +
facet_wrap("variable",scales='free') +
theme(figure_size=(15,7),
subplots_adjust={'wspace':0.25,
'hspace':0.75},
axis_text_x=element_text(rotation=45, hjust=1))
)
<ggplot: (326145392)>
Things to note:
educ)mar/race)age)There are high-level transformation we want to impose on training and test data. These transformations don't require that use any information from the test data (i.e. a mean, a min, etc). Rather there are some formatting changes/transformations that will make our lives easier downstream.
educ¶Impose an ordering on the education levels.
ed_cats = ['Less than HS','HS Degree','Undergraduate Degree','Graduate Degree']
cat_types = CategoricalDtype(categories=ed_cats,ordered=True)
dat['educ'] = dat['educ'].astype(cat_types)
dat['educ'].value_counts()
HS Degree 2932 Less than HS 1042 Undergraduate Degree 647 Graduate Degree 379 Name: educ, dtype: int64
Let's convert this category into a numeric variable.
dat['educ'] = dat['educ'].cat.codes
dat['educ'].value_counts()
1 2932 0 1042 2 647 3 379 Name: educ, dtype: int64
This is the only way we can ensure the correct ordering. Don't trust an encoder (i.e.. OrdinalEncoder to get it right).
wage¶Next, let's log wage. Note that we add an offset because we have zeros in the data. (log of zero is negative infinity)
median_wage = dat.loc[dat['wage'] > 0,'wage'].median()
dat['wage'] = np.where(dat['wage']==0,0,np.where(dat['wage'] <= median_wage,1,2))
dat['wage'].value_counts()
0 2152 1 1434 2 1414 Name: wage, dtype: int64
race¶Let's collapse the racial categories. For some of the categories, there is very little representation in the data. By collapsing, we increase these bin sizes.
dat.race.value_counts()
White 3060 Black 1467 Other 177 Asian 171 Two or More 106 Amer. Ind. 7 Tribes Spec. 7 Nat. Hawaiian/Pac. Isl. 5 Name: race, dtype: int64
dat['white'] = 1*(dat['race'] == "White")
dat['black'] = 1*(dat['race'] == "Black")
# Everyone else ("other") is the baseline.
# Drop race
dat = dat.drop(columns=["race"])
mar¶Convert categories to dummies
mar_dummies = pd.get_dummies(dat.mar)
mar_dummies.columns = [c.lower().replace(" ","_") for c in mar_dummies.columns]
mar_dummies = mar_dummies.drop(['never_married'],axis=1) # Baseline
mar_dummies.head(5)
| divorced | married | separated | widowed | |
|---|---|---|---|---|
| 2431 | 0 | 0 | 0 | 0 |
| 611 | 1 | 0 | 0 | 0 |
| 1779 | 1 | 0 | 0 | 0 |
| 15518 | 0 | 0 | 0 | 0 |
| 10878 | 0 | 0 | 0 | 1 |
dat = pd.concat([dat.drop(['mar'],axis=1),mar_dummies],axis=1)
dat.head()
| coverage | age | wage | cit | educ | white | black | divorced | married | separated | widowed | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2431 | No_Coverage | 29 | 1 | Citizen | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| 611 | No_Coverage | 56 | 2 | Citizen | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 1779 | No_Coverage | 28 | 1 | Citizen | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 15518 | Coverage | 29 | 0 | Citizen | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| 10878 | No_Coverage | 51 | 0 | Citizen | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
cit¶Convert to a dummy variable.
dat['cit'].value_counts()
Citizen 4499 Non-citizen 501 Name: cit, dtype: int64
dat['cit'] = 1*(dat['cit'] == "Citizen")
dat['cit'].sum()
4499
coverage¶Finally, our outcome coverage needs to be numeric (0/1)
dat['coverage'] = 1*(dat['coverage']=='Coverage')
We learned that we had to make these changes from the training data. Now, let's re-split using the changed data.
y = dat[['coverage']]
X = dat.drop(columns=['coverage'])
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size = .25,random_state=123)
# Set the folds index to ensure comparable samples
fold_generator = KFold(n_splits=10, shuffle=True,random_state=1234)
Note that we still want to scale values, but we want to do this in the pipeline since it utilizes information we'll only learn from the training data (i.e. min/max, mean, std, etc.)
pipe = Pipeline(steps=[('pre_process', pp.MinMaxScaler()),('model',None)])
As before, the grid search to tune the models is pretty tame here so that the code will run quickly. Often a good idea to explore more of the tuning parameter space when running your own code.
search_space = [
# NaiveBayes
{'model': [NB()]},
# KNN with K tuning param
{'model' : [KNN()],
'model__n_neighbors':[5,10,25,50]},
# Decision Tree with the Max Depth Param
{'model': [DT()],
'model__max_depth':[2,3,4]},
# Random forest with the N Estimators tuning param
{'model' : [RF()],
'model__max_depth':[2,3,4],
'model__n_estimators':[500,1000,1500]}
]
Put it all together in a GridSearch
search = GridSearchCV(pipe, search_space,
cv = fold_generator,
scoring='roc_auc',
n_jobs=4)
And Run
search.fit(train_X,train_y.coverage)
GridSearchCV(cv=KFold(n_splits=10, random_state=1234, shuffle=True),
estimator=Pipeline(steps=[('pre_process', MinMaxScaler()),
('model', None)]),
n_jobs=4,
param_grid=[{'model': [GaussianNB()]},
{'model': [KNeighborsClassifier()],
'model__n_neighbors': [5, 10, 25, 50]},
{'model': [DecisionTreeClassifier()],
'model__max_depth': [2, 3, 4]},
{'model': [RandomForestClassifier(max_depth=4,
n_estimators=1000)],
'model__max_depth': [2, 3, 4],
'model__n_estimators': [500, 1000, 1500]}],
scoring='roc_auc')
Best ROC AUC.
search.best_score_
0.8118115138376221
search.best_params_
{'model': RandomForestClassifier(max_depth=4, n_estimators=1000),
'model__max_depth': 4,
'model__n_estimators': 1000}
rf_mod = search.best_estimator_
m.roc_auc_score(train_y,rf_mod.predict_proba(train_X)[:,1])
0.8195140444842769
m.accuracy_score(train_y,rf_mod.predict(train_X))
0.7354666666666667
Permute the features to determine importance. Note here that I only do this 5 times for the sake of runtime, but you'd want to do this more (e.g. 25 times)
vi = permutation_importance(rf_mod,train_X,train_y,n_repeats=5)
Organize the output as a data frame.
# Organize as a data frame
vi_dat = pd.DataFrame(dict(variable=train_X.columns,
vi = vi['importances_mean'],
std = vi['importances_std']))
# Generate intervals
vi_dat['low'] = vi_dat['vi'] - 2*vi_dat['std']
vi_dat['high'] = vi_dat['vi'] + 2*vi_dat['std']
# But in order from most to least important
vi_dat = vi_dat.sort_values(by="vi",ascending=False).reset_index(drop=True)
vi_dat
| variable | vi | std | low | high | |
|---|---|---|---|---|---|
| 0 | age | 0.104693 | 0.003833 | 0.097027 | 0.112360 |
| 1 | wage | 0.048747 | 0.002189 | 0.044369 | 0.053124 |
| 2 | educ | 0.048107 | 0.004822 | 0.038462 | 0.057752 |
| 3 | cit | 0.010240 | 0.001813 | 0.006613 | 0.013867 |
| 4 | married | 0.007360 | 0.002885 | 0.001590 | 0.013130 |
| 5 | white | 0.003893 | 0.001476 | 0.000941 | 0.006846 |
| 6 | black | 0.002507 | 0.001200 | 0.000107 | 0.004906 |
| 7 | widowed | 0.001973 | 0.000802 | 0.000370 | 0.003577 |
| 8 | divorced | -0.000267 | 0.000239 | -0.000744 | 0.000210 |
| 9 | separated | -0.000480 | 0.000311 | -0.001102 | 0.000142 |
Visualize
# Plot
(
ggplot(vi_dat,
aes(x="variable",y="vi")) +
geom_col(alpha=.5) +
geom_point() +
geom_errorbar(aes(ymin="low",ymax="high"),width=.2) +
theme_bw() +
scale_x_discrete(limits=vi_dat.variable.tolist()) +
coord_flip() +
labs(y="Reduction in AUC ROC",x="")
)
<ggplot: (326147615)>
# Target specific features
features = ['age','educ','wage','married','cit']
# Calculate the partial dependency
fig, ax = plt.subplots(figsize=(15, 4))
display = plot_partial_dependence(
rf_mod, train_X, features,n_cols=5,
n_jobs=4, grid_resolution=30,ax=ax
)
# display.figure_.set_figwidth(15)
# display.figure_.set_figheight(4)
Interaction Partial Dependency Plots (2D)
# Feed in the ineraction as a nested list
interacted_features = [['age','educ'],['age','wage'],['educ','wage']]
# Then business as usual when plotting
fig, ax = plt.subplots(figsize=(12, 4))
display = plot_partial_dependence(
rf_mod, train_X, interacted_features,
n_cols=3,n_jobs=4, grid_resolution=20,ax=ax
)
fig.tight_layout()
features = ['age','educ','wage','married','cit']
fig, ax = plt.subplots(figsize=(15, 4))
display = PartialDependenceDisplay.from_estimator(
rf_mod,
train_X,
features,
kind="both", # "average" = just PDP, "individual" = just ICE
subsample=50,
n_jobs=3,
grid_resolution=20,
random_state=0,
ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5},
pd_line_kw={"color": "tab:orange", "linestyle": "--"},
n_cols=len(features),
ax = ax
)
display.figure_.subplots_adjust(hspace=0.3)
(1) Generate a vector of predictions (specifically, predicted probabilities since this is a classification problem).
pr_y = rf_mod.predict_proba(train_X)[:,rf_mod.classes_ == 1]
pr_y
array([[0.33923212],
[0.82221132],
[0.35483412],
...,
[0.35561205],
[0.57929831],
[0.44145906]])
(2) Fit the surrogate model on the predictions
surrogate_model = DT_reg(max_depth=3)
surrogate_model.fit(train_X,pr_y)
DecisionTreeRegressor(max_depth=3)
(3) Examine the model fit ($R^2$)
m.r2_score(pr_y,surrogate_model.predict(train_X)).round(2)
0.87
(4) Plot the tree and interpret
# Plot the tree
plt.figure(figsize=(10,8),dpi=200)
rules = tree.plot_tree(surrogate_model,feature_names=train_X.columns,fontsize=7)