# Data Management/Investigation
import pandas as pd
from pandas.api.types import CategoricalDtype # Ordering categories
import numpy as np
import missingno as miss

# Plotting libraries
from plotnine import *
import matplotlib.pyplot as plt

# For pre-processing data 
from sklearn import preprocessing as pp 
from sklearn.compose import ColumnTransformer 

# For splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold # Cross validation 
from sklearn.model_selection import cross_validate # Cross validation 
from sklearn.model_selection import GridSearchCV # Cross validation + param. tuning.

# Machine learning methods 
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.tree import DecisionTreeRegressor as DT_reg
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import tree # For plotting the decision tree rules

# For evaluating our model's performance
import sklearn.metrics as m

# Pipeline to combine modeling elements
from sklearn.pipeline import Pipeline

# For model interpretation
from sklearn.inspection import (
    permutation_importance,
    partial_dependence, 
    PartialDependenceDisplay, 
    plot_partial_dependence
)

# Misc
import warnings
warnings.filterwarnings("ignore")


dat = pd.read_csv("health-coverage.csv").sample(n=5000,random_state=123)


dat.head()


dat.dtypes

coverage    object
age          int64
wage         int64
cit         object
mar         object
educ        object
race        object
dtype: object


for col in ['cit', 'mar', 'educ', 'race']:
    dat[col] = dat[col].astype('category')


miss.matrix(dat) # None

<AxesSubplot:>


y = dat[['coverage']]
X = dat.drop(columns=['coverage'])
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size = .25,random_state=123)


print(train_X.shape[0]/dat.shape[0])
print(test_X.shape[0]/dat.shape[0])

0.75
0.25


# Plot the continuous Variables 
d = train_X.select_dtypes(include="int").melt()
(
    ggplot(d,aes(x="value")) +
    geom_histogram(bins=25) +
    facet_wrap("variable",scales='free') +
    theme(figure_size=(10,3),
          subplots_adjust={'wspace':0.25})
)

<ggplot: (325970857)>


d = train_X.copy()
d['ln_wage'] =  np.log(d['wage'] + 1)
(
    ggplot(d,aes(x="ln_wage")) +
    geom_histogram(bins=25) +
    theme(figure_size=(10,3),
          subplots_adjust={'wspace':0.25})
)

<ggplot: (325995280)>


# Break wage up into categories: unemployed, below the median, above the median.
median_wage = d.loc[d['wage'] > 0,'wage'].median()
d['wage_cat'] =  np.where(d['wage']==0,0,np.where(d['wage'] <= median_wage,1,2))
(
    ggplot(d,aes(x="wage_cat")) +
    geom_histogram(bins=25) +
    theme(figure_size=(10,3),
          subplots_adjust={'wspace':0.25})
)

<ggplot: (326048407)>


d = train_X.select_dtypes(include="category").melt()
(
    ggplot(d,aes(x="value")) +
    geom_bar() +
    facet_wrap("variable",scales='free') +
    theme(figure_size=(15,7),
          subplots_adjust={'wspace':0.25,
                           'hspace':0.75},
         axis_text_x=element_text(rotation=45, hjust=1))
)

<ggplot: (326145392)>


ed_cats = ['Less than HS','HS Degree','Undergraduate Degree','Graduate Degree']
cat_types = CategoricalDtype(categories=ed_cats,ordered=True)
dat['educ'] = dat['educ'].astype(cat_types)
dat['educ'].value_counts()

HS Degree               2932
Less than HS            1042
Undergraduate Degree     647
Graduate Degree          379
Name: educ, dtype: int64


dat['educ'] = dat['educ'].cat.codes
dat['educ'].value_counts()

1    2932
0    1042
2     647
3     379
Name: educ, dtype: int64


median_wage = dat.loc[dat['wage'] > 0,'wage'].median()
dat['wage'] =  np.where(dat['wage']==0,0,np.where(dat['wage'] <= median_wage,1,2))


dat['wage'].value_counts()

0    2152
1    1434
2    1414
Name: wage, dtype: int64


dat.race.value_counts()

White                      3060
Black                      1467
Other                       177
Asian                       171
Two or More                 106
Amer. Ind.                    7
Tribes Spec.                  7
Nat. Hawaiian/Pac. Isl.       5
Name: race, dtype: int64


dat['white'] = 1*(dat['race'] == "White")
dat['black'] = 1*(dat['race'] == "Black")
# Everyone else ("other") is the baseline.


# Drop race
dat = dat.drop(columns=["race"])


mar_dummies = pd.get_dummies(dat.mar)
mar_dummies.columns = [c.lower().replace(" ","_") for c in mar_dummies.columns]
mar_dummies = mar_dummies.drop(['never_married'],axis=1) # Baseline
mar_dummies.head(5)


dat = pd.concat([dat.drop(['mar'],axis=1),mar_dummies],axis=1)
dat.head()


dat['cit'].value_counts()

Citizen        4499
Non-citizen     501
Name: cit, dtype: int64


dat['cit'] = 1*(dat['cit'] == "Citizen")


dat['cit'].sum()

4499


dat['coverage'] = 1*(dat['coverage']=='Coverage')


y = dat[['coverage']]
X = dat.drop(columns=['coverage'])
train_X, test_X, train_y, test_y = train_test_split(X,y,test_size = .25,random_state=123)


# Set the folds index to ensure comparable samples
fold_generator = KFold(n_splits=10, shuffle=True,random_state=1234)


pipe = Pipeline(steps=[('pre_process', pp.MinMaxScaler()),('model',None)])


search_space = [
    
    # NaiveBayes
    {'model': [NB()]},
    
    # KNN with K tuning param
    {'model' : [KNN()],
     'model__n_neighbors':[5,10,25,50]},
    
    # Decision Tree with the Max Depth Param
    {'model': [DT()],
     'model__max_depth':[2,3,4]},
    
    # Random forest with the N Estimators tuning param
    {'model' : [RF()],
    'model__max_depth':[2,3,4],
    'model__n_estimators':[500,1000,1500]}
    
]


search = GridSearchCV(pipe, search_space, 
                      cv = fold_generator,
                      scoring='roc_auc',
                      n_jobs=4)


search.fit(train_X,train_y.coverage)

GridSearchCV(cv=KFold(n_splits=10, random_state=1234, shuffle=True),
             estimator=Pipeline(steps=[('pre_process', MinMaxScaler()),
                                       ('model', None)]),
             n_jobs=4,
             param_grid=[{'model': [GaussianNB()]},
                         {'model': [KNeighborsClassifier()],
                          'model__n_neighbors': [5, 10, 25, 50]},
                         {'model': [DecisionTreeClassifier()],
                          'model__max_depth': [2, 3, 4]},
                         {'model': [RandomForestClassifier(max_depth=4,
                                                           n_estimators=1000)],
                          'model__max_depth': [2, 3, 4],
                          'model__n_estimators': [500, 1000, 1500]}],
             scoring='roc_auc')


search.best_score_

0.8118115138376221


search.best_params_

{'model': RandomForestClassifier(max_depth=4, n_estimators=1000),
 'model__max_depth': 4,
 'model__n_estimators': 1000}


rf_mod = search.best_estimator_


m.roc_auc_score(train_y,rf_mod.predict_proba(train_X)[:,1])

0.8195140444842769


m.accuracy_score(train_y,rf_mod.predict(train_X))

0.7354666666666667


vi = permutation_importance(rf_mod,train_X,train_y,n_repeats=5)


# Organize as a data frame 
vi_dat = pd.DataFrame(dict(variable=train_X.columns,
                           vi = vi['importances_mean'],
                           std = vi['importances_std']))

# Generate intervals
vi_dat['low'] = vi_dat['vi'] - 2*vi_dat['std']
vi_dat['high'] = vi_dat['vi'] + 2*vi_dat['std']

# But in order from most to least important
vi_dat = vi_dat.sort_values(by="vi",ascending=False).reset_index(drop=True)


vi_dat


# Plot
(
    ggplot(vi_dat,
          aes(x="variable",y="vi")) +
    geom_col(alpha=.5) +
    geom_point() +
    geom_errorbar(aes(ymin="low",ymax="high"),width=.2) +
    theme_bw() +
    scale_x_discrete(limits=vi_dat.variable.tolist()) +
    coord_flip() +
    labs(y="Reduction in AUC ROC",x="")
)

<ggplot: (326147615)>


# Target specific features
features = ['age','educ','wage','married','cit']

# Calculate the partial dependency
fig, ax = plt.subplots(figsize=(15, 4))
display = plot_partial_dependence(
    rf_mod, train_X, features,n_cols=5,
    n_jobs=4, grid_resolution=30,ax=ax
)
# display.figure_.set_figwidth(15)
# display.figure_.set_figheight(4)


# Feed in the ineraction as a nested list
interacted_features = [['age','educ'],['age','wage'],['educ','wage']] 

# Then business as usual when plotting
fig, ax = plt.subplots(figsize=(12, 4))
display = plot_partial_dependence(
    rf_mod, train_X, interacted_features,
    n_cols=3,n_jobs=4, grid_resolution=20,ax=ax
)
fig.tight_layout()


features = ['age','educ','wage','married','cit']
fig, ax = plt.subplots(figsize=(15, 4))
display = PartialDependenceDisplay.from_estimator(
    rf_mod,
    train_X,
    features,
    kind="both", # "average" = just PDP, "individual" = just ICE
    subsample=50,
    n_jobs=3,
    grid_resolution=20,
    random_state=0,
    ice_lines_kw={"color": "tab:blue", "alpha": 0.2, "linewidth": 0.5},
    pd_line_kw={"color": "tab:orange", "linestyle": "--"},
    n_cols=len(features),
    ax = ax
)
display.figure_.subplots_adjust(hspace=0.3)


pr_y = rf_mod.predict_proba(train_X)[:,rf_mod.classes_ == 1]
pr_y

array([[0.33923212],
       [0.82221132],
       [0.35483412],
       ...,
       [0.35561205],
       [0.57929831],
       [0.44145906]])


surrogate_model = DT_reg(max_depth=3)
surrogate_model.fit(train_X,pr_y)

DecisionTreeRegressor(max_depth=3)


m.r2_score(pr_y,surrogate_model.predict(train_X)).round(2)

0.87


# Plot the tree
plt.figure(figsize=(10,8),dpi=200)
rules = tree.plot_tree(surrogate_model,feature_names=train_X.columns,fontsize=7)

	divorced	widowed
2431	0	0
611	1	0
1779	1	0
15518	0	0
10878	0	1

	variable	vi	std	low	high
0	age	0.104693	0.003833	0.097027	0.112360
1	wage	0.048747	0.002189	0.044369	0.053124
2	educ	0.048107	0.004822	0.038462	0.057752
3	cit	0.010240	0.001813	0.006613	0.013867
4	married	0.007360	0.002885	0.001590	0.013130
5	white	0.003893	0.001476	0.000941	0.006846
6	black	0.002507	0.001200	0.000107	0.004906
7	widowed	0.001973	0.000802	0.000370	0.003577
8	divorced	-0.000267	0.000239	-0.000744	0.000210
9	separated	-0.000480	0.000311	-0.001102	0.000142

PPOL564 | Data Science 1 | Foundations

Interpretable Machine Learning Walkthrough

Overview¶

Dependencies¶

Data¶

Missingness?¶

Split¶

Explore Training Set¶

Preprocessing¶

High-Level Preprocessing¶

`educ`¶

`wage`¶

`race`¶

`mar`¶

`cit`¶

`coverage`¶

Re-split¶

Train Models¶

Cross Validation¶

Initialize Pipeline¶

Select Models & Tuning Parameters¶

Run Models¶

Performance¶

Model Interpretation¶

Permutation Importance¶

Partial Dependency Plots¶

ICE Plots¶

Global Surrogate Models¶

	coverage	age	wage	cit	mar	educ	race
2431	No_Coverage	29	12000	Citizen	Never Married	HS Degree	White
611	No_Coverage	56	120000	Citizen	Divorced	HS Degree	White
1779	No_Coverage	28	18200	Citizen	Divorced	HS Degree	White
15518	Coverage	29	0	Citizen	Never Married	HS Degree	Black
10878	No_Coverage	51	0	Citizen	Widowed	HS Degree	White

PPOL564 | Data Science 1 | Foundations

Interpretable Machine Learning Walkthrough

Overview¶

Dependencies¶

Data¶

Missingness?¶

Split¶

Explore Training Set¶

Preprocessing¶

High-Level Preprocessing¶

educ¶

wage¶

race¶

mar¶

cit¶

coverage¶

Re-split¶

Train Models¶

Cross Validation¶

Initialize Pipeline¶

Select Models & Tuning Parameters¶

Run Models¶

Performance¶

Model Interpretation¶

Permutation Importance¶

Partial Dependency Plots¶

ICE Plots¶

Global Surrogate Models¶

`educ`¶

`wage`¶

`race`¶

`mar`¶

`cit`¶

`coverage`¶