The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.

Total time: Approx. 1 hour and 7 minutes

In the Asynchronous Lecture

Download the .ipynb Notebook.
Download the health-coverage.csv data.

In the Synchronous Lecture

We will work through these methods using the sklearn framework (plus touch on global surrogate models). If you have any questions while watching the pre-recorded material, be sure to write them down and to bring them up during the synchronous portion of the lecture.
'''
Variable Importance
'''
# Data processing
import pandas as pd
import numpy as np
# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m
# Plotting
from plotnine import *
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# %% Generate Synthetic Data -----------------------------------------
def gen_data(N=1000,seed=1234):

    # Set seed
    np.random.seed(seed)

    # Generate predictors
    x1 = np.random.normal(loc=0,scale=1, size = N)
    x2 = np.random.normal(loc=0,scale=1, size = N)
    x3 = np.random.normal(loc=0,scale=1, size = N)
    e = np.random.normal(loc=0,scale=1, size = N) # irreducible error

    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 + -.75*x2**2 + e

    # Organize predictors
    X = np.stack([x1,x2,x3],axis=1)

    # Return the data
    return X,y
# %% -----------------------------------------
# Generate a sample of data.
X, y = gen_data(N=1000)

# Plot the synthetic data
D = pd.DataFrame(dict(y=y,x1=X[:,0],x2=X[:,1],x3=X[:,2]))
(
    ggplot(D.melt(id_vars='y'),
           aes(x="value",y='y')) +
    geom_point(alpha=.25) +
    facet_wrap('variable') +
    theme(figure_size=(10,5))
)
# %% -----------------------------------------
# Build a model
# Linear model
mod_lm = LM()
mod_lm.fit(X,y)

# Random Forest Model
mod_rf = RF()
mod_rf.fit(X,y)
# Performance: RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))
# Performance on future data.
test_X, test_y = gen_data(N=1000,seed=2000)
m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))
# %% -----------------------------------------
# Variable Importance
mod_lm.coef_
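# %% -----------------------------------------
# Aside (an addition, not from the lecture): raw coefficients are only
# comparable as importance scores when the predictors share a scale. A
# minimal sketch of the standardized version -- the StandardScaler step is
# an assumption here, not part of the lecture code.
from sklearn.preprocessing import StandardScaler

# Put predictors on a common scale so coefficient magnitudes are comparable
X_std = StandardScaler().fit_transform(X)
mod_lm_std = LM().fit(X_std, y)

# |coefficient| now doubles as a rough importance ranking
np.abs(mod_lm_std.coef_)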
# %% -----------------------------------------
v = np.array([1,3,5,2,6])
np.random.permutation(v)
# Idea: Permute the order of a specific feature to break the statistical
# association with the outcome. If the variable matters, then the model should
# perform WORSE.
# Make a copy (so we don't distort the actual data)
tmp = X.copy()

# Permute a specific column.
tmp[:,0] = np.random.permutation(tmp[:,0])

baseline_performance = m.mean_squared_error(y,mod_rf.predict(X))
perm_perf = m.mean_squared_error(y,mod_rf.predict(tmp))

# Increase in the MSE
perm_perf - baseline_performance
# %% -----------------------------------------
# Let's build out a permutation variable importance method.
def permutation_vi(model,X,y,n_permutations = 25):

    # What is our baseline performance (i.e. performance before we permute anything)?
    baseline_performance = m.mean_squared_error(y,model.predict(X))

    # Build a container (using a dictionary comprehension)
    store = {f'x_{i}':[] for i in range(X.shape[1])}

    # Iterate and permute the variables one at a time.
    for i, key in enumerate(store.keys()):

        perms = n_permutations
        tmp = X.copy()

        # Permute multiple times.
        while perms > 0:

            # Permute the order of the variable
            tmp[:,i] = np.random.permutation(tmp[:,i])

            # Calculate the error when predicting with the "corrupted" variable
            perm_perf = m.mean_squared_error(y,model.predict(tmp))

            # Calculate the change in error
            change = perm_perf - baseline_performance

            # Store the change in the error
            store[key].append(change)

            # Tick down
            perms -= 1

    # Organize as a data frame
    vi = pd.DataFrame(store)

    return vi

vi = permutation_vi(mod_rf,X,y)
vi
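# %% -----------------------------------------
# Aside (an addition, not from the lecture): sklearn ships its own version of
# this procedure in sklearn.inspection.permutation_importance. A minimal
# sketch, assuming scikit-learn >= 0.22; it scores with R^2 by default, so we
# pass the negative-MSE scorer to mirror the error-based version above.
from sklearn.inspection import permutation_importance

result = permutation_importance(mod_rf, X, y,
                                scoring="neg_mean_squared_error",
                                n_repeats=25, random_state=1234)

# Mean and spread of the importance across the 25 permutations, one per feature
result.importances_mean
result.importances_std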
# %% -----------------------------------------
# Plot Variable Importance
(
    ggplot(vi.melt(),
           aes(x="variable",y="value")) +
    geom_boxplot() +
    coord_flip() +
    ylim(0,5)
)
# %% -----------------------------------------
# Look at which variables the model relied on using the test data
vi2 = permutation_vi(mod_rf,test_X,test_y)
(
    ggplot(vi2.melt(),
           aes(x="variable",y="value")) +
    geom_boxplot() +
    coord_flip() +
    ylim(0,5)
)
# %% -----------------------------------------
# Let's use a completely different class of model, and the
# method still works. This is the model "agnostic" bit.
from sklearn.svm import SVR
mod_svr = SVR()
mod_svr.fit(X,y)

vi3 = permutation_vi(mod_svr,X,y)
(
    ggplot(vi3.melt(),
           aes(x="variable",y="value")) +
    geom_boxplot() +
    coord_flip() +
    ylim(0,5)
)
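# %% -----------------------------------------
# Aside (previewing the synchronous lecture, not part of this script): a
# global surrogate model is another model-agnostic tool -- fit a simple,
# interpretable model to the *predictions* of the black box and inspect the
# surrogate instead. A minimal sketch using a shallow decision tree (the
# tree and its depth are assumptions for illustration):
from sklearn.tree import DecisionTreeRegressor

surrogate = DecisionTreeRegressor(max_depth=3)
surrogate.fit(X, mod_rf.predict(X))

# How faithfully does the surrogate mimic the random forest?
m.r2_score(mod_rf.predict(X), surrogate.predict(X))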
'''
Partial Dependency Plots
'''
import pandas as pd
import numpy as np
# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m
from plotnine import *
import warnings
warnings.filterwarnings('ignore')
# %% Generate Synthetic Data -----------------------------------------
def gen_data(N=1000,seed=1234):

    # Set seed
    np.random.seed(seed)

    # Generate predictors
    x1 = np.random.normal(loc=0,scale=1, size = N)
    x2 = np.random.normal(loc=0,scale=1, size = N)
    x3 = np.random.normal(loc=0,scale=1, size = N)
    e = np.random.normal(loc=0,scale=1, size = N) # irreducible error

    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) + -1*x1 + x2 + -.75*x2**2 + e

    # Organize predictors
    X = np.stack([x1,x2,x3],axis=1)

    # Return the data
    return X,y
# %% -----------------------------------------
# Generate a sample of data.
X, y = gen_data(N=1000)

# Plot the synthetic data
D = pd.DataFrame(dict(y=y,x1=X[:,0],x2=X[:,1],x3=X[:,2]))
(
    ggplot(D.melt(id_vars='y'),
           aes(x="value",y='y')) +
    geom_point(alpha=.25) +
    facet_wrap('variable') +
    theme(figure_size=(10,5))
)
# %% Build Model -----------------------------------------
# Linear model
mod_lm = LM()
mod_lm.fit(X,y)

# Random Forest model
mod_rf = RF()
mod_rf.fit(X,y)
# RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))
# Performance on future data.
test_X,test_y = gen_data(N=1000,seed=2000)
m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))
# %% -----------------------------------------
# The idea
X_tmp = X.copy()

# What is the average prediction when X1 == 0?
X_tmp[:,0] = 0
pred = mod_rf.predict(X_tmp)
pred.mean()

# What is the average prediction when X1 == 1?
X_tmp[:,0] = 1
pred = mod_rf.predict(X_tmp)
pred.mean()

# What is the average prediction when X1 == 2?
X_tmp[:,0] = 2
pred = mod_rf.predict(X_tmp)
pred.mean()
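# %% -----------------------------------------
# What we just did by hand is the partial dependence function: fix the target
# feature at a value v, leave every other column at its observed values, and
# average the model's predictions. In symbols, for feature x1:
#
#   pd(v) = (1/N) * sum_i f_hat(v, x2_i, x3_i)
#
# The next cell traces this function over a grid of values (here for x2,
# i.e. column index 1).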
# %% -----------------------------------------
X_tmp = X.copy()
target_var = 1
possible_values = np.arange(-3,3,.25)

# Container
store = list()

for val in possible_values:

    # Manipulate the value
    X_tmp[:,target_var] = val

    # Average Prediction
    pred = mod_rf.predict(X_tmp).mean()

    # Store this
    store.append([val,pred])

marginal_effect = pd.DataFrame(store,columns = ['val','pred'])
# %% -----------------------------------------
(
    ggplot(marginal_effect,
           aes(x="val",y="pred")) +
    geom_path() +
    theme(figure_size=(10,5))
)
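# %% -----------------------------------------
# Aside (an addition, not from the lecture): sklearn implements this same
# computation as sklearn.inspection.partial_dependence. A minimal sketch,
# assuming scikit-learn >= 0.24 (when the `kind` argument was added):
from sklearn.inspection import partial_dependence

pd_rf = partial_dependence(mod_rf, X, features=[1], kind="average")

# Averaged predictions over sklearn's automatically chosen grid for x2
pd_rf["average"]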
# %% -----------------------------------------
# Partial Dependency for two variables
X_tmp = X.copy()
possible_values = np.arange(-3,3,.25)
target_a = 0
target_b = 2

# Container
store = list()
for a in possible_values:

    # Manipulate the value
    X_tmp[:,target_a] = a

    for b in possible_values:

        # Manipulate the value
        X_tmp[:,target_b] = b

        # Average Prediction
        pred = mod_rf.predict(X_tmp).mean()

        # Store this
        store.append([a,b,pred])

marginal_effect = pd.DataFrame(store,columns = ['x0','x1','pred'])
# %% -----------------------------------------
# Visualize
(
    ggplot(marginal_effect,
           aes(x="x0",y="x1",fill="pred")) +
    geom_tile() +
    theme(figure_size=(7,7))
)
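# %% -----------------------------------------
# Aside (an addition, not from the lecture): the two-way surface is the same
# sklearn call with a feature pair. A minimal sketch, reusing the import and
# the target indices from the cells above:
pd_2d = partial_dependence(mod_rf, X, features=[(target_a, target_b)],
                           kind="average")

# One surface: shape (1, n_grid_a, n_grid_b)
pd_2d["average"].shape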
'''
Individual Conditional Expectations
'''
import pandas as pd
import numpy as np
# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m
from plotnine import *
import warnings
warnings.filterwarnings('ignore')
# %% Generate Synthetic Data -----------------------------------------
def gen_data(N=1000,seed=1234):

    # Set seed
    np.random.seed(seed)

    # Generate predictors
    x1 = np.random.normal(loc=0,scale=1, size = N)
    x2 = np.random.normal(loc=0,scale=1, size = N)
    x3 = np.random.normal(loc=0,scale=1, size = N)
    e = np.random.normal(loc=0,scale=1, size = N) # irreducible error

    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 + -.75*x2**2 + e

    # Organize predictors
    X = np.stack([x1,x2,x3],axis=1)

    # Return the data
    return X,y
# %% -----------------------------------------
# Generate a sample of data.
X, y = gen_data(N=250)

# Plot the synthetic data
D = pd.DataFrame(dict(y=y,x1=X[:,0],x2=X[:,1],x3=X[:,2]))
(
    ggplot(D.melt(id_vars='y'),
           aes(x="value",y='y')) +
    geom_point(alpha=.25) +
    facet_wrap('variable') +
    theme(figure_size=(10,3))
)
# %% Build Model -----------------------------------------
# Linear model
mod_lm = LM()
mod_lm.fit(X,y)

# Random Forest model
mod_rf = RF()
mod_rf.fit(X,y)
# RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))
# Performance on future data.
test_X,test_y = gen_data(N=250,seed=2000)
m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))
# %% -----------------------------------------
# Same idea as the PDP, just don't average
X_tmp = X.copy()
target_var = 1
possible_values = np.arange(-3,3,.25)

# Collect one entry per grid value, then combine into a single data frame.
entries = list()
for val in possible_values:

    # Manipulate the value
    X_tmp[:,target_var] = val

    # Predict
    pred = mod_rf.predict(X_tmp)

    # Store the individual predictions
    vals = [val for i in range(pred.shape[0])]
    ids = [i for i in range(pred.shape[0])]
    entries.append(pd.DataFrame(dict(id=ids,val=vals,pred=pred)))

ice_store = pd.concat(entries)
ice_store
# %% -----------------------------------------
# Visualize
(
    ggplot(data=ice_store) +
    geom_line(mapping=aes(x="val",y="pred",group="id"),alpha=.1) +
    theme_bw() +
    theme(figure_size = (10,5))
)
# %% -----------------------------------------
# PDP on top of the ICE
pdp = (ice_store
       .groupby("val").pred.mean()
       .reset_index()
       .rename(columns={"pred":'pdp'})
       .merge(ice_store,on="val",how="right"))

(
    ggplot(data=pdp) +
    geom_line(mapping=aes(x="val",y="pred",group="id"),alpha=.1) +
    geom_line(mapping=aes(x="val",y="pdp"),color="darkred",size=1.5) +
    theme_bw() +
    theme(figure_size = (10,5))
)
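# %% -----------------------------------------
# Aside (an addition, not from the lecture): sklearn can also return the
# individual curves rather than their average. A minimal sketch, assuming
# scikit-learn >= 0.24; kind="individual" yields one curve per observation.
from sklearn.inspection import partial_dependence

ice = partial_dependence(mod_rf, X, features=[1], kind="individual")

# One curve per observation: shape (1, n_observations, n_grid_points)
ice["individual"].shape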
These materials were generated for students enrolled in PPOL564. Please do not distribute without permission.
ed769@georgetown.edu | www.ericdunford.com