In the Asynchronous Lecture

If you have any questions while watching the pre-recorded material, be sure to write them down and bring them up during the synchronous portion of the lecture.

In the Synchronous Lecture

We will implement these interpretability methods (permutation variable importance, partial dependence, and individual conditional expectations) using the sklearn framework (plus touch on global surrogate models); a minimal surrogate sketch is also included at the end of the variable importance code below.

Materials: .ipynb Notebook | health-coverage.csv

The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.
Total time: Approx. 1 hour and 7 minutes
'''
Variable Importance
'''
# Data processing
import pandas as pd
import numpy as np
# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m
# Plotting
from plotnine import *
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# %% Generate Synthetic Data -----------------------------------------
def gen_data(N=1000, seed=1234):
    # Set seed
    np.random.seed(seed)
    # Generate predictors
    x1 = np.random.normal(loc=0, scale=1, size=N)
    x2 = np.random.normal(loc=0, scale=1, size=N)
    x3 = np.random.normal(loc=0, scale=1, size=N)
    e = np.random.normal(loc=0, scale=1, size=N)  # irreducible error
    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 - .75*x2**2 + e
    # Organize predictors
    X = np.stack([x1, x2, x3], axis=1)
    # Return the data
    return X, y
# %% -----------------------------------------
# Generate the sample of the data.
X, y = gen_data(N=1000)
# Plot the synthetic data
D = pd.DataFrame(dict(y=y, x1=X[:,0], x2=X[:,1], x3=X[:,2]))
(
ggplot(D.melt(id_vars='y'),
aes(x="value",y='y')) +
geom_point(alpha=.25) +
facet_wrap('variable') +
theme(figure_size=(10,5))
)
# %% -----------------------------------------
# Build a model
# Linear model
mod_lm = LM()
mod_lm.fit(X,y)
# Random Forest Model
mod_rf = RF()
mod_rf.fit(X,y)
# Performance: RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))
# Performance on future data.
test_X, test_y = gen_data(N=1000,seed=2000)
m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))
# %% -----------------------------------------
# Variable Importance
mod_lm.coef_
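# For the linear model, the coefficients themselves act as the importance
# measure. A small added step: labeling them with the feature names from
# gen_data (x1, x2, x3) makes them easier to read.
pd.DataFrame(dict(feature=['x1', 'x2', 'x3'], coef=mod_lm.coef_))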
# %% -----------------------------------------
v = np.array([1,3,5,2,6])
np.random.permutation(v)
# Idea: Permute the order of a specific feature to break the statistical
# association with the outcome. If the variable matters, then the model should
# perform WORSE.
# Make a copy (so we don't distort the actual data)
tmp = X.copy()
# permute a specific column.
tmp[:,0] = np.random.permutation(tmp[:,0])
baseline_performance = m.mean_squared_error(y,mod_rf.predict(X))
perm_perf = m.mean_squared_error(y,mod_rf.predict(tmp))
# increase in the MSE
perm_perf - baseline_performance
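# For contrast, a small added check: permute the irrelevant feature
# (index 2 / x3, which was intentionally left out of the outcome equation).
# The increase in MSE should be far smaller than for x1.
tmp2 = X.copy()
tmp2[:,2] = np.random.permutation(tmp2[:,2])
m.mean_squared_error(y, mod_rf.predict(tmp2)) - baseline_performance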
# %% -----------------------------------------
# Let's build out a permutation variable importance method.
def permutation_vi(model, X, y, n_permutations=25):
    # What is our baseline performance (i.e. performance before we permute anything)?
    baseline_performance = m.mean_squared_error(y, model.predict(X))
    # Build a container (using a dictionary comprehension)
    store = {f'x_{i}': [] for i in range(X.shape[1])}
    # Iterate and permute the variables one at a time.
    for i, key in enumerate(store.keys()):
        perms = n_permutations
        tmp = X.copy()
        # Permute multiple times.
        while perms > 0:
            # Permute the order of the variable
            tmp[:, i] = np.random.permutation(tmp[:, i])
            # Calculate the error when predicting with the "corrupted" variable
            perm_perf = m.mean_squared_error(y, model.predict(tmp))
            # Calculate the change in error
            change = perm_perf - baseline_performance
            # Store the change in the error
            store[key].append(change)
            # Tick down
            perms -= 1
    # Organize as a data frame
    vi = pd.DataFrame(store)
    return vi
vi = permutation_vi(mod_rf,X,y)
vi
# %% -----------------------------------------
# Plot Variable Importance
(
ggplot(vi.melt(),
aes(x="variable",y="value")) +
geom_boxplot() +
coord_flip() +
ylim(0,5)
)
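# %% -----------------------------------------
# scikit-learn ships its own implementation of this idea in
# sklearn.inspection.permutation_importance. A minimal cross-check of the
# hand-rolled function above; scoring='neg_mean_squared_error' mirrors our
# MSE-based importance (the default scorer would report drops in R^2 instead).
from sklearn.inspection import permutation_importance
pi = permutation_importance(mod_rf, X, y, scoring='neg_mean_squared_error',
                            n_repeats=25, random_state=1234)
pi.importances_mean  # average increase in MSE when each feature is permuted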
# %% -----------------------------------------
# Look at which variables the model relied on using the test data
vi2 = permutation_vi(mod_rf,test_X,test_y)
(
ggplot(vi2.melt(),
aes(x="variable",y="value")) +
geom_boxplot() +
coord_flip() +
ylim(0,5)
)
# %% -----------------------------------------
# Let's use a completely different class of model, and the
# method still works. This is the model "agnostic" bit.
from sklearn.svm import SVR
mod_svr = SVR()
mod_svr.fit(X,y)
vi3 = permutation_vi(mod_svr,X,y)
(
ggplot(vi3.melt(),
aes(x="variable",y="value")) +
geom_boxplot() +
coord_flip() +
ylim(0,5)
)
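# %% -----------------------------------------
# Global surrogate models (touched on in the synchronous lecture): fit a
# simple, interpretable model to the predictions of the black-box model and
# interpret the simple model instead. A minimal sketch using mod_rf and X
# from above; the choice of a depth-3 tree is arbitrary.
from sklearn.tree import DecisionTreeRegressor
rf_preds = mod_rf.predict(X)
surrogate = DecisionTreeRegressor(max_depth=3)
surrogate.fit(X, rf_preds)
# How faithfully does the surrogate mimic the black box? (R^2 on the RF predictions)
surrogate.score(X, rf_preds)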
'''
Partial Dependency Plots
'''
import pandas as pd
import numpy as np
# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m
from plotnine import *
import warnings
warnings.filterwarnings('ignore')
# %% Generate Synthetic Data -----------------------------------------
def gen_data(N=1000, seed=1234):
    # Set seed
    np.random.seed(seed)
    # Generate predictors
    x1 = np.random.normal(loc=0, scale=1, size=N)
    x2 = np.random.normal(loc=0, scale=1, size=N)
    x3 = np.random.normal(loc=0, scale=1, size=N)
    e = np.random.normal(loc=0, scale=1, size=N)  # irreducible error
    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 - .75*x2**2 + e
    # Organize predictors
    X = np.stack([x1, x2, x3], axis=1)
    # Return the data
    return X, y
# %% -----------------------------------------
# Generate the sample of the data.
X, y = gen_data(N=1000)
# Plot the synthetic data
D = pd.DataFrame(dict(y=y, x1=X[:,0], x2=X[:,1], x3=X[:,2]))
(
ggplot(D.melt(id_vars='y'),
aes(x="value",y='y')) +
geom_point(alpha=.25) +
facet_wrap('variable') +
theme(figure_size=(10,5))
)
# %% Build Model -----------------------------------------
# Linear model
mod_lm = LM()
mod_lm.fit(X,y)
# Random Forest model
mod_rf = RF()
mod_rf.fit(X,y)
# RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))
# Performance on future data.
test_X,test_y = gen_data(N=1000,seed=2000)
m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))
# %% -----------------------------------------
# The idea
X_tmp = X.copy()
# What is the average prediction when X1 == 0?
X_tmp[:,0] = 0
X_tmp
pred = mod_rf.predict(X_tmp)
pred.mean()
# What is the average prediction when X1 == 1?
X_tmp[:,0] = 1
pred = mod_rf.predict(X_tmp)
pred.mean()
# What is the average prediction when X1 == 2?
X_tmp[:,0] = 2
pred = mod_rf.predict(X_tmp)
pred.mean()
# %% -----------------------------------------
X_tmp = X.copy()
target_var = 1
possible_values = np.arange(-3,3,.25)
# Container
store = list()
for val in possible_values:
    # Manipulate the value
    X_tmp[:, target_var] = val
    # Average prediction
    pred = mod_rf.predict(X_tmp).mean()
    # Store this
    store.append([val, pred])
marginal_effect = pd.DataFrame(store,columns = ['val','pred'])
# %% -----------------------------------------
(
ggplot(marginal_effect,
aes(x="val",y="pred")) +
geom_path() +
theme(figure_size=(10,5))
)
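# %% -----------------------------------------
# scikit-learn also implements partial dependence directly in
# sklearn.inspection. A minimal cross-check of the hand-rolled curve above;
# note sklearn builds its grid from percentiles of the data rather than a
# fixed range, and (depending on the version) the grid is returned under
# 'grid_values' or 'values'.
from sklearn.inspection import partial_dependence
pd_x2 = partial_dependence(mod_rf, X, features=[target_var], grid_resolution=25)
pd_x2['average']  # average predictions over the grid for feature index 1 (x2)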
# %% -----------------------------------------
# Partial Dependency for two variables
X_tmp = X.copy()
possible_values = np.arange(-3,3,.25)
target_a = 0
target_b = 2
# Container
store = list()
for a in possible_values:
    # Manipulate the value of the first target
    X_tmp[:, target_a] = a
    for b in possible_values:
        # Manipulate the value of the second target
        X_tmp[:, target_b] = b
        # Average prediction
        pred = mod_rf.predict(X_tmp).mean()
        # Store this
        store.append([a, b, pred])
# Columns labeled by feature index (target_a = 0, target_b = 2)
marginal_effect = pd.DataFrame(store, columns=['x0', 'x2', 'pred'])
# %% -----------------------------------------
# Visualize
(
ggplot(marginal_effect,
aes(x="x0",y="x1",fill="pred")) +
geom_tile() +
theme(figure_size=(7,7))
)

'''
Individual Conditional Expectations
'''
import pandas as pd
import numpy as np
# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m
from plotnine import *
import warnings
warnings.filterwarnings('ignore')
# %% Generate Synthetic Data -----------------------------------------
def gen_data(N=1000, seed=1234):
    # Set seed
    np.random.seed(seed)
    # Generate predictors
    x1 = np.random.normal(loc=0, scale=1, size=N)
    x2 = np.random.normal(loc=0, scale=1, size=N)
    x3 = np.random.normal(loc=0, scale=1, size=N)
    e = np.random.normal(loc=0, scale=1, size=N)  # irreducible error
    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 - .75*x2**2 + e
    # Organize predictors
    X = np.stack([x1, x2, x3], axis=1)
    # Return the data
    return X, y
# %% -----------------------------------------
# Generate the sample of the data.
X, y = gen_data(N=250)
# Plot the synthetic data
D = pd.DataFrame(dict(y=y, x1=X[:,0], x2=X[:,1], x3=X[:,2]))
(
ggplot(D.melt(id_vars='y'),
aes(x="value",y='y')) +
geom_point(alpha=.25) +
facet_wrap('variable') +
theme(figure_size=(10,3))
)
# %% Build Model -----------------------------------------
# Linear model
mod_lm = LM()
mod_lm.fit(X,y)
# Random Forest model
mod_rf = RF()
mod_rf.fit(X,y)
# RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))
# Performance on future data.
test_X,test_y = gen_data(N=250,seed=2000)
m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))
# %% -----------------------------------------
# Same idea as the PDP, just don't average
X_tmp = X.copy()
target_var = 1
possible_values = np.arange(-3,3,.25)
ice_store = []
for val in possible_values:
    # Manipulate the value
    X_tmp[:, target_var] = val
    # Predict
    pred = mod_rf.predict(X_tmp)
    # Store the individual predictions (one row per observation at this grid value)
    entry = pd.DataFrame(dict(id=np.arange(pred.shape[0]),
                              val=np.repeat(val, pred.shape[0]),
                              pred=pred))
    ice_store.append(entry)
# Combine into a single data frame (DataFrame.append was removed in pandas 2.0)
ice_store = pd.concat(ice_store, ignore_index=True)
ice_store
# %% -----------------------------------------
# Visualize
(
ggplot(data=ice_store) +
geom_line(mapping=aes(x="val",y="pred",group="id"),alpha=.1) +
theme_bw() +
theme(figure_size = (10,5))
)
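# %% -----------------------------------------
# For comparison: recent versions of scikit-learn (1.0+) can draw ICE curves
# directly via sklearn.inspection.PartialDependenceDisplay. A minimal sketch;
# kind='both' overlays the average (PDP) on the individual curves, and the
# figure renders with matplotlib rather than plotnine.
from sklearn.inspection import PartialDependenceDisplay
PartialDependenceDisplay.from_estimator(mod_rf, X, features=[target_var], kind='both')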
# %% -----------------------------------------
# PDP on top of the ICE
pdp = (ice_store
.groupby("val").pred.mean()
.reset_index()
.rename(columns={"pred":'pdp'})
.merge(ice_store,on="val",how="right")
)
(
ggplot(data=pdp) +
geom_line(mapping=aes(x="val",y="pred",group="id"),alpha=.1) +
geom_line(mapping=aes(x="val",y="pdp"),color="darkred",size=1.5) +
theme_bw() +
theme(figure_size = (10,5))
)
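# %% -----------------------------------------
# A common variant (not covered above) is the centered ICE plot: subtract each
# observation's prediction at the left edge of the grid so every curve starts
# at zero, which makes heterogeneous slopes easier to compare. A minimal
# sketch built from the ice_store data frame above.
c_ice = ice_store.copy()
# Prediction at the smallest grid value, per observation
anchor = (c_ice.loc[c_ice.val == c_ice.val.min(), ['id', 'pred']]
          .rename(columns={'pred': 'anchor'}))
c_ice = c_ice.merge(anchor, on='id')
c_ice['centered'] = c_ice['pred'] - c_ice['anchor']
(
    ggplot(data=c_ice) +
    geom_line(mapping=aes(x="val", y="centered", group="id"), alpha=.1) +
    theme_bw() +
    theme(figure_size=(10, 5))
)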

The following materials were generated for students enrolled in PPOL564. Please do not distribute without permission.
ed769@georgetown.edu | www.ericdunford.com