Learning Objectives


In the Asynchronous Lecture


In the Synchronous Lecture


If you have any questions while watching the pre-recorded material, be sure to write them down and bring them up during the synchronous portion of the lecture.




Synchronous Materials





Asynchronous Materials


The following tabs contain pre-recorded lecture materials for class this week. Please review these materials prior to the synchronous lecture.

Total time: Approx. 1 hour and 7 minutes







Interpretable Machine Learning

Variable Importance (concept)

Variable Importance (implementation)


Code from the video

'''
Variable Importance
'''

# Data processing 
import pandas as pd
import numpy as np

# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m

# Plotting
from plotnine import *

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')



# %% Generate Synthetic Data -----------------------------------------


def gen_data(N=1000,seed=1234):

    # Set seed
    np.random.seed(seed)

    # Generate predictors
    x1 = np.random.normal(loc=0,scale=1, size = N)
    x2 = np.random.normal(loc=0,scale=1, size = N)
    x3 = np.random.normal(loc=0,scale=1, size = N)
    e = np.random.normal(loc=0,scale=1, size  = N)  # irreducible error

    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 + -.75*x2**2 + e

    # Organize predictors
    X = np.stack([x1,x2,x3],axis=1)

    # Return the data
    return X,y



# %% -----------------------------------------

# Generate a sample of the data.
X, y = gen_data(N=1000)

# Plot the synthetic data
D = pd.DataFrame(dict(y=y, x1=X[:,0], x2=X[:,1], x3=X[:,2]))
(
    ggplot(D.melt(id_vars='y'),
           aes(x="value",y='y')) +
    geom_point(alpha=.25) +
    facet_wrap('variable') +
    theme(figure_size=(10,5))
)



# %% -----------------------------------------

# Build models

# Linear model 
mod_lm = LM()
mod_lm.fit(X,y)

# Random Forest Model
mod_rf = RF()
mod_rf.fit(X,y)


# Performance: RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))

# Performance on future data.
test_X, test_y = gen_data(N=1000, seed=2000)


m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))



# %% -----------------------------------------

# Variable Importance 

mod_lm.coef_
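
# Pair the coefficients with the column names for readability. A small
# convenience step (not in the video); assumes the column order x1, x2, x3
# used when stacking X above.
pd.Series(mod_lm.coef_, index=['x1', 'x2', 'x3'])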


# %% -----------------------------------------

v = np.array([1,3,5,2,6])
np.random.permutation(v)

# Idea: Permute the order of a specific feature to break the statistical
# association with the outcome. If the variable matters, then the model should
# perform WORSE.

# Make a copy (so we don't distort the actual data)
tmp = X.copy()

# permute a specific column.
tmp[:,0] = np.random.permutation(tmp[:,0])

baseline_performance = m.mean_squared_error(y,mod_rf.predict(X))
perm_perf = m.mean_squared_error(y,mod_rf.predict(tmp))

# increase in the MSE
perm_perf - baseline_performance


# %% -----------------------------------------

# Let's build out a permutation variable importance method. 

def permutation_vi(model,X,y,n_permutations = 25):

    # What is our baseline performance (i.e., performance before we permute anything)?
    baseline_performance = m.mean_squared_error(y,model.predict(X))

    # Build a container (using a dictionary comprehension)
    store = {f'x_{i}':[] for i in range(X.shape[1])}

    # Iterate and permute the variables one at a time.
    for i, key in enumerate(store.keys()):

        perms = n_permutations
        tmp = X.copy()

        # permute multiple times. 
        while perms > 0:

            # Permute the order of the variable 
            tmp[:,i] = np.random.permutation(tmp[:,i])

            # Calculate the error when predicting with the "corrupted" variable 
            perm_perf = m.mean_squared_error(y,model.predict(tmp)) 

            # Calculate the change in error
            change = perm_perf - baseline_performance

            # Store the change in the error 
            store[key].append(change)

            # tick down
            perms -= 1

    # Organize as a data frame
    vi = pd.DataFrame(store)

    return vi


vi = permutation_vi(mod_rf,X,y)

vi
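
# %% -----------------------------------------

# Quick summary (not in the video): rank the features by their average
# increase in MSE across the permutation runs.
vi.mean().sort_values(ascending=False)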

# %% -----------------------------------------


# Plot Variable Importance 

(
    ggplot(vi.melt(),
           aes(x="variable",y="value")) +
    geom_boxplot() +
    coord_flip() +
    ylim(0,5)
)

# %% -----------------------------------------

# Look at which variables the model relied on using the test data
vi2 = permutation_vi(mod_rf,test_X,test_y)

(
    ggplot(vi2.melt(),
           aes(x="variable",y="value")) +
    geom_boxplot() +
    coord_flip() +
    ylim(0,5)
)

# %% -----------------------------------------

# Let's use a completely different class of model; the
# method still works. This is the "model agnostic" part.

from sklearn.svm import SVR
mod_svr = SVR()
mod_svr.fit(X,y)

vi3 = permutation_vi(mod_svr,X,y)


(
    ggplot(vi3.melt(),
           aes(x="variable",y="value")) +
    geom_boxplot() +
    coord_flip() +
    ylim(0,5)
)
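

For comparison, scikit-learn ships a built-in version of this procedure: sklearn.inspection.permutation_importance. The sketch below is not from the video and reuses mod_rf, X, and y from the script above; with scoring='neg_mean_squared_error', the reported importances are the average increase in MSE when a column is permuted, so they should roughly match the hand-rolled permutation_vi() results.

# %% -----------------------------------------

from sklearn.inspection import permutation_importance

# n_repeats mirrors the n_permutations argument in permutation_vi()
pi = permutation_importance(mod_rf, X, y, n_repeats=25,
                            scoring='neg_mean_squared_error',
                            random_state=1234)
pi.importances_mean  # one entry per column, in column order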



Partial Dependencies

Code from the video

'''
Partial Dependency Plots
'''

import pandas as pd
import numpy as np

# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m

from plotnine import *

import warnings
warnings.filterwarnings('ignore')

# %% Generate Synthetic Data -----------------------------------------

def gen_data(N=1000,seed=1234):

    # Set seed
    np.random.seed(seed)

    # Generate predictors
    x1 = np.random.normal(loc=0,scale=1, size = N)
    x2 = np.random.normal(loc=0,scale=1, size = N)
    x3 = np.random.normal(loc=0,scale=1, size = N)
    e = np.random.normal(loc=0,scale=1, size  = N)  # irreducible error

    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 + -.75*x2**2 + e

    # Organize predictors
    X = np.stack([x1,x2,x3],axis=1)

    # Return the data
    return X,y


# %% -----------------------------------------

# Generate a sample of the data.
X, y = gen_data(N=1000)

# Plot the synthetic data
D = pd.DataFrame(dict(y=y, x1=X[:,0], x2=X[:,1], x3=X[:,2]))
(
    ggplot(D.melt(id_vars='y'),
           aes(x="value",y='y')) +
    geom_point(alpha=.25) +
    facet_wrap('variable') +
    theme(figure_size=(10,5))
)



# %% Build Model -----------------------------------------


# Linear model
mod_lm = LM()
mod_lm.fit(X,y)


# Random Forest model
mod_rf = RF()
mod_rf.fit(X,y)

# RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))

# Performance on future data.
test_X,test_y = gen_data(N=1000,seed=2000)

m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))

# %% -----------------------------------------

# The idea 

X_tmp = X.copy()


# What is the average prediction when X1 == 0?
X_tmp[:,0] = 0
X_tmp
pred = mod_rf.predict(X_tmp)
pred.mean()


# What is the average prediction when X1 == 1?
X_tmp[:,0] = 1
pred = mod_rf.predict(X_tmp)
pred.mean()


# What is the average prediction when X1 == 2?
X_tmp[:,0] = 2
pred = mod_rf.predict(X_tmp)
pred.mean()

# %% -----------------------------------------

X_tmp = X.copy()
target_var = 1
possible_values = np.arange(-3,3,.25)

# Container 
store = list()

for val in possible_values:

    # Manipulate the value 
    X_tmp[:,target_var] = val

    # Average Prediction
    pred = mod_rf.predict(X_tmp).mean()

    # Store this 
    store.append([val,pred])

marginal_effect = pd.DataFrame(store,columns = ['val','pred'])

# %% -----------------------------------------

(
    ggplot(marginal_effect,
           aes(x="val",y="pred")) +
    geom_path() +
    theme(figure_size=(10,5))
)


# %% -----------------------------------------

# Partial dependency for two variables (column 0 = x1 and column 2 = x3)

X_tmp = X.copy()
possible_values = np.arange(-3,3,.25)
target_a = 0  # x1
target_b = 2  # x3

# Container 
store = list()
for a in possible_values:

    # Manipulate the value 
    X_tmp[:,target_a] = a

    for b in possible_values:

        # Manipulate the value 
        X_tmp[:,target_b] = b

        # Average Prediction
        pred = mod_rf.predict(X_tmp).mean()

        # Store this 
        store.append([a,b,pred])


marginal_effect = pd.DataFrame(store, columns=['x1','x3','pred'])

# %% -----------------------------------------

# Visualize 

(
    ggplot(marginal_effect,
           aes(x="x0",y="x1",fill="pred")) +
    geom_tile() +
    theme(figure_size=(7,7))
)
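

scikit-learn also bundles partial dependence utilities in sklearn.inspection. The sketch below is not from the video and assumes scikit-learn >= 1.0; it reuses mod_rf and X from above and plots with matplotlib rather than plotnine, but it computes the same average-prediction curves we built by hand.

# %% -----------------------------------------

from sklearn.inspection import PartialDependenceDisplay

# One-dimensional partial dependency for column 1 (x2)
PartialDependenceDisplay.from_estimator(mod_rf, X, features=[1])

# Two-dimensional partial dependency, analogous to the geom_tile() plot above
PartialDependenceDisplay.from_estimator(mod_rf, X, features=[(0, 2)])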



Individual Conditional Expectations

Code from the video

'''
Individual Conditional Expectations
'''

import pandas as pd
import numpy as np

# Learning Models
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import LinearRegression as LM
import sklearn.metrics as m

from plotnine import *

import warnings
warnings.filterwarnings('ignore')


# %% Generate Synthetic Data -----------------------------------------

def gen_data(N=1000,seed=1234):

    # Set seed
    np.random.seed(seed)

    # Generate predictors
    x1 = np.random.normal(loc=0,scale=1, size = N)
    x2 = np.random.normal(loc=0,scale=1, size = N)
    x3 = np.random.normal(loc=0,scale=1, size = N)
    e = np.random.normal(loc=0,scale=1, size  = N)  # irreducible error

    # Generate outcome (which is a function of the predictors -- x3 intentionally left out)
    y = -2*np.cos(x1)*np.sin(x2) - x1 + x2 + -.75*x2**2 + e

    # Organize predictors
    X = np.stack([x1,x2,x3],axis=1)

    # Return the data
    return X,y


# %% -----------------------------------------

# Generate a sample of the data.
X, y = gen_data(N=250)

# Plot the synthetic data
D = pd.DataFrame(dict(y=y, x1=X[:,0], x2=X[:,1], x3=X[:,2]))
(
    ggplot(D.melt(id_vars='y'),
           aes(x="value",y='y')) +
    geom_point(alpha=.25) +
    facet_wrap('variable') +
    theme(figure_size=(10,3))
)



# %% Build Model -----------------------------------------


# Linear model
mod_lm = LM()
mod_lm.fit(X,y)


# Random Forest model
mod_rf = RF()
mod_rf.fit(X,y)

# RF > LM
m.mean_squared_error(y,mod_lm.predict(X))
m.mean_squared_error(y,mod_rf.predict(X))

# Performance on future data.
test_X,test_y = gen_data(N=250,seed=2000)

m.mean_squared_error(test_y,mod_lm.predict(test_X))
m.mean_squared_error(test_y,mod_rf.predict(test_X))


# %% -----------------------------------------

# Same idea as the PDP, just don't average

X_tmp = X.copy()
target_var = 1
possible_values = np.arange(-3,3,.25)

ice_store = pd.DataFrame()
for val in possible_values:

    # Manipulate the value
    X_tmp[:,target_var] = val

    # Predict
    pred = mod_rf.predict(X_tmp)

    # Store the individual predictions
    vals = [val for i in range(pred.shape[0])]
    ids = list(range(pred.shape[0]))
    entry = pd.DataFrame(dict(id=ids, val=vals, pred=pred))
    # DataFrame.append() was removed in pandas 2.x; concatenate instead
    ice_store = pd.concat([ice_store, entry], ignore_index=True)

ice_store

# %% -----------------------------------------

# Visualize
(
    ggplot(data=ice_store) +
    geom_line(mapping=aes(x="val",y="pred",group="id"),alpha=.1) +
    theme_bw() +
    theme(figure_size = (10,5))
)



# %% -----------------------------------------

# PDP on top of the ICE

pdp = (ice_store
       .groupby("val").pred.mean()
       .reset_index()
       .rename(columns={"pred":'pdp'})
       .merge(ice_store,on="val",how="right")
       )


(
    ggplot(data=pdp) +
    geom_line(mapping=aes(x="val",y="pred",group="id"),alpha=.1) +
    geom_line(mapping=aes(x="val",y="pdp"),color="darkred",size=1.5) +
    theme_bw() +
    theme(figure_size = (10,5))
)
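

The same overlay is available out of the box in scikit-learn: PartialDependenceDisplay.from_estimator accepts a kind argument, where kind='individual' draws only the ICE curves and kind='both' layers the PDP average on top, much like the plot above. A minimal sketch, not from the video, assuming scikit-learn >= 1.0 and reusing mod_rf and X:

# %% -----------------------------------------

from sklearn.inspection import PartialDependenceDisplay

# ICE curves plus their average (the PDP) for column 1 (x2)
PartialDependenceDisplay.from_estimator(mod_rf, X, features=[1], kind='both')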



 

The following materials were generated for students enrolled in PPOL564. Please do not distribute without permission.

ed769@georgetown.edu | www.ericdunford.com