import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as sm
import statsmodels.discrete.discrete_model as smd
import warnings
import requests
warnings.filterwarnings("ignore")
plt.style.use('ggplot')
np.random.seed(5)
N = 1000
x1 = np.random.normal(size=N)
x2 = np.random.normal(size=N)
error = np.random.normal(size=N)
# Outcome: linear in x1 and x2 plus an x1*x2 interaction, with standard normal noise
y = 1 + .5*x1 - 2*x2 + 3*x1*x2 + error
D = pd.DataFrame(dict(y=y, x1=x1, x2=x2))
D.head()
# Note: in patsy formulas, 'x1*x2' expands to x1 + x2 + x1:x2, so the main effects are included
sm.ols('y ~ x1*x2', data=D).fit().summary()
Let's simulate the data many times, retaining the coefficients from each iteration.
coefs = []
sims = 1000
N = 100
b = [1,.5,-2,3]
for i in range(sims):
    x1 = np.random.normal(size=N)
    x2 = np.random.normal(size=N)
    error = np.random.normal(size=N)
    y = b[0] + b[1]*x1 + b[2]*x2 + b[3]*x1*x2 + error
    D = pd.DataFrame(dict(y=y, x1=x1, x2=x2))
    mod = sm.ols('y ~ x1*x2', data=D).fit()
    coefs.append(mod.params.tolist())
# Convert to an array for easy column slicing
coefs = np.array(coefs)
fig, axs = plt.subplots(4, 1, sharex=True, sharey=True, figsize=(10, 8))
# Histogram of each coefficient's sampling distribution
# (histplot replaces the now-deprecated distplot)
sns.histplot(coefs[:, 0], ax=axs[0])
sns.histplot(coefs[:, 1], ax=axs[1])
sns.histplot(coefs[:, 2], ax=axs[2])
sns.histplot(coefs[:, 3], ax=axs[3])
# Plot the position of the "true" coefficient values.
axs[0].axvline(b[0],color="darkblue")
axs[1].axvline(b[1],color="darkblue")
axs[2].axvline(b[2],color="darkblue")
axs[3].axvline(b[3],color="darkblue")
# Plot the titles for each subplot
axs[0].title.set_text('Intercept')
axs[1].title.set_text('x1')
axs[2].title.set_text('x2')
axs[3].title.set_text('x1*x2')
plt.show()
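We can also confirm numerically what the plots suggest: each sampling distribution should be centered on the true value, so the column means of `coefs` should sit close to the entries of `b`.
# Mean of each coefficient's sampling distribution next to its true value
print(pd.DataFrame({'true': b, 'mean estimate': coefs.mean(axis=0)},
                   index=['Intercept', 'x1', 'x2', 'x1:x2']))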
Let's now simulate the impact of omitted variable bias. Suppose there is some unobserved variable $z$ that is correlated with $x_1$ and $x_2$. How would that bias our estimates?
In the code below everything is the same except that we add a variable $z$ that is correlated with the outcome and with both predictors.
coefs = []
sims = 1000
N = 100
b = [1,.5,-2,3]
for i in range(sims):
    z = np.random.normal(1, 3, size=N)
    x1 = np.random.normal(size=N) + z
    x2 = np.random.normal(size=N) - z
    error = np.random.normal(size=N)
    y = b[0] + b[1]*x1 + b[2]*x2 + b[3]*x1*x2 + 5*z + error
    D = pd.DataFrame(dict(y=y, x1=x1, x2=x2))
    mod = sm.ols('y ~ x1*x2', data=D).fit()  # z is omitted from the model
    coefs.append(mod.params.tolist())
# Convert to an array for easy column slicing
coefs = np.array(coefs)
As the plots below show, $z$ is a confounding variable: by omitting it from the model, we bias our estimates of $\beta_1$ and $\beta_2$.
fig, axs = plt.subplots(4, 1, sharex=True, sharey=True, figsize=(10, 8))
# Histogram of each coefficient's sampling distribution under the omitted variable
sns.histplot(coefs[:, 0], ax=axs[0])
sns.histplot(coefs[:, 1], ax=axs[1])
sns.histplot(coefs[:, 2], ax=axs[2])
sns.histplot(coefs[:, 3], ax=axs[3])
# Plot the position of the "true" coefficient values.
axs[0].axvline(b[0],color="darkblue")
axs[1].axvline(b[1],color="darkblue")
axs[2].axvline(b[2],color="darkblue")
axs[3].axvline(b[3],color="darkblue")
# Plot the titles for each subplot
axs[0].title.set_text('Intercept')
axs[1].title.set_text('x1')
axs[2].title.set_text('x2')
axs[3].title.set_text('x1*x2')
plt.show()
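The same numerical comparison now quantifies the damage: the gap between the average estimate and the true coefficient is the omitted variable bias induced by leaving $z$ out of the model.
# Average estimate minus the true value = omitted variable bias
summary = pd.DataFrame({'true': b, 'mean estimate': coefs.mean(axis=0)},
                       index=['Intercept', 'x1', 'x2', 'x1:x2'])
summary['bias'] = summary['mean estimate'] - summary['true']
print(summary)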
np.random.seed(12345)
N = 1000 # Number of observations
# Linear combination of predictors
x1 = np.random.normal(size=N) # One continuous predictor
x2 = np.random.binomial(1,p=.2,size=N) # One binary predictor
X = np.column_stack([np.ones(N),x1,x2]) # Compose as a matrix
B = np.array([.5,.7,-2]) # True Coefficients
mu = X.dot(B) # Linear predictor: the latent index, a function of x1 and x2
# Convert our linear combination into a probability space
pr = st.norm.cdf(mu,0,1)
# Convert into binary outcome using the Bernoulli distribution
y = np.random.binomial(n=1,p = pr)
# Visualize
sns.pairplot(pd.DataFrame(dict(y=y,x1=x1,x2=x2)),height=3)
plt.show()
smd.Probit(y,X).fit().summary()
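Probit coefficients live on the latent z-score scale, so they are not directly interpretable as changes in probability. One common follow-up, which statsmodels supports via get_margeff(), is to compute average marginal effects:
# Average marginal effects: the change in P(y = 1) per unit change in each predictor
probit_fit = smd.Probit(y, X).fit()
print(probit_fit.get_margeff().summary())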
np.random.seed(12345)
N = 1000 # Number of observations
# Linear combination of predictors
x1 = np.random.normal(size=N) # One continuous predictor
x2 = np.random.binomial(1,p=.2,size=N) # One binary predictor
X = np.column_stack([np.ones(N),x1,x2]) # Compose as a matrix
B = np.array([.5,.7,-2]) # True Coefficients
mu = X.dot(B) # Linear predictor: the log of the expected count
# Convert our linear combination into a rate (i.e., strictly positive)
lamb = np.exp(mu)
# Draw count outcomes from the Poisson distribution
y = np.random.poisson(lam=lamb)
# Visualize
sns.pairplot(pd.DataFrame(dict(y=y,x1=x1,x2=x2)),height=3)
plt.show()
smd.Poisson(y,X).fit().summary()
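Because the Poisson model uses a log link, exponentiated coefficients are interpretable as multiplicative effects on the expected count (incidence rate ratios); for example, the estimate for x2 should land near $e^{-2} \approx 0.14$.
# Exponentiate the coefficients to get incidence rate ratios
poisson_fit = smd.Poisson(y, X).fit()
print(np.exp(poisson_fit.params))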
"An agent-based model (ABM) is a class of computational models for simulating the actions and interactions of autonomous agents (both individual or collective entities such as organizations or groups) with a view to assessing their effects on the system as a whole. It combines elements of game theory, complex systems, emergence, computational sociology, multi-agent systems, and evolutionary programming. Monte Carlo methods are used to introduce randomness." (Wiki)
A walkthrough of how to construct Schelling's segregation model in Python follows.
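As a warm-up, here is a minimal sketch of the core mechanics under simplified assumptions: a square grid with two agent types, a single similarity threshold, and unhappy agents relocating to random empty cells. The grid size, threshold, and vacancy rate below are illustrative choices, not parameters from the walkthrough itself.
np.random.seed(42)
size, threshold, empty_frac = 20, 0.4, 0.2
# 0 = empty cell; 1 and 2 = the two agent types
grid = np.random.choice([0, 1, 2], size=(size, size),
                        p=[empty_frac, (1 - empty_frac)/2, (1 - empty_frac)/2])

def unhappy(grid, threshold):
    """Coordinates of agents whose share of like neighbors falls below threshold."""
    coords = []
    for i, j in zip(*np.nonzero(grid)):
        nb = grid[max(i-1, 0):i+2, max(j-1, 0):j+2]  # Moore neighborhood
        same = (nb == grid[i, j]).sum() - 1          # exclude the agent itself
        occupied = (nb != 0).sum() - 1
        if occupied > 0 and same / occupied < threshold:
            coords.append((i, j))
    return coords

for step in range(50):
    movers = unhappy(grid, threshold)
    if not movers:  # everyone is satisfied
        break
    empties = list(zip(*np.nonzero(grid == 0)))
    np.random.shuffle(empties)
    # Relocate each unhappy agent to a random empty cell
    for (i, j), (ei, ej) in zip(movers, empties):
        grid[ei, ej], grid[i, j] = grid[i, j], 0
    print(f"step {step}: {len(movers)} unhappy agents")
Even with a modest threshold like 0.4, the population typically sorts into homogeneous clusters within a few dozen steps, which is the emergent segregation Schelling's model is known for.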