import numpy as np
import pandas as pd
import sympy as sp
import requests
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import column, row, gridplot
output_notebook()
sp.init_printing(use_unicode=True)
Download Data from GitHub
def download_data(git_loc, dest_name):
    '''
    Download data from GitHub and save it to the notebook's working directory.
    '''
    req = requests.get(git_loc)
    with open(dest_name, "w") as file:
        file.write(req.text)
download_data('https://raw.githubusercontent.com/edunford/ppol564/master/lectures/lecture_22/test_data.csv',
"test_data.csv")
download_data('https://raw.githubusercontent.com/edunford/ppol564/master/lectures/lecture_22/training_data.csv',
"training_data.csv")
Many social science problems involve classifying states of the world (war/peace, voting, event occurrences, etc.). The Bernoulli distribution (a special case of the binomial distribution) describes a single trial with two possible states (e.g., heads/tails). We'll code the two states as 1 and 0, so $y \in \{0,1\}$.
The Bernoulli distribution's probability mass function is
$$ p^y(1-p)^{1-y} $$
Here $p$ is the probability of the event happening and $1-p$ the probability of it not happening: the function returns $p$ when $y=1$ and $1-p$ when $y=0$.
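For a quick illustration (a minimal sketch with a made-up probability, not the lecture data), we can evaluate this probability mass function at both outcomes:
# Minimal sketch: evaluate the Bernoulli PMF p^y * (1-p)^(1-y) at both outcomes
# for a made-up probability (p_event is purely illustrative).
p_event = .7  # hypothetical probability that y = 1
for y_val in [0, 1]:
    print(y_val, p_event**y_val * (1 - p_event)**(1 - y_val))  # prints 1-p for y=0, p for y=1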
Our goal is to model that probability: we want to find the weights that make the observed outcomes as likely as possible. As before, we can state the problem as minimizing a cost (error), which is equivalent to maximizing the likelihood of observing the data.
How can we model a probability?
Probabilities are special: they are bounded between 0 and 1. Thus, we need a function that obeys those bounds. Luckily, we have a range of functional forms to choose from. A function that maps a continuous value (a line) onto a probability is known as a "link function". We'll look at a specific kind of link function: the logistic (or sigmoid) function.
Let's define the logistic function $\nu$ (sigmoid function) as
$$ \nu(z) = \frac{1}{1+e^{-z}} $$
$$ z = X\beta $$
Note that $z$ is our usual linear projection; $\nu(\cdot)$ maps that line onto the probability space $(0,1)$.
# Defining the logistic function...
def s(z):
    '''sigmoid function'''
    return 1/(1+np.exp(-z))

def ds(z):
    '''first-order derivative of the sigmoid function'''
    return np.exp(-z)/(1 + np.exp(-z))**2
Let's get a feel for its behavior and its first derivative.
x = np.arange(-7,7,.1) # Generate a numerical range
p = figure(height=300,width=700)
p.line(x,s(x),line_width=3,legend='logistic',alpha=.5)
p.line(x,ds(x),line_width=3,color="orange",
alpha=.5,legend="df/dx logistic")
p.legend.location = "top_left"
show(p)
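As a side note (a small symbolic sketch using sympy, which is already imported), the derivative of the sigmoid can also be written as $\nu(z)(1-\nu(z))$, which is just another way of expressing the ds() function above:
# Sketch: confirm that d/dz [1/(1+e^-z)] equals s(z)*(1-s(z))
z_check = sp.symbols('z_check')
sig = 1/(1 + sp.exp(-z_check))
sp.simplify(sig.diff(z_check) - sig*(1 - sig))  # should reduce to 0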
With this function, we can build a model for the probability.
$$ pr(y = 1) = \nu(z) $$
$$ \nu(z)^y(1-\nu(z))^{1-y} $$
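For example, with some hypothetical weights and data rows (made up here purely for illustration, not estimates), the model maps each linear projection onto a probability:
# Sketch: hypothetical weights and rows; s() maps the linear projection to probabilities.
B_example = np.array([0.5, -1.0, 2.0])       # made-up weights, not estimates
X_example = np.array([[1, 0.2, 0.7],
                      [1, -1.5, 0.1]])       # made-up rows: intercept, x1, x2
s(X_example.dot(B_example))                  # pr(y = 1) for each row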
D = pd.read_csv("training_data.csv")
D.head()
# Outcome
y = D.y
# Generate design matrix.
X = np.column_stack([np.ones(D.shape[0]),D.x1,D.x2])
X[:5,:]
The likelihood function (the individual likelihoods multiplied together) is
$$ L = \prod_{i=1}^N \nu(z_{i})^{y_{i}}(1-\nu(z_i))^{1-y_i} $$
The log likelihood function is
$$ l = \sum_{i=1}^N y_i \log(\nu(z_i)) + (1-y_i) \log(1-\nu(z_i)) $$
Taking the negative of the average log likelihood gives us our cost function, so minimizing the cost maximizes the likelihood...
$$ C = - \frac{1}{N} \sum_{i=1}^N y_i \log(\nu(z_i)) + (1-y_i) \log(1-\nu(z_i)) $$
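As a quick sanity check (a toy sketch with made-up probabilities and outcomes, not the lecture data), the log of the product-likelihood equals the sum of the individual log-likelihoods, and the cost is just the negative of its average:
# Toy sketch: three made-up predicted probabilities and outcomes.
nu_toy = np.array([.8, .3, .6])
y_toy = np.array([1, 0, 1])
L_toy = np.prod(nu_toy**y_toy * (1 - nu_toy)**(1 - y_toy))              # likelihood
l_toy = np.sum(y_toy*np.log(nu_toy) + (1 - y_toy)*np.log(1 - nu_toy))   # log likelihood
print(np.log(L_toy), l_toy)   # the two agree
print(-l_toy/len(y_toy))      # the cost: negative average log likelihood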
Let's get a feel for how this cost function behaves...
outcome = 1
pr = np.arange(.001,1,.001)
p = figure(height=400,width=400,title="y = 1")
p.line(pr,
-outcome*np.log(pr),
line_width=3,alpha=.5,
legend='-y*log(pr)')
p.line(pr,
-(1-outcome)*np.log(1-pr),
line_width=3,alpha=.5,
color = 'orange',
legend='(1-y)*log(1-pr)')
p.xaxis.axis_label = "Est. Probability"
p.yaxis.axis_label = "Cost"
outcome = 0
p2 = figure(height=400,width=400,title="y = 0")
p2.line(pr,
-outcome*np.log(pr),
line_width=3,alpha=.5,
legend='-y*log(pr)')
p2.line(pr,
-(1-outcome)*np.log(1-pr),
line_width=3,alpha=.5,
color = 'orange',
legend='(1-y)*log(1-pr)')
p2.xaxis.axis_label = "Est. Probability"
p2.yaxis.axis_label = "Cost"
p2.legend.location = "top_left"
show(row(p,p2))
Let's write a function to quickly compute the cost, given some candidate coefficients ("weights")...
def cost(y,X,B):
    '''
    Cross-Entropy Cost
    '''
    z = X.dot(B)
    n = len(y)
    return -sum(y*np.log(s(z))+(1-y)*np.log(1-s(z)))/n
cost(y,X,B=np.array([0,0,0]))
Let's derive the gradient of the cost function with respect to our parameters/"weights", $\beta$.
# Symbolic set-up: an intercept, one predictor, and a binary outcome
b0,b1,x,y_sym = sp.symbols('b0 b1 x y')
z_sym = b0 + b1*x                      # linear projection
s_sym = 1/(1+sp.exp(-z_sym))           # sigmoid
c = -y_sym*sp.log(s_sym)-(1-y_sym)*sp.log(1-s_sym)  # cross-entropy cost for one observation
c
Taking the derivative with respect to $\beta_0$
c.diff(b0).simplify()
$$ \frac{\partial C}{\partial \beta_0}= \frac{- y e^{- b_{0} - b_{1} x} - y + 1}{e^{- b_{0} - b_{1} x} + 1} $$
$$ \frac{-y(e^{- b_{0} - b_{1} x}+1)+1}{e^{- b_{0} - b_{1} x} + 1}$$
$$ \frac{-y(e^{- b_{0} - b_{1} x}+1)}{e^{- b_{0} - b_{1} x} + 1} + \frac{1}{e^{- b_{0} - b_{1} x} + 1}$$
$$ \frac{-y(e^{- b_{0} - b_{1} x}+1)}{e^{- b_{0} - b_{1} x} + 1} + \nu(z)$$
$$ \frac{\partial C}{\partial \beta_0}= \nu(z) -y$$
Now, the derivative with respect to $\beta_1$
c.diff(b1).simplify()
$$\frac{\partial C}{\partial \beta_1}= \frac{x(- y e^{- b_{0} - b_{1} x} - y + 1)}{e^{- b_{0} - b_{1} x} + 1} $$
$$ x(\frac{-y(e^{- b_{0} - b_{1} x}+1)+1}{e^{- b_{0} - b_{1} x} + 1})$$
$$ x(\frac{-y(e^{- b_{0} - b_{1} x}+1)}{e^{- b_{0} - b_{1} x} + 1} + \frac{1}{e^{- b_{0} - b_{1} x} + 1})$$
$$ \frac{\partial C}{\partial \beta_1}= x(\nu(z)-y)$$
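As a quick check (a small symbolic sketch), we can ask sympy to confirm that both raw derivatives reduce to the simplified forms above:
# Sketch: the raw derivatives minus the simplified forms should cancel to 0.
print(sp.simplify(c.diff(b0) - (s_sym - y_sym)))      # should print 0
print(sp.simplify(c.diff(b1) - x*(s_sym - y_sym)))    # should print 0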
We now have everything we need to calculate the gradient of the cost function, and we can see that these expressions scale quite beautifully to any number of inputs: for each feature $x_j$ (including the constant), $\frac{\partial C}{\partial \beta_j} = x_j(\nu(z)-y)$.
def calc_gradient(y,X,B):
    '''
    Calculate and average the gradient for every entry in the data.
    '''
    z = X.dot(B)
    n = len(y)
    grad = [sum(s(z)-y)/n]
    for i in range(1,X.shape[1]):
        grad.append(sum(X[:,i]*(s(z)-y))/n)
    return np.array(grad)
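As an aside (a sketch of an equivalent formulation, not the version used below), the same averaged gradient can be written in one line with matrix algebra, since the column of ones in the design matrix handles the intercept automatically:
def calc_gradient_vectorized(y, X, B):
    '''Matrix-algebra version of the same averaged gradient: X'(nu(z) - y)/n.'''
    return X.T.dot(s(X.dot(B)) - y)/len(y)
With only three columns the loop and the matrix form behave the same; the matrix form simply avoids the Python-level loop as the number of features grows.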
Let's get a feel for both functions again.
B = np.array([1,1,1])
print(cost(y,X,B))
calc_gradient(y,X,B)
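Before running the descent, a quick numerical check (a sketch using central finite differences with an arbitrarily chosen step size) can confirm that the analytical gradient matches the slope of the cost function:
# Sketch: approximate the gradient with central finite differences.
eps = 1e-6                   # arbitrary small step size
fd = np.zeros(len(B))
for j in range(len(B)):
    step = np.zeros(len(B))
    step[j] = eps
    fd[j] = (cost(y, X, B + step) - cost(y, X, B - step))/(2*eps)
print(fd)   # should closely match calc_gradient(y, X, B)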
Now let's run the gradient descent algorithm for a fixed number of iterations.
np.random.seed(123)
B = np.random.uniform(-1,1,3) # starting values
learning = 1   # learning rate
n_iter = 1000  # number of iterations
stats = []
for i in range(n_iter):
    grad = calc_gradient(y,X,B)
    B = B - learning*grad
    # Store output values...
    stats.append(np.hstack([cost(y,X,B),B]))
stats = np.array(stats)
# Let's print out the resulting coefficients ("weights")
print(B)
Let's visualize the gradient descent steps...
p = figure(height=400,width=400)
p.xaxis.axis_label = "Iterations"
p.yaxis.axis_label = "Cross-Entropy"
p.line(np.arange(stats.shape[0]),stats[:,0],line_width=3)
p2 = figure(height=400,width=400)
p2.xaxis.axis_label = "Iterations"
p2.yaxis.axis_label = "Coeffients"
p2.line(np.arange(stats.shape[0]),stats[:,1],color="red",line_width=3,legend="B0")
p2.line(np.arange(stats.shape[0]),stats[:,2],color="orange",line_width=3,legend="B1")
p2.line(np.arange(stats.shape[0]),stats[:,3],color="pink",line_width=3,legend="B2")
show(row(p,p2))
# Compare our results with statsmodels' Logit implementation
import statsmodels.discrete.discrete_model as sm
sm.Logit(y,X).fit().summary()
D_test = pd.read_csv("test_data.csv")
# Outcome
y_test = D_test.y
# Generate design matrix.
X_test = np.column_stack([np.ones(D_test.shape[0]),D_test.x1,D_test.x2])
X_test[:5,:]
First, note that our coefficients weight the data in a way that minimizes the cost and maximizes the likelihood. We can use these weights to predict new data entries, which are contained in our test dataset.
probs = s(X_test.dot(B))
probs[:10].round(2)
This gives us probabilities, which we can then convert into discrete 0/1 predictions by choosing a threshold (here, .5).
prediction = probs >= .5
prediction = prediction.astype("int")
prediction[:10]
# Accuracy: what share of our predictions match the actual outcomes?
sum(prediction == y_test)/len(y_test)
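Accuracy collapses performance into a single number. As a final sketch (using pandas, which is already imported), a cross-tabulation of predictions against actual outcomes shows where the hits and misses fall:
# Sketch: cross-tabulate predicted vs. actual outcomes (a simple confusion matrix).
pd.crosstab(pd.Series(prediction, name="predicted"),
            pd.Series(y_test.values, name="actual"))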