import numpy as np
import sympy as sp
import statsmodels.formula.api as smf
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import column, row, gridplot
output_notebook()
sp.init_printing(use_unicode=True)
# Consider the following function...
z = sp.symbols('z')
sym_f = (z**2 + 1) - sp.exp(z) - 20*sp.sin(z)
f = sp.lambdify(z,sym_f,'numpy')
df = sp.lambdify(z,sym_f.diff(),'numpy')
ddf = sp.lambdify(z,sym_f.diff().diff(),'numpy')
x = np.arange(-10,4,.1) # Generate a numerical range
p = figure(height=300,width=700)
p.line(x,f(x),line_width=3)
show(p)
"Newton–Raphson method" is a method for finding successively better approximations to the roots (or zeroes) of a real-valued function (Wiki).
Goal is to find the $x^*$ to minimize $f(x)$.
As we seen, if $f(x)$ is convex, we need to located $x^*$ such that $f′ (x^∗) = 0$
Let $g(x) = f′(x)$. Take Taylor expansion about the optimum solution $x^∗$ (see reading on what a Taylor series expansion is but all in all it's just an polynomial approximation for the shape of a function at a given point):
$$g(x^∗ ) = g(x) + (x^∗ − x)g'(x) + \text{“negligible” higher order terms} $$
Because $g(x ∗ ) = f′(x^∗ ) = 0$,
$$ 0 \approx g(x) + (x^∗ − x)g′(x) \to x^∗ \approx x − \frac{h(x)}{h′(x)} = x - \frac{f'(x)}{f′′(x)} $$
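To make the Taylor piece concrete, we can ask SymPy for the first-order expansion of $g(z) = f'(z)$ around a point. This is just a quick sketch using the `sym_f` defined above; the expansion point $z_0 = -1$ is an arbitrary choice.
# First-order Taylor expansion of g(z) = f'(z) around z0 = -1 (an arbitrary point):
# g(z0) + (z - z0)*g'(z0) + O((z - z0)**2) -- the approximation Newton-Raphson solves.
g = sym_f.diff()
sp.series(g, z, x0=-1, n=2)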
x = -1    # Starting value
on = True
while on:
    new_x = x - (df(x)/ddf(x))        # Newton-Raphson update on f'(x)
    if np.isclose(new_x, x, .001):    # Converged: the proposal barely moves
        if ddf(new_x) > 0:
            ctype = "Minimum"
        elif ddf(new_x) < 0:
            ctype = "Maximum"
        else:
            ctype = "Inflection"
        print(f"Converged at {round(new_x,3)} ({ctype})")
        on = False
    else:
        print(f'current: {round(x,3)}, proposed: {round(new_x,3)}')
        x = new_x
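As a quick sanity check (not part of the method itself), SymPy's numerical solver applied to $f'(z) = 0$ from the same starting value should land on roughly the same critical point. A sketch, assuming `sym_f` and `z` are still in scope:
# Cross-check the converged value with SymPy's numerical root finder on f'(z) = 0.
sp.nsolve(sym_f.diff(), z, -1)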
x = np.arange(-10,4,.1) # Generate a numerical range
p = figure(height=300,width=700)
p.line(x,f(x),line_width=3,alpha=.3,legend="f(x)")
p.scatter(new_x,f(new_x),color="red",size=6,alpha=.4,legend="proposed x")
p.yaxis.axis_label = 'f(x)'
p.xaxis.axis_label = 'x'
show(p)
What happens when we change the starting value? The sketch below explores this.
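One way to find out is to run the same Newton–Raphson update from several different starting values and see where each one converges. This is just an exploratory sketch, reusing `df` and `ddf` from above; the starting values are arbitrary choices.
# Try the Newton-Raphson update from several starting points and report where each lands.
for start in [-8, -4, -1, 1, 3]:
    x_i = start
    for _ in range(100):                  # Cap the number of iterations
        x_new = x_i - df(x_i)/ddf(x_i)    # Newton-Raphson update on f'(x)
        if np.isclose(x_new, x_i, .001):
            break
        x_i = x_new
    kind = "Minimum" if ddf(x_i) > 0 else ("Maximum" if ddf(x_i) < 0 else "Inflection")
    print(f"start = {start:>3} -> converged near {round(x_i, 3)} ({kind})")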
Now let's consider an approach that only requires information about the rate of change (the first derivative) of the function: gradient descent. Select some initial location ($x$), calculate the derivative at that location, then take a "step" in the direction that decreases the function, i.e., against the sign of the derivative.
prop_x = 1
step = .05
new_x = prop_x - step*df(prop_x)   # Step against the derivative (downhill)
new_x
Let's do this repeatedly until the values don't appear to change.
prop_x = -2   # Starting value
step = .05    # Step size
iters = 100   # Maximum number of iterations
vals = []
for i in range(iters):
    new_x = prop_x - step*df(prop_x)     # Step against the derivative
    if np.isclose(new_x, prop_x, .001):  # Converged: the proposal barely moves
        print(f'Converged on iteration {i} at value {round(new_x,3)}')
        vals.append([new_x, f(new_x)])
        break
    else:
        prop_x = new_x
        vals.append([prop_x, f(prop_x)])
vals = np.array(vals)
x = np.arange(-10,4,.1) # Generate a numerical range
p = figure(height=300,width=700)
p.line(x,f(x),line_width=3,alpha=.3,legend="f(x)")
p.scatter(vals[:,0],vals[:,1],color="red",size=6,alpha=.4,legend="proposed x")
p.yaxis.axis_label = 'f(x)'
p.xaxis.axis_label = 'x'
p2 = figure(height=200,width=700)
p2.line(np.arange(i+1),vals[:,0],line_width=3,alpha=.6,color="red")
p2.xaxis.axis_label = 'Iterations'
p2.yaxis.axis_label = 'Proposed x'
show(column(p,p2))
What happens when we change the learning rate (step size) or the starting value?
How do we ensure we're locating the global minimum rather than a local one?
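There is no general guarantee with this kind of local search, but a common practical answer is to restart the algorithm from several different starting values and keep the candidate with the lowest $f(x)$. A minimal sketch, assuming `f` and `df` from above; the starting values, step size, and iteration cap are arbitrary choices.
# Multi-start gradient descent: run the same update from several starting points
# and keep the candidate with the smallest f(x).
candidates = []
for start in [-9, -6, -3, 0, 3]:
    prop = start
    for _ in range(500):
        new = prop - .05*df(prop)        # Step against the derivative
        if np.isclose(new, prop, .001):
            break
        prop = new
    candidates.append((prop, f(prop)))
best_x, best_f = min(candidates, key=lambda c: c[1])
print(f"best candidate: x = {round(best_x, 3)}, f(x) = {round(best_f, 3)}")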
To know what results we should expect, let's simulate some data...
N = 100
x = np.random.normal(0,1,N)
x2 = np.random.normal(0,1,N)
y = 1 + 2*x + 3*x2 + np.random.normal(0,1,N)
# Incorporate as design matrix
X = np.column_stack([np.ones(x.shape),x,x2])
X[1:5,:]
The data generating process
$$ y_i = \beta_0 + \beta_1 x_{1i} + \beta_2 x_{2i} + \epsilon_i $$
Our function that approximates this linear process
$$ \hat{y}_i = \hat{\beta}_0 + \hat{\beta}_1 x_{1i} + \hat{\beta}_2 x_{2i} = X\hat{\beta} $$
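In code, the matrix form gives all of the fitted values in one line. A small sketch, using the true coefficients $(1, 2, 3)$ as a stand-in for $\hat{\beta}$:
# Fitted values for the whole sample: X times a coefficient vector.
yhat = X.dot(np.array([1, 2, 3]))
yhat[:5]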
Here we'll leverage information about the gradient to minimize the cost function. The cost function in this context is defined as the mean squared error:
$$ cost(y,X,\beta) = \frac{1}{N}\sum_{i=1}^N (y_i - \hat{y}_i)^2 = \frac{e' e}{N}$$
Here is the cost function in algebraic terms...
def cost(y, X, B):
    '''
    Cost function: the mean squared error.
    '''
    x1 = X[:,1]   # First predictor (column 0 is the constant)
    x2 = X[:,2]   # Second predictor
    e = sum((y - (B[0] + B[1]*x1 + B[2]*x2))**2)
    return e/y.shape[0]
cost(y,X,B=np.array([1,2,3]))
And here it is in linear algebra terms...
def cost(y, X, B):
    '''
    Cost function: the mean squared error.
    '''
    e = y - X.dot(B)   # Residual vector
    n = e.shape[0]     # Number of observations
    return e.dot(e)/n
cost(y,X,B=np.array([1,2,3]))
$$ -\nabla cost = - \begin{pmatrix} \partial_{\beta_0} cost \\ \partial_{\beta_1} cost \\ \partial_{\beta_2} cost \end{pmatrix} = \frac{2}{N} \begin{pmatrix} \sum_{i=1}^N (y_i - \hat{y}_i) \\ \sum_{i=1}^N x_{1i}(y_i - \hat{y}_i) \\ \sum_{i=1}^N x_{2i}(y_i - \hat{y}_i) \end{pmatrix} $$
def calc_gradients(y, X, B):
    '''
    Gradient of the cost function with respect to each coefficient.
    '''
    n = y.shape[0]
    betas = []
    # Intercept term: -(2/n) * sum(residuals)
    betas.append(-sum( 2*(y - X.dot(B)) )/n)
    # Slope terms: -(2/n) * sum(x_i * residuals)
    for i in range(1, B.shape[0]):
        betas.append(-sum( 2*X[:,i]*(y - X.dot(B)) )/n)
    return np.array(betas)
Preview how this works...
initial_B = np.array([1,2,3])
calc_gradients(y,X,initial_B)
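In linear algebra terms the same gradient is $\frac{-2}{N} X'(y - X\beta)$, so a vectorized version should agree with the loop-based one element for element. A sketch (not the function used below):
# Vectorized gradient of the MSE cost: -2/N * X'(y - XB).
def calc_gradients_vec(y, X, B):
    n = y.shape[0]
    return -2/n * X.T.dot(y - X.dot(B))

# Should print True if the two implementations agree.
np.allclose(calc_gradients(y, X, initial_B), calc_gradients_vec(y, X, initial_B))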
As we did above, let's use the gradient to take steps in the right direction. We'll establish a "learning rate" that determines how far we step in that direction.
The following code has a lot of embellishments, but only a few lines actually matter: the gradient calculation and the coefficient update.
np.random.seed(123)     # Seed for reproducibility
B = np.random.rand(3)   # Initial (random) betas
learning_rate = .1      # Learning rate
iterations = 50         # Number of iterations
stats = []              # Let's save the state of the algorithm
for i in range(iterations):
    mse = cost(y,X,B)   # Calculate the mean squared error
    # Print off every 5th iteration...
    if i % 5 == 0:
        print('iteration:',i,"Loss:",mse.round(2),'Coefs:',B.round(2))
    # The bit that matters!
    gradients = calc_gradients(y,X,B)
    B = B - (gradients*learning_rate)   # New betas
    # Record values
    stats.append(np.hstack([mse,B]))
# Convert to a numpy array
stats = np.array(stats)
Visualize the loss and the coefficient paths over the iterations.
# Generate plot
p = figure(height=400,width=400)
p.xaxis.axis_label = "Iterations"
p.yaxis.axis_label = "Mean Squared Error"
p.line(np.arange(stats.shape[0]),stats[:,0],line_width=3)
p2 = figure(height=400,width=400)
p2.xaxis.axis_label = "Iterations"
p2.yaxis.axis_label = "Coeffients"
p2.line(np.arange(stats.shape[0]),stats[:,1],color="red",line_width=3,legend="B0")
p2.line(np.arange(stats.shape[0]),stats[:,2],color="orange",line_width=3,legend="B1")
p2.line(np.arange(stats.shape[0]),stats[:,3],color="pink",line_width=3,legend="B2")
# Print plot
show(row(p,p2))
# Our betas retrieved through gradient descent.
B
Compare the coefficients we recovered via gradient descent to the statsmodels OLS output.
model = smf.ols('y ~ x1 + x2', data=pd.DataFrame(dict(y=y,x1=X[:,1],x2 = X[:,2]))).fit()
model.summary()
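For one more cross-check, the closed-form OLS solution from the normal equations, $\hat{\beta} = (X'X)^{-1}X'y$, should produce essentially the same coefficients. A sketch using numpy's linear algebra routines:
# Closed-form OLS: solve (X'X) b = X'y.
np.linalg.solve(X.T.dot(X), X.T.dot(y))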