# Data Science from Scratch: First Principles with Python (2015)

### Chapter 8. Gradient Descent

Those who boast of their descent, brag on what they owe to others.

Seneca

# The Idea Behind Gradient Descent

def`sum_of_squares(v):`

` `*"""computes the sum of squared elements in v"""*

return`sum(v_i`

`**`

`2`

for`v_i`

in`v)`

###### NOTE

def`difference_quotient(f,`

`x,`

`h):`

return`(`

`f(x`

`+`

`h)`

`-`

`f(x))`

`/`

`h`

def`square(x):`

return`x`

`*`

`x`

def`derivative(x):`

return`2`

`*`

`x`

`derivative_estimate`

`=`

`partial(difference_quotient,`

`square,`

`h=0.00001)`

*# plot to show they're basically the same*

import

matplotlib.pyplot

as

plt

`x`

`=`

`range(-10,10)`

`plt.title("Actual Derivatives vs. Estimates")`

`plt.plot(x,`

`map(derivative,`

`x),`

`'rx',`

`label='Actual')`

# red x

`plt.plot(x,`

`map(derivative_estimate,`

`x),`

`'b+',`

`label='Estimate')`

# blue +

`plt.legend(loc=9)`

`plt.show()`

def`partial_difference_quotient(f,`

`v,`

`i,`

`h):`

` `*"""compute the ith partial difference quotient of f at v"""*

`w`

`=`

`[`

`v_j`

`+`

`(`

`h`

if`j`

`==`

`i`

else`0)`

# add h to just the ith element of v

for`j,`

`v_j`

in`enumerate(v)]`

return`(`

`f(w)`

`-`

`f(v))`

`/`

`h`

def`estimate_gradient(f,`

`v,`

`h=0.00001):`

return`[`

`partial_difference_quotient(f,`

`v,`

`i,`

`h)`

for`i,`

`_`

in`enumerate(v)]`

###### NOTE

def`step(v,`

`direction,`

`step_size):`

` `*"""move step_size in the direction from v"""*

return`[`

`v_i`

`+`

`step_size`

`*`

`direction_i`

for`v_i,`

`direction_i`

in`zip(v,`

`direction)]`

def`sum_of_squares_gradient(v):`

return`[`

`2`

`*`

`v_i`

for`v_i`

in`v]`

*# pick a random starting point*

`v`

`=`

`[`

`random.randint(-10,10)`

for`i`

in`range(3)]`

`tolerance`

`=`

`0.0000001`

while`True:`

`gradient`

`=`

`sum_of_squares_gradient(v)`

# compute the gradient at v

`next_v`

`=`

`step(v,`

`gradient,`

`-`

`0.01)`

# take a negative gradient step

if`distance(next_v,`

`v)`

`<`

`tolerance:`

# stop if we're converging

` `**break**

`v`

`=`

`next_v`

# continue if we're not

# Choosing the Right Step Size

§ Using a fixed step size

§ Gradually shrinking the step size over time

§ At each step, choosing the step size that minimizes the value of the objective function

`step_sizes`

`=`

`[`

`100,`

`10,`

`1,`

`0.1,`

`0.01,`

`0.001,`

`0.0001,`

`0.00001]`

def`safe(f):`

` `*"""return a new function that's the same as f,*

* except that it outputs infinity whenever f produces an error"""*

def`safe_f(*args,`

`**`

`kwargs):`

` `**try**:

return`f(*args,`

`**`

`kwargs)`

` `**except**:

return`float('inf')`

# this means "infinity" in Python

return`safe_f`

# Putting It All Together

def`minimize_batch(target_fn,`

`gradient_fn,`

`theta_0,`

`tolerance=0.000001):`

` `*"""use gradient descent to find theta that minimizes target function"""*

`step_sizes`

`=`

`[`

`100,`

`10,`

`1,`

`0.1,`

`0.01,`

`0.001,`

`0.0001,`

`0.00001]`

`theta`

`=`

`theta_0`

# set theta to initial value

`target_fn`

`=`

`safe(target_fn)`

# safe version of target_fn

`value`

`=`

`target_fn(theta)`

# value we're minimizing

while`True:`

`gradient`

`=`

`gradient_fn(theta)`

`next_thetas`

`=`

`[`

`step(theta,`

`gradient,`

`-`

`step_size)`

for`step_size`

in`step_sizes]`

` `*# choose the one that minimizes the error function*

`next_theta`

`=`

`min(next_thetas,`

`key=target_fn)`

`next_value`

`=`

`target_fn(next_theta)`

` `*# stop if we're "converging"*

if`abs(value`

`-`

`next_value)`

`<`

`tolerance:`

return`theta`

` `**else**:

`theta,`

`value`

`=`

`next_theta,`

`next_value`

def`negate(f):`

` `*"""return a function that for any input x returns -f(x)"""*

return

lambda`*`

`args,`

`**`

`kwargs:`

`-`

`f(*args,`

`**`

`kwargs)`

def`negate_all(f):`

` `*"""the same when f returns a list of numbers"""*

return

lambda`*`

`args,`

`**`

`kwargs:`

`[-`

`y`

for`y`

in`f(*args,`

`**`

`kwargs)]`

def`maximize_batch(target_fn,`

`gradient_fn,`

`theta_0,`

`tolerance=0.000001):`

return`minimize_batch(negate(target_fn),`

` ``negate_all(gradient_fn),`

` ``theta_0,`

` ``tolerance)`

# Stochastic Gradient Descent

def`in_random_order(data):`

` `*"""generator that returns the elements of data in random order"""*

`indexes`

`=`

`[`

`i`

for`i,`

`_`

in`enumerate(data)]`

# create a list of indexes

`random.shuffle(indexes)`

# shuffle them

for`i`

in`indexes:`

# return the data in that order

yield`data[i]`

def`minimize_stochastic(target_fn,`

`gradient_fn,`

`x,`

`y,`

`theta_0,`

`alpha_0=0.01):`

`data`

`=`

`zip(x,`

`y)`

`theta`

`=`

`theta_0`

# initial guess

`alpha`

`=`

`alpha_0`

# initial step size

`min_theta,`

`min_value`

`=`

`None,`

`float("inf")`

# the minimum so far

`iterations_with_no_improvement`

`=`

`0`

` `*# if we ever go 100 iterations with no improvement, stop*

while`iterations_with_no_improvement`

`<`

`100:`

`value`

`=`

`sum(`

`target_fn(x_i,`

`y_i,`

`theta)`

for`x_i,`

`y_i`

in`data`

`)`

if`value`

`<`

`min_value:`

` `*# if we've found a new minimum, remember it*

` `*# and go back to the original step size*

`min_theta,`

`min_value`

`=`

`theta,`

`value`

`iterations_with_no_improvement`

`=`

`0`

`alpha`

`=`

`alpha_0`

` `**else**:

` `*# otherwise we're not improving, so try shrinking the step size*

`iterations_with_no_improvement`

`+=`

`1`

`alpha`

`*=`

`0.9`

` `*# and take a gradient step for each of the data points*

for`x_i,`

`y_i`

in`in_random_order(data):`

`gradient_i`

`=`

`gradient_fn(x_i,`

`y_i,`

`theta)`

`theta`

`=`

`vector_subtract(theta,`

`scalar_multiply(alpha,`

`gradient_i))`

return`min_theta`

def`maximize_stochastic(target_fn,`

`gradient_fn,`

`x,`

`y,`

`theta_0,`

`alpha_0=0.01):`

return`minimize_stochastic(negate(target_fn),`

` ``negate_all(gradient_fn),`

`x,`

`y,`

`theta_0,`

`alpha_0)`

# For Further Exploration

§ Keep reading! We’ll be using gradient descent to solve problems throughout the rest of the book.

§ At this point, you’re undoubtedly sick of me recommending that you read textbooks. If it’s any consolation, Active Calculus seems nicer than the calculus textbooks I learned from.

§ scikit-learn has a Stochastic Gradient Descent module that is not as general as ours in some ways and more general in other ways. Really, though, in most real-world situations you’ll be using libraries in which the optimization is already taken care of behind the scenes, and you won’t have to worry about it yourself (other than when it doesn’t work correctly, which one day, inevitably, it won’t).