
# Gradient Descent

Those who boast of their descent, brag on what they owe to others.

Seneca

# The Idea Behind Gradient Descent

Suppose we have some function, like `sum_of_squares`, whose value we'd like to minimize (or maximize):

```python
def sum_of_squares(v):
    """computes the sum of squared elements in v"""
    return sum(v_i ** 2 for v_i in v)
```
The derivative of a one-variable function `f` at `x` can be estimated with a difference quotient, which we can check against the exact answer for `square(x) = x * x`, whose derivative is `2 * x`:

```python
from functools import partial

def difference_quotient(f, x, h):
    return (f(x + h) - f(x)) / h

def square(x):
    return x * x

def derivative(x):
    return 2 * x

derivative_estimate = partial(difference_quotient, square, h=0.00001)

# plot to show they're basically the same
import matplotlib.pyplot as plt
x = range(-10, 10)
plt.title("Actual Derivatives vs. Estimates")
plt.plot(x, list(map(derivative, x)), 'rx', label='Actual')             # red  x
plt.plot(x, list(map(derivative_estimate, x)), 'b+', label='Estimate')  # blue +
plt.legend(loc=9)
plt.show()
```
```python
def partial_difference_quotient(f, v, i, h):
    """compute the ith partial difference quotient of f at v"""
    w = [v_j + (h if j == i else 0)    # add h to just the ith element of v
         for j, v_j in enumerate(v)]

    return (f(w) - f(v)) / h

def estimate_gradient(f, v, h=0.00001):
    return [partial_difference_quotient(f, v, i, h)
            for i, _ in enumerate(v)]
```
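As a quick sanity check (not part of the original listing), the estimated gradient of `sum_of_squares` should come out close to the exact gradient, which is `2 * v_i` in each coordinate:

```python
# sanity check: estimated vs. exact gradient of sum_of_squares at [1, 2, 3]
print(estimate_gradient(sum_of_squares, [1, 2, 3]))   # approximately [2, 4, 6]
print([2 * v_i for v_i in [1, 2, 3]])                 # exactly [2, 4, 6]
```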
To minimize `sum_of_squares`, we can pick a random starting point and repeatedly take small steps in the direction opposite the gradient until the steps barely change anything (here `distance` is the Euclidean distance function from the book's linear algebra chapter; a stand-in sketch follows the listing):

```python
import random

def step(v, direction, step_size):
    """move step_size in the direction from v"""
    return [v_i + step_size * direction_i
            for v_i, direction_i in zip(v, direction)]

def sum_of_squares_gradient(v):
    return [2 * v_i for v_i in v]

# pick a random starting point
v = [random.randint(-10, 10) for i in range(3)]

tolerance = 0.0000001

while True:
    gradient = sum_of_squares_gradient(v)   # compute the gradient at v
    next_v = step(v, gradient, -0.01)       # take a negative gradient step
    if distance(next_v, v) < tolerance:     # stop if we're converging
        break
    v = next_v                              # continue if we're not
```
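The loop above relies on `distance`, and the stochastic version later in the chapter relies on `vector_subtract` and `scalar_multiply`, all from the book's linear algebra chapter. If you don't have that code handy, minimal stand-ins (a sketch, not the book's exact listings) look like this:

```python
import math

def distance(v, w):
    """Euclidean distance between vectors v and w"""
    return math.sqrt(sum((v_i - w_i) ** 2 for v_i, w_i in zip(v, w)))

def scalar_multiply(c, v):
    """multiply every element of v by the scalar c"""
    return [c * v_i for v_i in v]

def vector_subtract(v, w):
    """element-wise difference of two vectors"""
    return [v_i - w_i for v_i, w_i in zip(v, w)]
```

With `distance` in place, the loop drives `v` very close to `[0, 0, 0]`, which is where `sum_of_squares` is smallest.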

# Choosing the Right Step Size

There are several options:

- Using a fixed step size
- Gradually shrinking the step size over time
- At each step, choosing the step size that minimizes the value of the objective function

In what follows we'll simply try a handful of step sizes and keep whichever one yields the smallest value of the objective function. Some step sizes may produce invalid inputs for the function, so we also need a "safe apply" wrapper that returns infinity whenever the wrapped function raises an error:

```python
step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

def safe(f):
    """return a new function that's the same as f,
    except that it outputs infinity whenever f produces an error"""
    def safe_f(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except:
            return float('inf')          # this means "infinity" in Python
    return safe_f
```
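For example (a quick illustration, not from the original text):

```python
# without safe, calling this at 0 would raise ZeroDivisionError; with it we just get infinity
reciprocal = safe(lambda x: 1.0 / x)
print(reciprocal(2))   # 0.5
print(reciprocal(0))   # inf
```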

# Putting It All Together

```python
def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    """use gradient descent to find theta that minimizes target function"""

    step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

    theta = theta_0                           # set theta to initial value
    target_fn = safe(target_fn)               # safe version of target_fn
    value = target_fn(theta)                  # value we're minimizing

    while True:
        gradient = gradient_fn(theta)
        next_thetas = [step(theta, gradient, -step_size)
                       for step_size in step_sizes]

        # choose the one that minimizes the error function
        next_theta = min(next_thetas, key=target_fn)
        next_value = target_fn(next_theta)

        # stop if we're "converging"
        if abs(value - next_value) < tolerance:
            return theta
        else:
            theta, value = next_theta, next_value
```
Sometimes we'll want to maximize a function instead, which we can do by minimizing its negative (and negating its gradient):

```python
def negate(f):
    """return a function that for any input x returns -f(x)"""
    return lambda *args, **kwargs: -f(*args, **kwargs)

def negate_all(f):
    """the same when f returns a list of numbers"""
    return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)]

def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    return minimize_batch(negate(target_fn),
                          negate_all(gradient_fn),
                          theta_0,
                          tolerance)
```
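As a quick demonstration (my usage sketch, not a listing from the book), minimizing `sum_of_squares` with the batch minimizer should drive `theta` close to the zero vector:

```python
# hypothetical usage: minimize sum_of_squares starting from a random point
theta_0 = [random.randint(-10, 10) for _ in range(3)]
theta = minimize_batch(sum_of_squares, sum_of_squares_gradient, theta_0)
print(theta)   # every coordinate should be close to 0
```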

# Stochastic Gradient Descent

In stochastic gradient descent we compute the gradient (and take a step) for one data point at a time, cycling through the data in a random order:

```python
def in_random_order(data):
    """generator that returns the elements of data in random order"""
    indexes = [i for i, _ in enumerate(data)]  # create a list of indexes
    random.shuffle(indexes)                    # shuffle them
    for i in indexes:                          # return the data in that order
        yield data[i]
```
```python
def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):

    data = list(zip(x, y))                      # materialize pairs so we can loop repeatedly
    theta = theta_0                             # initial guess
    alpha = alpha_0                             # initial step size
    min_theta, min_value = None, float("inf")   # the minimum so far
    iterations_with_no_improvement = 0

    # if we ever go 100 iterations with no improvement, stop
    while iterations_with_no_improvement < 100:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)

        if value < min_value:
            # if we've found a new minimum, remember it
            # and go back to the original step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            # otherwise we're not improving, so try shrinking the step size
            iterations_with_no_improvement += 1
            alpha *= 0.9

        # and take a gradient step for each of the data points
        # (vector_subtract and scalar_multiply come from the linear algebra chapter)
        for x_i, y_i in in_random_order(data):
            gradient_i = gradient_fn(x_i, y_i, theta)
            theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i))

    return min_theta

def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
    return minimize_stochastic(negate(target_fn),
                               negate_all(gradient_fn),
                               x, y, theta_0, alpha_0)
```
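To see the stochastic version in action, here is an illustrative sketch (not from the book's listing) that fits a line `y = alpha + beta * x` by minimizing the per-point squared error; it assumes the `vector_subtract` and `scalar_multiply` helpers mentioned earlier:

```python
# hypothetical example: recover the intercept and slope of y = 3 + 2x
def squared_error(x_i, y_i, theta):
    alpha, beta = theta
    return (y_i - (alpha + beta * x_i)) ** 2

def squared_error_gradient(x_i, y_i, theta):
    alpha, beta = theta
    error = y_i - (alpha + beta * x_i)
    return [-2 * error,          # partial derivative with respect to alpha
            -2 * error * x_i]    # partial derivative with respect to beta

x = list(range(10))
y = [3 + 2 * x_i for x_i in x]   # points lying exactly on the line y = 3 + 2x
theta = minimize_stochastic(squared_error, squared_error_gradient,
                            x, y, [random.random(), random.random()])
print(theta)                     # should end up close to [3, 2]
```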

# For Further Exploration

- Keep reading! We'll be using gradient descent to solve problems throughout the rest of the book.

- At this point, you're undoubtedly sick of me recommending that you read textbooks. If it's any consolation, Active Calculus seems nicer than the calculus textbooks I learned from.

- scikit-learn has a Stochastic Gradient Descent module that is not as general as ours in some ways and more general in other ways. Really, though, in most real-world situations you'll be using libraries in which the optimization is already taken care of behind the scenes, and you won't have to worry about it yourself (other than when it doesn't work correctly, which one day, inevitably, it won't).
