Data Science from Scratch: First Principles with Python (2015)
Chapter 19. Clustering
Where we such clusters had
As made us nobly wild, not mad
Robert Herrick
The Idea
The Model
One of the simplest clustering methods is k-means, in which the number of clusters k is chosen in advance, after which the goal is to partition the inputs into clusters in a way that minimizes the total sum of squared distances from each point to the mean of its assigned cluster. Finding an exact optimum is hard, so in practice we use an iterative algorithm that usually finds a good clustering:
1. Start with a set of k-means, which are points in d-dimensional space.
2. Assign each point to the mean to which it is closest.
3. If no point’s assignment has changed, stop and keep the clusters.
4. If some point’s assignment has changed, recompute the means and return to step 2.
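For example (with made-up one-dimensional points, not data from the book): given inputs 1, 2, 9, and 10 with k = 2 and initial means 1 and 2, the first assignment puts 1 with the first mean and 2, 9, and 10 with the second, so the recomputed means are 1 and 7. The next pass assigns 1 and 2 to the first mean and 9 and 10 to the second, giving means 1.5 and 9.5, after which no assignment changes and the algorithm stops.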
import random

# squared_distance and vector_mean come from the book's earlier linear algebra chapter

class KMeans:
    """performs k-means clustering"""

    def __init__(self, k):
        self.k = k          # number of clusters
        self.means = None   # means of clusters

    def classify(self, input):
        """return the index of the cluster closest to the input"""
        return min(range(self.k),
                   key=lambda i: squared_distance(input, self.means[i]))

    def train(self, inputs):
        # choose k random points as the initial means
        self.means = random.sample(inputs, self.k)
        assignments = None

        while True:
            # Find new assignments
            new_assignments = map(self.classify, inputs)

            # If no assignments have changed, we're done.
            if assignments == new_assignments:
                return

            # Otherwise keep the new assignments,
            assignments = new_assignments

            # And compute new means based on the new assignments
            for i in range(self.k):
                # find all the points assigned to cluster i
                i_points = [p for p, a in zip(inputs, assignments) if a == i]

                # make sure i_points is not empty so don't divide by 0
                if i_points:
                    self.means[i] = vector_mean(i_points)
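The meetup example that follows assumes an inputs list of [x, y] user locations (not reproduced in this excerpt). As a minimal sketch on made-up two-dimensional points, usage looks like this:

some_points = [[1, 1], [2, 1], [1, 2],        # made-up data, not the book's
               [10, 10], [11, 10], [10, 11]]

random.seed(0)
clusterer = KMeans(2)
clusterer.train(some_points)
clusterer.means              # one mean should land near each clump
clusterer.classify([0, 0])   # index of whichever mean is closer to the origin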
Example: Meetups
random.seed(0)           # so you get the same results as me
clusterer = KMeans(3)
clusterer.train(inputs)
clusterer.means
Figure 19-1. The locations of your hometown users
random.seed(0)
clusterer = KMeans(2)
clusterer.train(inputs)
clusterer.means
Figure 19-2. User locations grouped into three clusters
Figure 19-3. User locations grouped into two clusters
Choosing k
One way to choose k is to plot the sum of squared errors (between each point and the mean of its cluster) as a function of k and look at where the graph "bends":

def squared_clustering_errors(inputs, k):
    """finds the total squared error from k-means clustering the inputs"""
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means
    assignments = map(clusterer.classify, inputs)

    return sum(squared_distance(input, means[cluster])
               for input, cluster in zip(inputs, assignments))
# now plot from 1 up to len(inputs) clusters
ks = range(1, len(inputs) + 1)
errors = [squared_clustering_errors(inputs, k) for k in ks]

plt.plot(ks, errors)
plt.xticks(ks)
plt.xlabel("k")
plt.ylabel("total squared error")
plt.title("Total Error vs. # of Clusters")
plt.show()
Figure 19-4. Choosing a k
Example: Clustering Colors
We can reduce an image to only five colors by:
1. Choosing five colors
2. Assigning one of those colors to each pixel
path_to_png_file = r"C:\images\image.png"   # wherever your image is

import matplotlib.image as mpimg
img = mpimg.imread(path_to_png_file)

top_row = img[0]
top_left_pixel = top_row[0]
red, green, blue = top_left_pixel

pixels = [pixel for row in img for pixel in row]

clusterer = KMeans(5)
clusterer.train(pixels)   # this might take a while
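Training on every pixel of a large image can be slow. One option (not from the book) is to train the clusterer on a random sample of the pixels and then use its means to recolor the full image:

sample_of_pixels = random.sample(pixels, min(len(pixels), 10000))   # subsample to speed things up
clusterer = KMeans(5)
clusterer.train(sample_of_pixels)   # much faster than training on every pixel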
def recolor(pixel):
    cluster = clusterer.classify(pixel)   # index of the closest cluster
    return clusterer.means[cluster]       # mean of the closest cluster

new_img = [[recolor(pixel) for pixel in row]   # recolor this row of pixels
           for row in img]                     # for each row in the image

plt.imshow(new_img)
plt.axis('off')
plt.show()
Figure 19-5. Original picture and its 5-means decoloring
Bottom-up Hierarchical Clustering
An alternative approach is to "grow" clusters from the bottom up:
1. Make each input its own cluster of one.
2. As long as there are multiple clusters remaining, find the two closest clusters and merge them.
At the end we'll have one giant cluster containing all the inputs; if we keep track of the merge order, we can recreate any number of clusters by unmerging. We'll represent a leaf cluster as a 1-tuple containing its value, and a merged cluster as a 2-tuple of (merge order, children):
leaf1 = ([10, 20],)    # to make a 1-tuple you need the trailing comma
leaf2 = ([30, -15],)   # otherwise Python treats the parentheses as parentheses

merged = (1, [leaf1, leaf2])
def is_leaf(cluster):
    """a cluster is a leaf if it has length 1"""
    return len(cluster) == 1

def get_children(cluster):
    """returns the two children of this cluster if it's a merged cluster;
    raises an exception if this is a leaf cluster"""
    if is_leaf(cluster):
        raise TypeError("a leaf cluster has no children")
    else:
        return cluster[1]
def get_values(cluster):
    """returns the value in this cluster (if it's a leaf cluster)
    or all the values in the leaf clusters below it (if it's not)"""
    if is_leaf(cluster):
        return cluster   # is already a 1-tuple containing value
    else:
        return [value
                for child in get_children(cluster)
                for value in get_values(child)]
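For instance, applied to the clusters built above:

is_leaf(leaf1)        # True
is_leaf(merged)       # False
get_children(merged)  # [leaf1, leaf2]
get_values(merged)    # [[10, 20], [30, -15]]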
def cluster_distance(cluster1, cluster2, distance_agg=min):
    """compute all the pairwise distances between cluster1 and cluster2
    and apply _distance_agg_ to the resulting list"""
    return distance_agg([distance(input1, input2)
                         for input1 in get_values(cluster1)
                         for input2 in get_values(cluster2)])
def get_merge_order(cluster):
    if is_leaf(cluster):
        return float('inf')
    else:
        return cluster[0]   # merge_order is first element of 2-tuple
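For the same clusters as before, a leaf reports an infinite merge order (it was never merged) while the merged 2-tuple reports the order it was built with:

get_merge_order(leaf1)    # float('inf')
get_merge_order(merged)   # 1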
def bottom_up_cluster(inputs, distance_agg=min):
    # start with every input a leaf cluster / 1-tuple
    clusters = [(input,) for input in inputs]

    # as long as we have more than one cluster left...
    while len(clusters) > 1:
        # find the two closest clusters
        c1, c2 = min([(cluster1, cluster2)
                      for i, cluster1 in enumerate(clusters)
                      for cluster2 in clusters[:i]],
                     key=lambda (x, y): cluster_distance(x, y, distance_agg))

        # remove them from the list of clusters
        clusters = [c for c in clusters if c != c1 and c != c2]

        # merge them, using merge_order = # of clusters left
        merged_cluster = (len(clusters), [c1, c2])

        # and add their merge
        clusters.append(merged_cluster)

    # when there's only one cluster left, return it
    return clusters[0]
base_cluster = bottom_up_cluster(inputs)

The result is a deeply nested cluster whose structure looks like this:
(0, [(1, [(3, [(14, [(18, [([19, 28],),
([21, 27],)]),
([20, 23],)]),
([26, 13],)]),
(16, [([11, 15],),
([13, 13],)])]),
(2, [(4, [(5, [(9, [(11, [([-49, 0],),
([-46, 5],)]),
([-41, 8],)]),
([-49, 15],)]),
([-34, -1],)]),
(6, [(7, [(8, [(10, [([-22, -16],),
([-19, -11],)]),
([-25, -9],)]),
(13, [(15, [(17, [([-11, -6],),
([-12, -8],)]),
([-14, -5],)]),
([-18, -3],)])]),
(12, [([-13, -19],),
([-9, -16],)])])])])
In this representation:
§ Cluster 0 is the merger of cluster 1 and cluster 2.
§ Cluster 1 is the merger of cluster 3 and cluster 16.
§ Cluster 16 is the merger of the leaf [11, 15] and the leaf [13, 13].
§ And so on…
def generate_clusters(base_cluster, num_clusters):
    # start with a list with just the base cluster
    clusters = [base_cluster]

    # as long as we don't have enough clusters yet...
    while len(clusters) < num_clusters:
        # choose the last-merged of our clusters
        next_cluster = min(clusters, key=get_merge_order)
        # remove it from the list
        clusters = [c for c in clusters if c != next_cluster]
        # and add its children to the list (i.e., unmerge it)
        clusters.extend(get_children(next_cluster))

    # once we have enough clusters...
    return clusters
three_clusters = [get_values(cluster)
                  for cluster in generate_clusters(base_cluster, 3)]

for i, cluster, marker, color in zip([1, 2, 3],
                                     three_clusters,
                                     ['D', 'o', '*'],
                                     ['r', 'g', 'b']):
    xs, ys = zip(*cluster)   # magic unzipping trick
    plt.scatter(xs, ys, color=color, marker=marker)

    # put a number at the mean of the cluster
    x, y = vector_mean(cluster)
    plt.plot(x, y, marker='$' + str(i) + '$', color='black')

plt.title("User Locations -- 3 Bottom-Up Clusters, Min")
plt.xlabel("blocks east of city center")
plt.ylabel("blocks north of city center")
plt.show()
Figure 19-6. Three bottom-up clusters using min distance
NOTE
Figure 19-6 uses min as the distance_agg, which tends to chain nearby clusters together. Using max instead (which treats two clusters as only as close as their two farthest points) produces tighter clusters, shown in Figure 19-7.
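To produce the max-based clusters in Figure 19-7, you can pass max as the distance aggregate (the variable names here are illustrative):

base_cluster_max = bottom_up_cluster(inputs, max)
three_clusters_max = [get_values(cluster)
                      for cluster in generate_clusters(base_cluster_max, 3)]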
Figure 19-7. Three bottom-up clusters using max distance
For Further Exploration
§ scikit-learn has an entire module, sklearn.cluster, that contains several clustering algorithms, including KMeans and the Ward hierarchical clustering algorithm (which uses a different criterion for merging clusters than ours did).
§ SciPy has two clustering modules: scipy.cluster.vq (which does k-means) and scipy.cluster.hierarchy (which has a variety of hierarchical clustering algorithms).
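For comparison, a minimal sketch of k-means with scikit-learn, assuming the same kind of list of [x, y] points (the alias avoids clobbering the KMeans class defined in this chapter):

from sklearn.cluster import KMeans as SKKMeans

sk_clusterer = SKKMeans(n_clusters=3).fit(inputs)
sk_clusterer.cluster_centers_   # analogous to clusterer.means
sk_clusterer.labels_            # cluster index for each input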