Data Science from Scratch: First Principles with Python (2015)
Chapter 22. Recommender Systems
O nature, nature, why art thou so dishonest, as ever to send men with these false recommendations into the world!
Henry Fielding
# Sample data: one list of interest names per user. A user's id is simply
# that user's position in this list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],  # 0
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],                  # 1
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],   # 2
    ["R", "Python", "statistics", "regression", "probability"],              # 3
    ["machine learning", "regression", "decision trees", "libsvm"],          # 4
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],      # 5
    ["statistics", "probability", "mathematics", "theory"],                  # 6
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],       # 7
    ["neural networks", "deep learning", "Big Data",
     "artificial intelligence"],                                             # 8
    ["Hadoop", "Java", "MapReduce", "Big Data"],                             # 9
    ["statistics", "R", "statsmodels"],                                      # 10
    ["C++", "deep learning", "artificial intelligence", "probability"],      # 11
    ["pandas", "R", "Python"],                                               # 12
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],                  # 13
    ["libsvm", "regression", "support vector machines"],                     # 14
]
Manual Curation
Recommending What’s Popular
# Tally how many users list each interest, then keep the
# (interest, count) pairs ordered from most to least common.
_interest_counts = Counter()
for user_interests in users_interests:
    _interest_counts.update(user_interests)
popular_interests = _interest_counts.most_common()
[('Python',
4),
(
'R',
4),
(
'Java',
3),
(
'regression',
3),
(
'statistics',
3),
(
'probability',
3),
# ...
]
def most_popular_new_interests(user_interests, max_results=5):
    """Suggest popular interests that are not already in ``user_interests``.

    Walks the module-level ``popular_interests`` list in its existing order,
    skips every interest found in ``user_interests``, and returns the first
    ``max_results`` remaining ``(interest, frequency)`` pairs.
    """
    unseen = []
    for interest, frequency in popular_interests:
        if interest in user_interests:
            continue  # the user already has this one
        unseen.append((interest, frequency))
    return unseen[:max_results]
["NoSQL",
"MongoDB",
"Cassandra",
"HBase",
"Postgres"]
# Example: the five most popular interests that user 1 does not already have.
# The line below the call is the output recorded alongside it in the source.
most_popular_new_interests(users_interests[1], 5)
# [('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3)]
[('Java',
3),
(
'HBase',
3),
(
'Big Data',
3),
(
'neural networks',
2),
(
'Hadoop',
2)]
User-Based Collaborative Filtering
def cosine_similarity(v, w):
    """Return ``dot(v, w)`` divided by ``sqrt(dot(v, v) * dot(w, w))``.

    ``dot`` is defined outside this section (presumably the vector dot
    product -- confirm against its definition). There is no guard against a
    zero denominator.
    """
    numerator = dot(v, w)
    denominator = math.sqrt(dot(v, v) * dot(w, w))
    return numerator / denominator
# Every distinct interest mentioned by any user, in sorted order. An
# interest's position in this list is used as its interest id.
unique_interests = sorted({interest
                           for user_interests in users_interests
                           for interest in user_interests})
['Big Data',
'C++',
'Cassandra',
'HBase',
'Hadoop',
'Haskell',
# ...
]
def make_user_interest_vector(user_interests):
    """Encode a list of interests as a 0/1 vector over ``unique_interests``.

    Element ``i`` of the result is 1 when ``unique_interests[i]`` appears in
    ``user_interests`` and 0 otherwise.
    """
    vector = []
    for candidate in unique_interests:
        vector.append(1 if candidate in user_interests else 0)
    return vector
# One 0/1 interest vector per user; row index == user id.
# Materialise the result as a list: on Python 3 a bare map() is a one-shot
# iterator, but this matrix is iterated several times and indexed by user id.
user_interest_matrix = list(map(make_user_interest_vector, users_interests))
# Pairwise user similarity: user_similarities[i][j] is the cosine
# similarity between the interest vectors of users i and j.
user_similarities = [
    [cosine_similarity(row, other_row) for other_row in user_interest_matrix]
    for row in user_interest_matrix
]
def most_similar_users_to(user_id):
    """Return the other users ranked by similarity to ``user_id``.

    Reads row ``user_id`` of the module-level ``user_similarities`` matrix and
    returns a list of ``(other_user_id, similarity)`` pairs sorted with the
    most similar user first. The user itself and any user whose similarity
    is not greater than zero are left out.
    """
    # find other users with nonzero similarity
    pairs = [(other_user_id, similarity)
             for other_user_id, similarity
             in enumerate(user_similarities[user_id])
             if user_id != other_user_id and similarity > 0]

    # sort them, most similar first. The key indexes the pair rather than
    # unpacking it in the lambda signature, because tuple parameters are a
    # syntax error on Python 3 (PEP 3113).
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
[(9,
0.5669467095138409),
(
1,
0.3380617018914066),
(
8,
0.1889822365046136),
(
13,
0.1690308509457033),
(
5,
0.1543033499620919)]
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for ``user_id`` based on what similar users like.

    For every user returned by ``most_similar_users_to(user_id)``, that
    user's similarity is added to the running weight of each of their
    interests. The result is a list of ``(interest, weight)`` pairs sorted by
    weight, highest first.

    When ``include_current_interests`` is false (the default), interests the
    user already has in ``users_interests[user_id]`` are filtered out.
    """
    # sum up the similarities
    weights = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    # convert them to a sorted list. The key indexes the pair rather than
    # unpacking it in the lambda signature, because tuple parameters are a
    # syntax error on Python 3 (PEP 3113).
    suggestions = sorted(weights.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions

    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in suggestions
            if suggestion not in current_interests]
[('MapReduce',
0.5669467095138409),
(
'MongoDB',
0.50709255283711),
(
'Postgres',
0.50709255283711),
(
'NoSQL',
0.3380617018914066),
(
'neural networks',
0.1889822365046136),
(
'deep learning',
0.1889822365046136),
(
'artificial intelligence',
0.1889822365046136),
#...
]
# Transpose of user_interest_matrix: row j holds, for each user in order,
# that user's 0/1 entry for unique_interests[j].
interest_user_matrix = [
    [user_row[interest_index] for user_row in user_interest_matrix]
    for interest_index, _ in enumerate(unique_interests)
]
[1,
0,
0,
0,
0,
0,
0,
0,
1,
1,
0,
0,
0,
0,
0]
# Pairwise interest similarity: interest_similarities[i][j] is the cosine
# similarity between rows i and j of interest_user_matrix.
interest_similarities = [
    [cosine_similarity(row, other_row) for other_row in interest_user_matrix]
    for row in interest_user_matrix
]
def most_similar_interests_to(interest_id):
    """Return the interests most similar to the one at ``interest_id``.

    Reads row ``interest_id`` of the module-level ``interest_similarities``
    matrix and returns ``(interest_name, similarity)`` pairs, where the name
    is looked up in ``unique_interests``. The interest itself and any
    interest whose similarity is not greater than zero are left out. Pairs
    are sorted with the most similar interest first.
    """
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]

    # The key indexes the pair rather than unpacking it in the lambda
    # signature, because tuple parameters are a syntax error on Python 3
    # (PEP 3113).
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
[('Hadoop',
0.8164965809277261),
(
'Java',
0.6666666666666666),
(
'MapReduce',
0.5773502691896258),
(
'Spark',
0.5773502691896258),
(
'Storm',
0.5773502691896258),
(
'Cassandra',
0.4082482904638631),
(
'artificial intelligence',
0.4082482904638631),
(
'deep learning',
0.4082482904638631),
(
'neural networks',
0.4082482904638631),
(
'HBase',
0.3333333333333333)]
def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for ``user_id`` based on interest-to-interest similarity.

    For each interest flagged with 1 in ``user_interest_matrix[user_id]``, the
    similarities returned by ``most_similar_interests_to`` are added to the
    running weight of the corresponding interests. The result is a list of
    ``(interest, weight)`` pairs sorted by weight, highest first.

    When ``include_current_interests`` is false (the default), interests the
    user already has in ``users_interests[user_id]`` are filtered out.
    """
    # add up the similar interests
    weights = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                weights[interest] += similarity

    # sort them by weight. The key indexes the pair rather than unpacking it
    # in the lambda signature, because tuple parameters are a syntax error on
    # Python 3 (PEP 3113).
    suggestions = sorted(weights.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    if include_current_interests:
        return suggestions

    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in suggestions
            if suggestion not in current_interests]
[('MapReduce',
1.861807319565799),
(
'Postgres',
1.3164965809277263),
(
'MongoDB',
1.3164965809277263),
(
'NoSQL',
1.2844570503761732),
(
'programming languages',
0.5773502691896258),
(
'MySQL',
0.5773502691896258),
(
'Haskell',
0.5773502691896258),
(
'databases',
0.5773502691896258),
(
'neural networks',
0.4082482904638631),
(
'deep learning',
0.4082482904638631),
(
'C++',
0.4082482904638631),
(
'artificial intelligence',
0.4082482904638631),
(
'Python',
0.2886751345948129),
(
'R',
0.2886751345948129)]
For Further Exploration
- Crab is a framework for building recommender systems in Python.
- Graphlab also has a recommender toolkit.
- The Netflix Prize was a somewhat famous competition to build a better system to recommend movies to Netflix users.