# Data Science from Scratch: First Principles with Python (2015)

### Chapter 22. Recommender Systems

O nature, nature, why art thou so dishonest, as ever to send men with these false recommendations into the world!

Henry Fielding

# Interests of every user; a user's id is their index in this list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

# Manual Curation

# Recommending What’s Popular

# (interest, count) tuples for every interest, where count is the number of
# times the interest appears across all users' lists; highest count first.
popular_interests = Counter(
    interest
    for user_interests in users_interests
    for interest in user_interests
).most_common()

# popular_interests begins (order among equal counts may differ):
# [('Python', 4),
#  ('R', 4),
#  ('Java', 3),
#  ('regression', 3),
#  ('statistics', 3),
#  ('probability', 3),
#  # ...
# ]

def most_popular_new_interests(user_interests, max_results=5):
    """Suggest the most popular interests the user does not already have.

    Args:
        user_interests: the interests the user already has.
        max_results: maximum number of suggestions to return.

    Returns:
        A list of at most ``max_results`` ``(interest, frequency)`` tuples,
        in the same order as the module-level ``popular_interests``.
    """
    # Build the lookup once so each membership test is O(1).
    already_has = set(user_interests)
    suggestions = [
        (interest, frequency)
        for interest, frequency in popular_interests
        if interest not in already_has
    ]
    return suggestions[:max_results]

# users_interests[1]:
# ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]

# Example: popularity-based suggestions for user 1; the output shown in the
# source follows as a comment.
most_popular_new_interests(users_interests[1], 5)
# [('Python', 4), ('R', 4), ('Java', 3), ('regression', 3), ('statistics', 3)]

# most_popular_new_interests(users_interests[3], 5)
# (order among equal counts may differ):
# [('Java', 3), ('HBase', 3), ('Big Data', 3), ('neural networks', 2), ('Hadoop', 2)]

# User-Based Collaborative Filtering

def cosine_similarity(v, w):
    """Return the cosine of the angle between the vectors ``v`` and ``w``.

    Args:
        v: a sequence of numbers.
        w: a sequence of numbers, paired element-wise with ``v``.

    Returns:
        ``(v . w) / sqrt((v . v) * (w . w))``. If either vector has zero
        magnitude the cosine is undefined, and ``0.0`` ("no similarity") is
        returned instead of raising ``ZeroDivisionError``.
    """
    # The three dot products are computed inline so the function depends
    # only on ``math``.
    v_dot_w = sum(v_i * w_i for v_i, w_i in zip(v, w))
    v_dot_v = sum(v_i * v_i for v_i in v)
    w_dot_w = sum(w_i * w_i for w_i in w)

    magnitude = math.sqrt(v_dot_v * w_dot_w)
    if magnitude == 0:
        # An all-zero vector (e.g. a user with no interests) would otherwise
        # divide by zero and abort the whole similarity matrix.
        return 0.0
    return v_dot_w / magnitude

# Every distinct interest, sorted by plain string comparison. The position of
# an interest in this list is its interest id / vector index.
# sorted() accepts a set directly, so no intermediate list is needed.
unique_interests = sorted({
    interest
    for user_interests in users_interests
    for interest in user_interests
})

# unique_interests begins:
# ['Big Data',
#  'C++',
#  'Cassandra',
#  'HBase',
#  'Hadoop',
#  'Haskell',
#  # ...
# ]

def make_user_interest_vector(user_interests):
    """Given a list of interests, produce a vector whose ith element is 1
    if unique_interests[i] is in the list, 0 otherwise.

    Args:
        user_interests: the interests of a single user.

    Returns:
        A list of 0/1 ints with one entry per element of the module-level
        ``unique_interests``, in the same order.
    """
    # Build the lookup once so each membership test is O(1).
    interested_in = set(user_interests)
    return [1 if interest in interested_in else 0
            for interest in unique_interests]

# One 0/1 interest vector per user: user_interest_matrix[user_id][interest_id].
# Built as a real list (not map()) because it is iterated several times and
# indexed by user id; on Python 3 a map object supports neither.
user_interest_matrix = [
    make_user_interest_vector(user_interests)
    for user_interests in users_interests
]

# Pairwise user similarity: user_similarities[i][j] is the cosine similarity
# between the interest vectors of users i and j.
user_similarities = [
    [cosine_similarity(interest_vector_i, interest_vector_j)
     for interest_vector_j in user_interest_matrix]
    for interest_vector_i in user_interest_matrix
]

def most_similar_users_to(user_id):
    """Return the users most similar to ``user_id``, most similar first.

    Args:
        user_id: index of the user in the module-level ``user_similarities``.

    Returns:
        A list of ``(other_user_id, similarity)`` tuples for every other user
        whose similarity to ``user_id`` is greater than zero, sorted by
        similarity in descending order.
    """
    pairs = [
        (other_user_id, similarity)                     # find other
        for other_user_id, similarity in                # users with
        enumerate(user_similarities[user_id])           # nonzero
        if user_id != other_user_id and similarity > 0  # similarity
    ]

    # Sort most similar first. The key indexes the pair because
    # tuple-unpacking lambda parameters are a syntax error on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)

# most_similar_users_to(0)
# [(9, 0.5669467095138409),
#  (1, 0.3380617018914066),
#  (8, 0.1889822365046136),
#  (13, 0.1690308509457033),
#  (5, 0.1543033499620919)]

def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for a user, weighted by the similarity of the other
    users who have them.

    Args:
        user_id: index of the user in ``users_interests``.
        include_current_interests: when true, interests the user already has
            are kept in the result; by default they are filtered out.

    Returns:
        A list of ``(interest, weight)`` tuples sorted by weight, highest
        first. An interest's weight is the sum of the similarities of every
        similar user who lists it.
    """
    # sum up the similarities
    weights = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    # Convert them to a sorted list. The key indexes the pair because
    # tuple-unpacking lambda parameters are a syntax error on Python 3.
    suggestions = sorted(weights.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions

    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in suggestions
            if suggestion not in current_interests]

# user_based_suggestions(0)
# [('MapReduce', 0.5669467095138409),
#  ('MongoDB', 0.50709255283711),
#  ('Postgres', 0.50709255283711),
#  ('NoSQL', 0.3380617018914066),
#  ('neural networks', 0.1889822365046136),
#  ('deep learning', 0.1889822365046136),
#  ('artificial intelligence', 0.1889822365046136),
#  # ...
# ]

# Transpose of user_interest_matrix: row j is the 0/1 vector, indexed by user
# id, of which users have unique_interests[j].
interest_user_matrix = [
    [user_interest_vector[j] for user_interest_vector in user_interest_matrix]
    for j, _ in enumerate(unique_interests)
]

# interest_user_matrix[0] (the 'Big Data' row; 1 for users 0, 8 and 9):
# [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]

# Pairwise interest similarity: interest_similarities[i][j] is the cosine
# similarity between the user vectors of interests i and j.
interest_similarities = [
    [cosine_similarity(user_vector_i, user_vector_j)
     for user_vector_j in interest_user_matrix]
    for user_vector_i in interest_user_matrix
]

def most_similar_interests_to(interest_id):
    """Return the interests most similar to the given one, most similar first.

    Args:
        interest_id: index of the interest in ``unique_interests``.

    Returns:
        A list of ``(interest_name, similarity)`` tuples for every other
        interest whose similarity is greater than zero, sorted by similarity
        in descending order.
    """
    similarities = interest_similarities[interest_id]
    pairs = [
        (unique_interests[other_interest_id], similarity)
        for other_interest_id, similarity in enumerate(similarities)
        if interest_id != other_interest_id and similarity > 0
    ]
    # The key indexes the pair because tuple-unpacking lambda parameters are
    # a syntax error on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)

# most_similar_interests_to(0)
# [('Hadoop', 0.8164965809277261),
#  ('Java', 0.6666666666666666),
#  ('MapReduce', 0.5773502691896258),
#  ('Spark', 0.5773502691896258),
#  ('Storm', 0.5773502691896258),
#  ('Cassandra', 0.4082482904638631),
#  ('artificial intelligence', 0.4082482904638631),
#  ('deep learning', 0.4082482904638631),
#  ('neural networks', 0.4082482904638631),
#  ('HBase', 0.3333333333333333)]

def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for a user based on interests similar to the ones
    the user already has.

    Args:
        user_id: index of the user in ``user_interest_matrix`` and
            ``users_interests``.
        include_current_interests: when true, interests the user already has
            are kept in the result; by default they are filtered out.

    Returns:
        A list of ``(interest, weight)`` tuples sorted by weight, highest
        first. An interest's weight is the sum of its similarities to each of
        the user's current interests.
    """
    # add up the similar interests
    weights = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                weights[interest] += similarity

    # Sort them by weight. The key indexes the pair because tuple-unpacking
    # lambda parameters are a syntax error on Python 3.
    suggestions = sorted(weights.items(),
                         key=lambda pair: pair[1],
                         reverse=True)

    if include_current_interests:
        return suggestions

    current_interests = users_interests[user_id]
    return [(suggestion, weight)
            for suggestion, weight in suggestions
            if suggestion not in current_interests]

# item_based_suggestions(0)
# [('MapReduce', 1.861807319565799),
#  ('Postgres', 1.3164965809277263),
#  ('MongoDB', 1.3164965809277263),
#  ('NoSQL', 1.2844570503761732),
#  ('programming languages', 0.5773502691896258),
#  ('MySQL', 0.5773502691896258),
#  ('Haskell', 0.5773502691896258),
#  ('databases', 0.5773502691896258),
#  ('neural networks', 0.4082482904638631),
#  ('deep learning', 0.4082482904638631),
#  ('C++', 0.4082482904638631),
#  ('artificial intelligence', 0.4082482904638631),
#  ('Python', 0.2886751345948129),
#  ('R', 0.2886751345948129)]

# For Further Exploration

- Crab is a framework for building recommender systems in Python.

- Graphlab also has a recommender toolkit.

- The Netflix Prize was a somewhat famous competition to build a better system to recommend movies to Netflix users.