# Data Science from Scratch: First Principles with Python (2015)

### Chapter 2. A Crash Course in Python

People are still crazy about Python after twenty-five years, which I find hard to believe.

Michael Palin

# The Basics

## Getting Python

pip install ipython

## The Zen of Python

There should be one — and preferably only one — obvious way to do it.

## Whitespace Formatting

# Indentation (not braces) delimits blocks: each deeper indent opens a block.
for i in [1, 2, 3, 4, 5]:
    print(i)                    # first line in "for i" block
    for j in [1, 2, 3, 4, 5]:
        print(j)                # first line in "for j" block
        print(i + j)            # last line in "for j" block
    print(i)                    # last line in "for i" block
print("done looping")

# Whitespace is ignored inside parentheses and brackets, which helps
# with long-winded computations ...
long_winded_computation = (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 +
                           13 + 14 + 15 + 16 + 17 + 18 + 19 + 20)

# ... and with making code easier to read.
list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

easier_to_read_list_of_lists = [ [1, 2, 3],
                                 [4, 5, 6],
                                 [7, 8, 9] ]

# A backslash continues a statement onto the next line.
two_plus_three = 2 + \
                 3

```python
for i in [1, 2, 3, 4, 5]:

    # notice the blank line
    print(i)
```

```
IndentationError: expected an indented block
```

## Modules

# Plain import: names are reached through the module prefix.
import re
my_regex = re.compile("[0-9]+", re.I)

# Aliased import: same module, different prefix.
import re as regex
my_regex = regex.compile("[0-9]+", regex.I)

import matplotlib.pyplot as plt

# Import specific names to use them without qualification.
from collections import defaultdict, Counter
lookup = defaultdict(int)
my_counter = Counter()

# A star import can silently overwrite names you already defined.
match = 10
from re import *    # uh oh, re has a match function
print(match)        # "<function re.match>"

## Arithmetic

from __future__ import division

## Functions

def double(x):
    """this is where you put an optional docstring
    that explains what the function does.
    for example, this function multiplies its input by 2"""
    return x * 2

def apply_to_one(f):
    """calls the function f with 1 as its argument"""
    return f(1)

my_double = double              # refers to the previously defined function
x = apply_to_one(my_double)     # equals 2

y = apply_to_one(lambda x: x + 4)   # equals 5

another_double = lambda x: 2 * x    # don't do this

def another_double(x):              # do this instead
    """returns twice its input"""
    return 2 * x

def my_print(message="my default message"):
    """prints message, falling back to the default when none is given"""
    print(message)

my_print("hello")   # prints 'hello'
my_print()          # prints 'my default message'

def subtract(a=0, b=0):
    """returns a - b; both arguments default to 0"""
    return a - b

subtract(10, 5)     # returns 5
subtract(0, 5)      # returns -5
subtract(b=5)       # same as previous

## Strings

single_quoted_string = 'data science'
double_quoted_string = "data science"

tab_string = "\t"       # represents the tab character
len(tab_string)         # is 1

not_tab_string = r"\t"  # represents the characters '\' and 't'
len(not_tab_string)     # is 2

multi_line_string = """This is the first line.
and this is the second line
and this is the third line"""

## Exceptions

# Handle the exception instead of letting it crash the program.
try:
    print(0 / 0)
except ZeroDivisionError:
    print("cannot divide by zero")

## Lists

integer_list = [1, 2, 3]
heterogeneous_list = ["string", 0.1, True]
list_of_lists = [ integer_list, heterogeneous_list, [] ]

list_length = len(integer_list)     # equals 3
list_sum    = sum(integer_list)     # equals 6

# list(...) so that x is a real, mutable list on Python 3 as well
x = list(range(10))     # is the list [0, 1, ..., 9]
zero = x[0]             # equals 0, lists are 0-indexed
one = x[1]              # equals 1
nine = x[-1]            # equals 9, 'Pythonic' for last element
eight = x[-2]           # equals 8, 'Pythonic' for next-to-last element
x[0] = -1               # now x is [-1, 1, 2, 3, ..., 9]

first_three = x[:3]                 # [-1, 1, 2]
three_to_end = x[3:]                # [3, 4, ..., 9]
one_to_four = x[1:5]                # [1, 2, 3, 4]
last_three = x[-3:]                 # [7, 8, 9]
without_first_and_last = x[1:-1]    # [1, 2, ..., 8]
copy_of_x = x[:]                    # [-1, 1, 2, ..., 9]

1 in [1, 2, 3]      # True
0 in [1, 2, 3]      # False

# extend modifies the list in place ...
x = [1, 2, 3]
x.extend([4, 5, 6])     # x is now [1,2,3,4,5,6]

# ... while + builds a new list and leaves the original alone.
x = [1, 2, 3]
y = x + [4, 5, 6]       # y is [1, 2, 3, 4, 5, 6]; x is unchanged

x = [1, 2, 3]
x.append(0)     # x is now [1, 2, 3, 0]
y = x[-1]       # equals 0
z = len(x)      # equals 4

# Unpacking needs the same number of names as elements.
x, y = [1, 2]   # now x is 1, y is 2

_, y = [1, 2]   # now y == 2, didn't care about the first element

## Tuples

my_list = [1, 2]
my_tuple = (1, 2)
other_tuple = 3, 4
my_list[1] = 3      # my_list is now [1, 3]

# Tuples are immutable, so item assignment raises TypeError.
try:
    my_tuple[1] = 3
except TypeError:
    print("cannot modify a tuple")

def sum_and_product(x, y):
    """returns the pair (x + y, x * y)"""
    return (x + y),(x * y)

sp = sum_and_product(2, 3)      # equals (5, 6)
s, p = sum_and_product(5, 10)   # s is 15, p is 50

x, y = 1, 2     # now x is 1, y is 2
x, y = y, x     # Pythonic way to swap variables; now x is 2, y is 1

## Dictionaries

empty_dict = {}                         # Pythonic
empty_dict2 = dict()                    # less Pythonic
grades = { "Joel" : 80, "Tim" : 95 }    # dictionary literal

joels_grade = grades["Joel"]            # equals 80

# Looking up a missing key raises KeyError.
try:
    kates_grade = grades["Kate"]
except KeyError:
    print("no grade for Kate!")

joel_has_grade = "Joel" in grades       # True
kate_has_grade = "Kate" in grades       # False

# get returns a default instead of raising for missing keys.
joels_grade = grades.get("Joel", 0)     # equals 80
kates_grade = grades.get("Kate", 0)     # equals 0
no_ones_grade = grades.get("No One")    # default default is None

grades["Tim"] = 99                      # replaces the old value
grades["Kate"] = 100                    # adds a third entry
num_students = len(grades)              # equals 3

tweet = {
    "user" : "joelgrus",
    "text" : "Data Science is Awesome",
    "retweet_count" : 100,
    "hashtags" : ["#data", "#science", "#datascience", "#awesome", "#yolo"]
}

# In Python 2 these are lists; in Python 3 they are views over the dict.
tweet_keys   = tweet.keys()     # the keys
tweet_values = tweet.values()   # the values
tweet_items  = tweet.items()    # the (key, value) tuples

"user" in tweet_keys            # True, but not Pythonic (a slow list scan in Python 2)
"user" in tweet                 # more Pythonic, uses faster dict in
"joelgrus" in tweet_values      # True

### defaultdict

# Approach 1: check for the key before incrementing.
word_counts = {}
for word in document:
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1

# Approach 2: "forgiveness is better than permission".
word_counts = {}
for word in document:
    try:
        word_counts[word] += 1
    except KeyError:
        word_counts[word] = 1

# Approach 3: get, which behaves gracefully for missing keys.
word_counts = {}
for word in document:
    previous_count = word_counts.get(word, 0)
    word_counts[word] = previous_count + 1

# Approach 4: a defaultdict creates the missing entry for you.
from collections import defaultdict

word_counts = defaultdict(int)          # int() produces 0
for word in document:
    word_counts[word] += 1

dd_list = defaultdict(list)             # list() produces an empty list
dd_list[2].append(1)                    # now dd_list contains {2: [1]}

dd_dict = defaultdict(dict)             # dict() produces an empty dict
dd_dict["Joel"]["City"] = "Seattle"     # { "Joel" : { "City" : "Seattle"}}

# Any zero-argument callable works as the factory.
dd_pair = defaultdict(lambda: [0, 0])
dd_pair[2][1] = 1                       # now dd_pair contains {2: [0,1]}

from collections import Counter
c = Counter([0, 1, 2, 0])   # c is (basically) { 0 : 2, 1 : 1, 2 : 1 }

word_counts = Counter(document)

# print the 10 most common words and their counts
for word, count in word_counts.most_common(10):
    print(word, count)

s = set()
s.add(1)        # s is now { 1 }
s.add(2)        # s is now { 1, 2 }
s.add(2)        # s is still { 1, 2 }
x = len(s)      # equals 2
y = 2 in s      # equals True
z = 3 in s      # equals False

stopwords_list = ["a","an","at"] + hundreds_of_other_words + ["yet", "you"]

"zip" in stopwords_list     # False, but have to check every element

stopwords_set = set(stopwords_list)
"zip" in stopwords_set      # very fast to check

# Converting to a set is an easy way to find the distinct items.
item_list = [1, 2, 3, 1, 2, 3]
num_items = len(item_list)              # 6
item_set = set(item_list)               # {1, 2, 3}
num_distinct_items = len(item_set)      # 3
distinct_item_list = list(item_set)     # [1, 2, 3]

## Control Flow

if 1 > 2:
    message = "if only 1 were greater than two..."
elif 1 > 3:
    message = "elif stands for 'else if'"
else:
    message = "when all else fails use else (if you want to)"

# A ternary if-then-else on one line.
parity = "even" if x % 2 == 0 else "odd"

x = 0
while x < 10:
    print(x, "is less than 10")
    x += 1

# The same loop, written the more usual way with for and in.
for x in range(10):
    print(x, "is less than 10")

for x in range(10):
    if x == 3:
        continue    # go immediately to the next iteration
    if x == 5:
        break       # quit the loop entirely
    print(x)

## Truthiness

one_is_less_than_two = 1 < 2            # equals True
true_equals_false = True == False       # equals False

x = None
print(x == None)    # prints True, but is not Pythonic
print(x is None)    # prints True, and is Pythonic

- `False`
- `None`
- `[]` (an empty `list`)
- `{}` (an empty `dict`)
- `""`
- `set()`
- `0`
- `0.0`

# Truthiness lets you test for an empty string (or list) directly.
if s:
    first_char = s[0]
else:
    first_char = ""

# "and" returns its second value when the first is truthy,
# and the first value when it is not.
first_char = s and s[0]

# "or" returns its first value when truthy, otherwise the second.
safe_x = x or 0

all([True, 1, { 3 }])   # True
all([True, 1, {}])      # False, {} is falsy
any([True, 1, {}])      # True, True is truthy
all([])                 # True, no falsy elements in the list
any([])                 # False, no truthy elements in the list

x = [4,1,2,3]
y = sorted(x)   # is [1,2,3,4], x is unchanged
x.sort()        # now x is [1,2,3,4]

# sort the list by absolute value from largest to smallest
x = sorted([-4,1,-2,3], key=abs, reverse=True)  # is [-4,3,-2,1]

# sort the words and counts from highest count to lowest
# (each item is a (word, count) pair; index 1 is the count -- a tuple
# parameter such as "lambda (word, count): ..." is a SyntaxError in Python 3)
wc = sorted(word_counts.items(),
            key=lambda word_and_count: word_and_count[1],
            reverse=True)

## List Comprehensions

even_numbers = [x for x in range(5) if x % 2 == 0]  # [0, 2, 4]
squares      = [x * x for x in range(5)]            # [0, 1, 4, 9, 16]
even_squares = [x * x for x in even_numbers]        # [0, 4, 16]

# The same syntax builds dictionaries and sets.
square_dict = { x : x * x for x in range(5) }       # { 0:0, 1:1, 2:4, 3:9, 4:16 }
square_set  = { x * x for x in [1, -1] }            # { 1 }

# Use an underscore when the value itself is not needed.
zeroes = [0 for _ in even_numbers]      # has the same length as even_numbers

# A comprehension can include multiple fors ...
pairs = [(x, y)
         for x in range(10)
         for y in range(10)]    # 100 pairs (0,0) (0,1) ... (9,8), (9,9)

# ... and later fors can use the results of earlier ones.
increasing_pairs = [(x, y)                      # only pairs with x < y,
                    for x in range(10)          # range(lo, hi) equals
                    for y in range(x + 1, 10)]  # [lo, lo + 1, ..., hi - 1]

## Generators and Iterators

def lazy_range(n):
    """a lazy version of range"""
    i = 0
    while i < n:
        yield i     # hand back one value, then resume here on the next request
        i += 1

# The loop consumes the yielded values one at a time until none are left.
for i in lazy_range(10):
    do_something_with(i)

def natural_numbers():
    """returns 1, 2, 3, ..."""
    n = 1
    while True:     # never terminates: take only as many values as you need
        yield n
        n += 1

###### TIP

# A for comprehension wrapped in parentheses creates a generator.
lazy_evens_below_20 = (i for i in lazy_range(20) if i % 2 == 0)

## Randomness

import random

four_uniform_randoms = [random.random() for _ in range(4)]

#  [0.8444218515250481,     # random.random() produces numbers
#   0.7579544029403025,     # uniformly between 0 and 1
#   0.420571580830845,      # it's the random function we'll use
#   0.25891675029296335]    # most often

# Seeding makes the pseudorandom sequence reproducible.
random.seed(10)         # set the seed to 10
print(random.random())  # 0.57140259469
random.seed(10)         # reset the seed to 10
print(random.random())  # 0.57140259469 again

random.randrange(10)    # choose randomly from range(10) = [0, 1, ..., 9]
random.randrange(3, 6)  # choose randomly from range(3, 6) = [3, 4, 5]

# shuffle reorders in place, so it needs a real list (not a Python 3 range)
up_to_ten = list(range(10))
random.shuffle(up_to_ten)
print(up_to_ten)
# [2, 5, 1, 9, 7, 3, 8, 6, 4, 0]   (your results will probably be different)

my_best_friend = random.choice(["Alice", "Bob", "Charlie"])     # "Bob" for me

# sample chooses without replacement (no duplicates) ...
lottery_numbers = range(60)
winning_numbers = random.sample(lottery_numbers, 6)     # [16, 36, 10, 6, 25, 9]

# ... while repeated calls to choice allow duplicates.
four_with_replacement = [random.choice(range(10))
                         for _ in range(4)]
# [9, 4, 4, 2]

## Regular Expressions

import re

print(all([                                 # all of these are true, because
    not re.match("a", "cat"),               # * 'cat' doesn't start with 'a'
    re.search("a", "cat"),                  # * 'cat' has an 'a' in it
    not re.search("c", "dog"),              # * 'dog' doesn't have a 'c' in it
    3 == len(re.split("[ab]", "carbs")),    # * split on a or b to ['c','r','s']
    "R-D-" == re.sub("[0-9]", "-", "R2D2")  # * replace digits with dashes
    ]))                                     # prints True

## Object-Oriented Programming

# by convention, we give classes PascalCase names
class Set:

    # these are the member functions
    # every one takes a first parameter "self" (another convention)
    # that refers to the particular Set object being used

    def __init__(self, values=None):
        """This is the constructor.
        It gets called when you create a new Set.
        You would use it like
        s1 = Set()          # empty set
        s2 = Set([1,2,2,3]) # initialize with values"""

        self.dict = {}  # each instance of Set has its own dict property
                        # which is what we'll use to track memberships
        if values is not None:
            for value in values:
                self.add(value)

    def __repr__(self):
        """this is the string representation of a Set object
        if you type it at the Python prompt or pass it to str()"""
        # list(...) keeps the output a plain list on Python 3, where
        # keys() returns a view that would print as "dict_keys([...])"
        return "Set: " + str(list(self.dict.keys()))

    # we'll represent membership by being a key in self.dict with value True
    def add(self, value):
        self.dict[value] = True

    # value is in the Set if it's a key in the dictionary
    def contains(self, value):
        return value in self.dict

    # raises KeyError if value is not in the Set
    def remove(self, value):
        del self.dict[value]

s = Set([1,2,3])
s.add(4)
print(s.contains(4))    # True
s.remove(3)
print(s.contains(3))    # False

## Functional Tools

def exp(base, power):
    """returns base raised to power"""
    return base ** power

def two_to_the(power):
    """returns 2 raised to power"""
    return exp(2, power)

# partial "curries" a function: it fixes some arguments and returns a new one.
from functools import partial
two_to_the = partial(exp, 2)    # is now a function of one variable
print(two_to_the(3))            # 8

# Later arguments can be fixed by name.
square_of = partial(exp, power=2)
print(square_of(3))             # 9

def double(x):
    return 2 * x

xs = [1, 2, 3, 4]
# map is lazy on Python 3, so its result is materialised with list(...)
twice_xs = [double(x) for x in xs]      # [2, 4, 6, 8]
twice_xs = list(map(double, xs))        # same as above
list_doubler = partial(map, double)     # *function* that doubles a list
twice_xs = list(list_doubler(xs))       # again [2, 4, 6, 8]

def multiply(x, y): return x * y

# map accepts several lists when the function takes several arguments.
products = list(map(multiply, [1, 2], [4, 5]))  # [1 * 4, 2 * 5] = [4, 10]

def is_even(x):
    """True if x is even, False if x is odd"""
    return x % 2 == 0

# filter is lazy on Python 3 as well.
x_evens = [x for x in xs if is_even(x)]     # [2, 4]
x_evens = list(filter(is_even, xs))         # same as above
list_evener = partial(filter, is_even)      # *function* that filters a list
x_evens = list(list_evener(xs))             # again [2, 4]

# reduce is a builtin on Python 2 but lives in functools on Python 3;
# this import works on both.
from functools import reduce
x_product = reduce(multiply, xs)            # = 1 * 2 * 3 * 4 = 24
list_product = partial(reduce, multiply)    # *function* that reduces a list
x_product = list_product(xs)                # again = 24

## enumerate

# not Pythonic
for i in range(len(documents)):
    document = documents[i]
    do_something(i, document)

# also not Pythonic
i = 0
for document in documents:
    do_something(i, document)
    i += 1

# Pythonic: enumerate yields (index, element) pairs
for i, document in enumerate(documents):
    do_something(i, document)

# the same applies when only the indexes are wanted
for i in range(len(documents)): do_something(i)     # not Pythonic
for i, _ in enumerate(documents): do_something(i)   # Pythonic

## zip and Argument Unpacking

list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]
list(zip(list1, list2))     # is [('a', 1), ('b', 2), ('c', 3)]

# "Unzip" a list of pairs: the asterisk performs argument unpacking ...
pairs = [('a', 1), ('b', 2), ('c', 3)]
letters, numbers = zip(*pairs)

# ... so the call above is the same as this one.
zip(('a', 1), ('b', 2), ('c', 3))

def add(a, b): return a + b

add(1, 2)       # returns 3
try:
    add([1, 2])     # TypeError! the list counts as a single argument
except TypeError:
    print("add expects two inputs")
add(*[1, 2])    # returns 3

## args and kwargs

def doubler(f):
    """returns a new function that doubles the result of the one-argument f"""
    def g(x):
        return 2 * f(x)
    return g

def f1(x):
    return x + 1

g = doubler(f1)
print(g(3))     # 8 (== ( 3 + 1) * 2)
print(g(-1))    # 0 (== (-1 + 1) * 2)

def f2(x, y):
    return x + y

# doubler breaks down for functions that take more than one argument.
g = doubler(f2)
try:
    print(g(1, 2))  # TypeError: g() takes exactly 1 argument (2 given)
except TypeError:
    print("as defined, g only takes one argument")

def magic(*args, **kwargs):
    """prints the unnamed (positional) and keyword arguments it was given"""
    print("unnamed args:", args)        # args is a tuple
    print("keyword args:", kwargs)      # kwargs is a dict

magic(1, 2, key="word", key2="word2")

# prints
#  unnamed args: (1, 2)
#  keyword args: {'key': 'word', 'key2': 'word2'}

def other_way_magic(x, y, z):
    """returns x + y + z"""
    return x + y + z

# A list and a dict can supply the arguments in the other direction.
x_y_list = [1, 2]
z_dict = { "z" : 3 }
print(other_way_magic(*x_y_list, **z_dict))     # 6

def doubler_correct(f):
    """works no matter what kind of inputs f expects"""
    def g(*args, **kwargs):
        """whatever arguments g is supplied, pass them through to f"""
        return 2 * f(*args, **kwargs)
    return g

g = doubler_correct(f2)
print(g(1, 2))  # 6

## Welcome to DataSciencester!

- There is no shortage of Python tutorials in the world. The official one is not a bad place to start.

- The official IPython tutorial is not quite as good. You might be better off with their videos and presentations. Alternatively, Wes McKinney’s *Python for Data Analysis* (O’Reilly) has a really good IPython chapter.