Data Science from Scratch: First Principles with Python (2015)
Chapter 2. A Crash Course in Python
People are still crazy about Python after twenty-five years, which I find hard to believe.
Michael Palin
The Basics
Getting Python
pip install ipython
The Zen of Python
There should be one — and preferably only one — obvious way to do it.
Whitespace Formatting
# Nested loops: indentation alone decides which statements belong to
# which block.
for i in [1, 2, 3, 4, 5]:
    print(i)                    # first line in "for i" block
    for j in [1, 2, 3, 4, 5]:
        print(j)                # first line in "for j" block
        print(i + j)            # last line in "for j" block
    print(i)                    # last line in "for i" block
print("done looping")
# Whitespace is ignored inside parentheses and brackets, so long
# expressions can be wrapped freely.
long_winded_computation = (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 +
                           13 + 14 + 15 + 16 + 17 + 18 + 19 + 20)

list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# Same value as list_of_lists, laid out one row per line.
easier_to_read_list_of_lists = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]

# A trailing backslash continues the statement on the next line.
two_plus_three = 2 + \
                 3
# NOTE: a blank line inside an indented block is fine in a script, but the
# plain interactive shell treats it as the end of the block.
for i in [1, 2, 3, 4, 5]:

    # notice the blank line
    print(i)
IndentationError: expected an indented block
Modules
# Import a module and reach its members through the module name.
import re
my_regex = re.compile("[0-9]+", re.I)

# The same module bound to an alias; my_regex ends up as an equivalent pattern.
import re as regex
my_regex = regex.compile("[0-9]+", regex.I)
import
matplotlib.pyplot
as
plt
# Import specific names so they can be used without the module prefix.
from collections import defaultdict, Counter
lookup = defaultdict(int)
my_counter = Counter()
# Demonstrates why star imports are risky: they silently rebind names
# that are already defined in this module.
match = 10
from re import *    # uh oh, re has a match function
print(match)        # "<function re.match>"
Arithmetic
from
__future__
import
division
Functions
def double(x):
    """this is where you put an optional docstring
    that explains what the function does.
    for example, this function multiplies its input by 2"""
    return x * 2
def apply_to_one(f):
    """calls the function f with 1 as its argument"""
    return f(1)
# Functions are first-class: they can be bound to names and passed around.
my_double = double                  # refers to the previously defined function
x = apply_to_one(my_double)         # equals 2

# Short anonymous functions (lambdas) can be passed the same way.
y = apply_to_one(lambda x: x + 4)   # equals 5
# A lambda can be bound to a name, but a def is preferred; the def below
# replaces the lambda binding.
another_double = lambda x: 2 * x    # don't do this


def another_double(x):              # do this instead
    """Return 2 * x."""
    return 2 * x
def my_print(message="my default message"):
    """Print message, falling back to a default when none is given."""
    # The extracted body was the bare expression `message`, which does
    # nothing; the print call is restored here.
    print(message)
# Parameters with defaults only need to be supplied when overriding them.
my_print("hello")   # prints 'hello'
my_print()          # prints 'my default message'
def subtract(a=0, b=0):
    """Return a - b; both arguments default to 0."""
    return a - b
# Arguments can be passed by position or by name.
subtract(10, 5)     # returns 5
subtract(0, 5)      # returns -5
subtract(b=5)       # same as previous
Strings
# Strings may be delimited by single or double quotes.
single_quoted_string = 'data science'
double_quoted_string = "data science"

# Backslashes introduce escape sequences ...
tab_string = "\t"       # represents the tab character
len(tab_string)         # is 1

# ... unless the literal is a raw string (r"...").
not_tab_string = r"\t"  # represents the characters '\' and 't'
len(not_tab_string)     # is 2

# Triple quotes allow a literal to span several lines.
multi_line_string = """This is the first line.
and this is the second line
and this is the third line"""
Exceptions
# Handle an exception with try/except instead of letting the program crash.
try:
    0 / 0
except ZeroDivisionError:
    print("cannot divide by zero")
Lists
# A list is an ordered collection; its elements may have mixed types.
integer_list = [1, 2, 3]
heterogeneous_list = ["string", 0.1, True]
list_of_lists = [integer_list, heterogeneous_list, []]

list_length = len(integer_list)     # equals 3
list_sum = sum(integer_list)        # equals 6
listum = list_sum   # alias for the garbled name this value was previously bound to
# list(...) makes x a real, mutable list on Python 3 as well, where a bare
# range object rejects the item assignment below.
x = list(range(10))     # is the list [0, 1, ..., 9]
zero = x[0]             # equals 0, lists are 0-indexed
one = x[1]              # equals 1
nine = x[-1]            # equals 9, 'Pythonic' for last element
eight = x[-2]           # equals 8, 'Pythonic' for next-to-last element
x[0] = -1               # now x is [-1, 1, 2, 3, ..., 9]

# Square brackets with a colon "slice" a list.
first_three = x[:3]                 # [-1, 1, 2]
three_to_end = x[3:]                # [3, 4, ..., 9]
one_to_four = x[1:5]                # [1, 2, 3, 4]
last_three = x[-3:]                 # [7, 8, 9]
without_first_and_last = x[1:-1]    # [1, 2, ..., 8]
copy_of_x = x[:]                    # [-1, 1, 2, ..., 9]
# Membership test; this scans the list element by element.
1 in [1, 2, 3]      # True
0 in [1, 2, 3]      # False

# extend modifies the list in place.
x = [1, 2, 3]
x.extend([4, 5, 6])     # x is now [1,2,3,4,5,6]

# Addition builds a new list instead.
x = [1, 2, 3]
y = x + [4, 5, 6]       # y is [1, 2, 3, 4, 5, 6]; x is unchanged

# append adds one item at a time.
x = [1, 2, 3]
x.append(0)     # x is now [1, 2, 3, 0]
y = x[-1]       # equals 0
z = len(x)      # equals 4

# A list of known length can be unpacked into separate names.
x, y = [1, 2]   # now x is 1, y is 2
_, y = [1, 2]   # now y == 2, didn't care about the first element
Tuples
# Tuples are written with parentheses (or nothing) instead of brackets.
my_list = [1, 2]
my_tuple = (1, 2)
other_tuple = 3, 4
my_list[1] = 3      # my_list is now [1, 3]

# Unlike lists, tuples reject item assignment.
try:
    my_tuple[1] = 3
except TypeError:
    print("cannot modify a tuple")
def sum_and_product(x, y):
    """Return the tuple (x + y, x * y)."""
    return (x + y), (x * y)
# The returned tuple can be kept whole or unpacked into separate names.
sp = sum_and_product(2, 3)      # equals (5, 6)
s, p = sum_and_product(5, 10)   # s is 15, p is 50
# Multiple assignment works with tuples as well as lists.
x, y = 1, 2     # now x is 1, y is 2
x, y = y, x     # Pythonic way to swap variables; now x is 2, y is 1
Dictionaries
# A dictionary associates keys with values.
empty_dict = {}                         # Pythonic
empty_dict2 = dict()                    # less Pythonic
grades = {"Joel": 80, "Tim": 95}        # dictionary literal

joels_grade = grades["Joel"]            # equals 80

# Looking up a missing key with [] raises KeyError.
try:
    kates_grade = grades["Kate"]
except KeyError:
    print("no grade for Kate!")

# `in` checks whether a key exists.
joel_has_grade = "Joel" in grades       # True
kate_has_grade = "Kate" in grades       # False

# get returns a default instead of raising for a missing key.
joels_grade = grades.get("Joel", 0)     # equals 80
kates_grade = grades.get("Kate", 0)     # equals 0
no_ones_grade = grades.get("No One")    # default default is None

# Assignment uses the same square brackets.
grades["Tim"] = 99                      # replaces the old value
grades["Kate"] = 100                    # adds a third entry
num_students = len(grades)              # equals 3
# Dictionaries are a simple way to represent structured data.
tweet = {
    "user": "joelgrus",
    "text": "Data Science is Awesome",
    "retweet_count": 100,
    "hashtags": ["#data", "#science", "#datascience", "#awesome", "#yolo"],
}

# list(...) keeps these real lists on Python 3 too, where keys(), values()
# and items() return views; the comments below then hold on both versions.
tweet_keys = list(tweet.keys())         # list of keys
tweet_values = list(tweet.values())     # list of values
tweet_items = list(tweet.items())       # list of (key, value) tuples

"user" in tweet_keys            # True, but uses a slow list in
"user" in tweet                 # more Pythonic, uses faster dict in
"joelgrus" in tweet_values      # True
defaultdict
# Three hand-written ways to count occurrences of each word.
# NOTE(review): `document` is not defined in this part of the file; it is
# presumably an iterable of hashable words -- confirm against the caller.

# 1) check for the key before incrementing
word_counts = {}
for word in document:
    if word in word_counts:
        word_counts[word] += 1
    else:
        word_counts[word] = 1

# 2) try to increment and handle the missing key
word_counts = {}
for word in document:
    try:
        word_counts[word] += 1
    except KeyError:
        word_counts[word] = 1

# 3) use get, which supplies 0 for a missing key
word_counts = {}
for word in document:
    previous_count = word_counts.get(word, 0)
    word_counts[word] = previous_count + 1
# A defaultdict creates a missing key's value on first access, so no
# existence check is needed.
# NOTE(review): `document` is not defined in this part of the file.
from collections import defaultdict

word_counts = defaultdict(int)      # int() produces 0
for word in document:
    word_counts[word] += 1
# The factory passed to defaultdict decides what a missing key starts as.
dd_list = defaultdict(list)             # list() produces an empty list
dd_list[2].append(1)                    # now dd_list contains {2: [1]}

dd_dict = defaultdict(dict)             # dict() produces an empty dict
dd_dict["Joel"]["City"] = "Seattle"     # { "Joel" : { "City" : "Seattle"}}

dd_pair = defaultdict(lambda: [0, 0])   # any zero-argument callable works
dd_pair[2][1] = 1                       # now dd_pair contains {2: [0,1]}
# A Counter maps each distinct element of a sequence to its count.
from collections import Counter
c = Counter([0, 1, 2, 0])   # c is (basically) { 0 : 2, 1 : 1, 2 : 1 }
# Counter solves the word-count problem in one line.
# NOTE(review): `document` is not defined in this part of the file.
word_counts = Counter(document)

# print the 10 most common words and their counts
for word, count in word_counts.most_common(10):
    # a single formatted string prints the same on Python 2 and 3
    print("{} {}".format(word, count))
# A set is a collection of distinct elements.
s = set()
s.add(1)        # s is now { 1 }
s.add(2)        # s is now { 1, 2 }
s.add(2)        # s is still { 1, 2 }
x = len(s)      # equals 2
y = 2 in s      # equals True
z = 3 in s      # equals False
# Membership tests are the main reason to prefer a set over a list.
# NOTE(review): `hundreds_of_other_words` is not defined in this part of
# the file; presumably a list of strings -- confirm.
stopwords_list = ["a", "an", "at"] + hundreds_of_other_words + ["yet", "you"]

"zip" in stopwords_list     # False, but have to check every element

stopwords_set = set(stopwords_list)
"zip" in stopwords_set      # very fast to check
# Converting to a set is a quick way to find the distinct items.
item_list = [1, 2, 3, 1, 2, 3]
num_items = len(item_list)              # 6
item_set = set(item_list)               # {1, 2, 3}
num_distinct_items = len(item_set)      # 3
distinct_item_list = list(item_set)     # [1, 2, 3]
Control Flow
# Conditional execution with if / elif / else; only the first branch whose
# condition is true runs.
if 1 > 2:
    message = "if only 1 were greater than two..."
elif 1 > 3:
    message = "elif stands for 'else if'"
else:
    message = "when all else fails use else (if you want to)"
# A ternary if-then-else on one line.
# NOTE(review): x is whatever value was bound earlier in the file.
parity = "even" if x % 2 == 0 else "odd"
# A while loop ...
x = 0
while x < 10:
    # a single formatted string prints the same on Python 2 and 3
    print("{} is less than 10".format(x))
    x += 1

# ... and the equivalent for loop.
for x in range(10):
    print("{} is less than 10".format(x))

# continue and break give finer control; this loop prints 0, 1, 2 and 4.
for x in range(10):
    if x == 3:
        continue    # go immediately to the next iteration
    if x == 5:
        break       # quit the loop entirely
    print(x)
Truthiness
# Booleans are written True and False.
one_is_less_than_two = 1 < 2            # equals True
true_equals_false = True == False       # equals False

# None marks a nonexistent value.
x = None
print(x == None)    # prints True, but is not Pythonic
print(x is None)    # prints True, and is Pythonic
§ False
§ None
§ [] (an empty list)
§ {} (an empty dict)
§ ""
§ set()
§ 0
§ 0.0
# Truthiness lets `if` test for an empty value directly.
# NOTE(review): s and x are whatever values were bound earlier in the file;
# the comments below assume s is a string and x is a number or None.
if s:
    first_char = s[0]
else:
    first_char = ""

# Same result in one expression: `and` returns s itself when s is falsy,
# otherwise s[0].
first_char = s and s[0]

# `or` returns x when x is truthy, otherwise 0.
safe_x = x or 0
# all is True when every element is truthy; any when at least one is.
all([True, 1, {3}])     # True
all([True, 1, {}])      # False, {} is falsy
any([True, 1, {}])      # True, True is truthy
all([])                 # True, no falsy elements in the list
any([])                 # False, no truthy elements in the list

# sorted returns a new list; the sort method sorts in place.
x = [4, 1, 2, 3]
y = sorted(x)   # is [1,2,3,4], x is unchanged
x.sort()        # now x is [1,2,3,4]

# sort the list by absolute value from largest to smallest
x = sorted([-4, 1, -2, 3], key=abs, reverse=True)   # is [-4,3,-2,1]
# sort the words and counts from highest count to lowest
# `lambda (word, count): count` is a SyntaxError on Python 3, which removed
# tuple parameters; indexing the (word, count) pair works on both versions.
# NOTE(review): word_counts is defined earlier in the file.
wc = sorted(word_counts.items(),
            key=lambda word_and_count: word_and_count[1],
            reverse=True)
List Comprehensions
# List comprehensions transform and/or filter one list into another.
even_numbers = [x for x in range(5) if x % 2 == 0]     # [0, 2, 4]
squares = [x * x for x in range(5)]                    # [0, 1, 4, 9, 16]
even_squares = [x * x for x in even_numbers]           # [0, 4, 16]

# The same syntax builds dictionaries and sets.
square_dict = {x: x * x for x in range(5)}     # { 0:0, 1:1, 2:4, 3:9, 4:16 }
square_set = {x * x for x in [1, -1]}          # { 1 }

# An underscore signals that the loop value itself is not used.
zeroes = [0 for _ in even_numbers]     # has the same length as even_numbers
# A comprehension can include multiple fors.
pairs = [(x, y)
         for x in range(10)
         for y in range(10)]    # 100 pairs (0,0) (0,1) ... (9,8), (9,9)

# Later fors can use the results of earlier ones.
increasing_pairs = [(x, y)                      # only pairs with x < y,
                    for x in range(10)          # range(lo, hi) equals
                    for y in range(x + 1, 10)]  # [lo, lo + 1, ..., hi - 1]
Generators and Iterators
def lazy_range(n):
    """a lazy version of range

    Yields 0, 1, ..., n - 1 one value at a time; yields nothing when n <= 0.
    """
    i = 0
    while i < n:
        yield i
        i += 1
# The loop consumes the yielded values one at a time.
# NOTE(review): do_something_with is not defined in this part of the file.
for i in lazy_range(10):
    do_something_with(i)
def natural_numbers():
    """returns 1, 2, 3, ...

    The generator never ends on its own, so the caller has to stop
    iterating (for example with break).
    """
    n = 1
    while True:
        yield n
        n += 1
TIP
# A comprehension in parentheses is a generator expression: nothing is
# computed until it is iterated.
lazy_evens_below_20 = (i for i in lazy_range(20) if i % 2 == 0)
Randomness
import random

four_uniform_randoms = [random.random() for _ in range(4)]

# [0.8444218515250481,      # random.random() produces numbers
#  0.7579544029403025,      # uniformly between 0 and 1
#  0.420571580830845,       # it's the random function we'll use
#  0.25891675029296335]     # most often

# Seeding makes the pseudorandom sequence reproducible.
random.seed(10)             # set the seed to 10
print(random.random())      # 0.57140259469
random.seed(10)             # reset the seed to 10
print(random.random())      # 0.57140259469 again

# randrange takes one or two arguments and picks from the matching range().
random.randrange(10)        # choose randomly from range(10) = [0, 1, ..., 9]
random.randrange(3, 6)      # choose randomly from range(3, 6) = [3, 4, 5]
# shuffle reorders in place, so it needs a real list; on Python 3 a bare
# range object cannot be shuffled.
up_to_ten = list(range(10))
random.shuffle(up_to_ten)
print(up_to_ten)
# [2, 5, 1, 9, 7, 3, 8, 6, 4, 0] (your results will probably be different)

# choice picks one element at random.
my_best_friend = random.choice(["Alice", "Bob", "Charlie"])     # "Bob" for me

# sample picks elements without replacement (no duplicates) ...
lottery_numbers = range(60)
winning_numbers = random.sample(lottery_numbers, 6)     # [16, 36, 10, 6, 25, 9]

# ... while repeated calls to choice allow duplicates.
four_with_replacement = [random.choice(range(10))
                         for _ in range(4)]
# [9, 4, 4, 2]
Regular Expressions
import re

print(all([                                     # all of these are true, because
    not re.match("a", "cat"),                   # * 'cat' doesn't start with 'a'
    re.search("a", "cat"),                      # * 'cat' has an 'a' in it
    not re.search("c", "dog"),                  # * 'dog' doesn't have a 'c' in it
    3 == len(re.split("[ab]", "carbs")),        # * split on a or b to ['c','r','s']
    "R-D-" == re.sub("[0-9]", "-", "R2D2"),     # * replace digits with dashes
]))                                             # prints True
Object-Oriented Programming
# by convention, we give classes PascalCase names
class Set:
    """A minimal set whose members are stored as the keys of a dict."""

    # these are the member functions
    # every one takes a first parameter "self" (another convention)
    # that refers to the particular Set object being used

    def __init__(self, values=None):
        """This is the constructor.
        It gets called when you create a new Set.
        You would use it like
        s1 = Set()          # empty set
        s2 = Set([1,2,2,3]) # initialize with values"""
        self.dict = {}  # each instance of Set has its own dict property
                        # which is what we'll use to track memberships
        if values is not None:
            for value in values:
                self.add(value)

    def __repr__(self):
        """this is the string representation of a Set object
        if you type it at the Python prompt or pass it to str()"""
        # list(...) keeps the output as "Set: [1, 2, 3]" on Python 3, where
        # str(dict.keys()) would render as "dict_keys([1, 2, 3])".
        return "Set: " + str(list(self.dict.keys()))

    # we'll represent membership by being a key in self.dict with value True
    def add(self, value):
        """Add value to the Set; adding an existing member changes nothing."""
        self.dict[value] = True

    # value is in the Set if it's a key in the dictionary
    def contains(self, value):
        """Return True if value is a member of the Set, else False."""
        return value in self.dict

    def remove(self, value):
        """Remove value from the Set; raises KeyError if it is not a member."""
        del self.dict[value]
# Exercise the Set class: build, add, query, remove.
s = Set([1, 2, 3])
s.add(4)
print(s.contains(4))    # True
s.remove(3)
print(s.contains(3))    # False
Functional Tools
def exp(base, power):
    """Return base raised to power."""
    return base ** power
def two_to_the(power):
    """Return 2 raised to power, by fixing the first argument of exp."""
    return exp(2, power)
# functools.partial builds the curried function without a hand-written def.
from functools import partial
two_to_the = partial(exp, 2)    # is now a function of one variable
print(two_to_the(3))            # 8

# Later arguments can be fixed too, by naming them.
square_of = partial(exp, power=2)
print(square_of(3))             # 9
def double(x):
    """Return 2 * x."""
    return 2 * x
# map as a functional alternative to a list comprehension.
# list(...) materialises the result: on Python 3 map returns a lazy
# iterator, so the "[2, 4, 6, 8]" comments would otherwise not hold.
xs = [1, 2, 3, 4]
twice_xs = [double(x) for x in xs]      # [2, 4, 6, 8]
twice_xs = list(map(double, xs))        # same as above
list_doubler = partial(map, double)     # *function* that doubles a list
twice_xs = list(list_doubler(xs))       # again [2, 4, 6, 8]
def multiply(x, y):
    """Return x * y."""
    return x * y
# map accepts several lists when the function takes several arguments.
# list(...) materialises the lazy Python 3 map object.
products = list(map(multiply, [1, 2], [4, 5]))  # [1 * 4, 2 * 5] = [4, 10]
def is_even(x):
    """True if x is even, False if x is odd"""
    return x % 2 == 0
# filter as a functional alternative to a list-comprehension `if`.
# list(...) materialises the result: on Python 3 filter is lazy.
x_evens = [x for x in xs if is_even(x)]     # [2, 4]
x_evens = list(filter(is_even, xs))         # same as above
list_evener = partial(filter, is_even)      # *function* that filters a list
listvener = list_evener     # alias for the garbled name this was previously bound to
x_evens = list(list_evener(xs))             # again [2, 4]
# reduce combines the first two elements, then that result with the third,
# and so on, down to a single value.
# reduce is a builtin only on Python 2; importing it from functools works
# on both Python 2.6+ and Python 3.
from functools import reduce

x_product = reduce(multiply, xs)            # = 1 * 2 * 3 * 4 = 24
list_product = partial(reduce, multiply)    # *function* that reduces a list
x_product = list_product(xs)                # again = 24
enumerate
# Ways to iterate over a list when both the elements and their indexes
# are needed.
# NOTE(review): documents and do_something are not defined in this part
# of the file.

# not Pythonic
for i in range(len(documents)):
    document = documents[i]
    do_something(i, document)

# also not Pythonic
i = 0
for document in documents:
    do_something(i, document)
    i += 1

# enumerate yields (index, element) tuples
for i, document in enumerate(documents):
    do_something(i, document)

# when only the indexes are needed
for i in range(len(documents)): do_something(i)     # not Pythonic
for i, _ in enumerate(documents): do_something(i)   # Pythonic
zip and Argument Unpacking
# zip pairs up corresponding elements of its arguments.
# list(...) materialises the result: on Python 3 zip is lazy.
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]
list(zip(list1, list2))     # is [('a', 1), ('b', 2), ('c', 3)]

# zip(*pairs) "unzips" a list of pairs.
pairs = [('a', 1), ('b', 2), ('c', 3)]
letters, numbers = zip(*pairs)

# The * unpacks pairs into separate arguments, so the call above is the
# same as this one:
list(zip(('a', 1), ('b', 2), ('c', 3)))
def add(a, b):
    """Return a + b."""
    return a + b
# Argument unpacking works with any function.
add(1, 2)       # returns 3

# Passing the list itself supplies only one argument. The error is caught
# so the rest of the script still runs.
try:
    add([1, 2])     # TypeError!
except TypeError:
    print("add needs two arguments, not one list")

add(*[1, 2])    # returns 3
args and kwargs
def doubler(f):
    """Return a function g such that g(x) == 2 * f(x).

    g accepts exactly one positional argument, so it only works for an f
    that can be called with a single argument.
    """
    def g(x):
        return 2 * f(x)
    return g
def f1(x):
    """Return x + 1."""
    return x + 1
# doubler works when the wrapped function takes a single argument.
g = doubler(f1)
print(g(3))     # 8 (== ( 3 + 1) * 2)
print(g(-1))    # 0 (== (-1 + 1) * 2)
def f2(x, y):
    """Return x + y."""
    return x + y
# doubler breaks down for a function that takes more than one argument.
# The error is caught so the rest of the script still runs.
g = doubler(f2)
try:
    print(g(1, 2))  # TypeError: g() takes exactly 1 argument (2 given)
except TypeError:
    print("g() only accepts one argument")
def magic(*args, **kwargs):
    """Print the positional arguments (a tuple) and the keyword arguments
    (a dict) the function was called with."""
    # The extracted body was two bare tuples, which do nothing; the print
    # calls are restored. A single formatted string prints the same on
    # Python 2 and 3.
    print("unnamed args: {}".format(args))
    print("keyword args: {}".format(kwargs))
# args collects the unnamed arguments, kwargs the named ones.
magic(1, 2, key="word", key2="word2")

# prints
#  unnamed args: (1, 2)
#  keyword args: {'key2': 'word2', 'key': 'word'}
def other_way_magic(x, y, z):
    """Return x + y + z."""
    return x + y + z
# The other direction: a list and a dict supply the arguments of a call.
x_y_list = [1, 2]
z_dict = {"z": 3}
print(other_way_magic(*x_y_list, **z_dict))     # 6
def doubler_correct(f):
    """works no matter what kind of inputs f expects

    Returns a function g that forwards all of its positional and keyword
    arguments to f and returns twice f's result.
    """
    def g(*args, **kwargs):
        """whatever arguments g is supplied, pass them through to f"""
        return 2 * f(*args, **kwargs)
    return g
# With pass-through arguments, the two-argument function now works.
g = doubler_correct(f2)
print(g(1, 2))  # 6
Welcome to DataSciencester!
§ There is no shortage of Python tutorials in the world. The official one is not a bad place to start.
§ The official IPython tutorial is not quite as good. You might be better off with their videos and presentations. Alternatively, Wes McKinney’s Python for Data Analysis (O’Reilly) has a really good IPython chapter.