Decision Trees - Data Science from Scratch: First Principles with Python (2015)

# Data Science from Scratch: First Principles with Python (2015)

### Chapter 17. Decision Trees

A tree is an incomprehensible mystery.

Jim Woodring

# What Is a Decision Tree?

§ “I am thinking of an animal.”

§ “Does it have more than five legs?”

§ “No.”

§ “Is it delicious?”

§ “No.”

§ “Does it appear on the back of the Australian five-cent coin?”

§ “Yes.”

§ “Is it an echidna?”

§ “Yes, it is!”

# Entropy

def entropy(class_probabilities):
    """given a list of class probabilities, compute the entropy"""
    total = 0.0
    for p in class_probabilities:
        if p:                               # ignore zero probabilities
            total -= p * math.log(p, 2)
    return total
def class_probabilities(labels):
    """turn a list of labels into a list of class probabilities
    (one entry per distinct label, in no particular order)"""
    n = len(labels)
    counts = Counter(labels)
    return [c / n for c in counts.values()]

def data_entropy(labeled_data):
    """compute the entropy of the labels in labeled_data,
    where each element is a pair (input, label)"""
    return entropy(class_probabilities([label for _, label in labeled_data]))

# The Entropy of a Partition

def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets
    subsets is a list of lists of labeled data"""
    total_count = sum(len(subset) for subset in subsets)

    # weighted average of each subset's entropy, weighted by its size
    weighted_entropy = 0.0
    for subset in subsets:
        weighted_entropy += data_entropy(subset) * len(subset) / total_count
    return weighted_entropy
# Creating a Decision Tree
# labeled hiring data: each input is (candidate attribute dict, did_interview_well)
inputs = [
({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'no'},    False),
({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'yes'},   False),
({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'no'},      True),
({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'no'},   True),
({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'no'},       True),
({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'yes'},     False),
({'level':'Mid', 'lang':'R', 'tweets':'yes', 'phd':'yes'},         True),
({'level':'Senior', 'lang':'Python', 'tweets':'no', 'phd':'no'},  False),
({'level':'Senior', 'lang':'R', 'tweets':'yes', 'phd':'no'},       True),
({'level':'Junior', 'lang':'Python', 'tweets':'yes', 'phd':'no'},  True),
({'level':'Senior', 'lang':'Python', 'tweets':'yes', 'phd':'yes'}, True),
({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'yes'},     True),
({'level':'Mid', 'lang':'Java', 'tweets':'yes', 'phd':'no'},       True),
({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, False)
]

§ If the data all have the same label, then create a leaf node that predicts that label and then stop.

§ If the list of attributes is empty (i.e., there are no more possible questions to ask), then create a leaf node that predicts the most common label and then stop.

§ Otherwise, try partitioning the data by each of the attributes

§ Choose the partition with the lowest partition entropy

§ Add a decision node based on the chosen attribute

§ Recur on each partitioned subset using the remaining attributes

def partition_by(inputs, attribute):
    """each input is a pair (attribute_dict, label).
    returns a dict : attribute_value -> inputs"""
    groups = defaultdict(list)
    for item in inputs:
        attribute_value = item[0][attribute]  # value of the chosen attribute
        groups[attribute_value].append(item)  # group inputs by that value
    return groups
def partition_entropy_by(inputs, attribute):
    """computes the entropy corresponding to the given partition"""
    # partition the inputs by the attribute, then take the
    # size-weighted entropy over the resulting groups
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values())
# show the partition entropy of each candidate attribute
# (Python 2 print statement, as in the book's original code)
for key in ['level','lang','tweets','phd']:
    print key, partition_entropy_by(inputs, key)

# level 0.693536138896
# lang 0.860131712855
# tweets 0.788450457308
# phd 0.892158928262
# restrict to the Senior candidates and look for the best next split
senior_inputs = [(input, label)
                 for input, label in inputs if input["level"] == "Senior"]

for key in ['lang', 'tweets', 'phd']:
    print key, partition_entropy_by(senior_inputs, key)

# lang 0.4
# tweets 0.0
# phd 0.950977500433
# Putting It All Together

§ True

§ False

§ a tuple (attribute, subtree_dict)

# example of the tree representation: an interior node is a tuple
# (attribute, subtree_dict); a leaf is simply True or False
('level',
 {'Junior': ('phd', {'no': True, 'yes': False}),
  'Mid': True,
  'Senior': ('tweets', {'no': False, 'yes': True})})
def classify(tree, input):
    """classify the input using the given decision tree"""
    node = tree

    # walk down the tree until we reach a boolean leaf
    while node not in [True, False]:
        # an interior node is (attribute, dict of attribute_value -> subtree)
        attribute, subtree_dict = node

        key = input.get(attribute)       # None if input is missing attribute
        if key not in subtree_dict:      # unknown value? fall back to the
            key = None                   # None (default) subtree

        node = subtree_dict[key]

    return node
def build_tree_id3(inputs, split_candidates=None):
    """build an ID3 decision tree from labeled inputs.
    each input is a pair (attribute_dict, boolean label).
    returns either a boolean leaf or a tuple (attribute, subtree_dict)."""

    # if this is our first pass,
    # all keys of the first input are split candidates
    # (materialize as a list so this works on Python 3's dict_keys too)
    if split_candidates is None:
        split_candidates = list(inputs[0][0].keys())

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = sum(1 for _, label in inputs if label)
    num_falses = num_inputs - num_trues

    if num_trues == 0: return False     # no Trues? return a "False" leaf
    if num_falses == 0: return True     # no Falses? return a "True" leaf

    if not split_candidates:            # if no split candidates left
        return num_trues >= num_falses  # return the majority leaf

    # otherwise, split on the attribute with the lowest partition entropy
    best_attribute = min(split_candidates,
                         key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates
                      if a != best_attribute]

    # recursively build the subtrees
    # (.items() works on both Python 2 and 3; .iteritems() was Py2-only)
    subtrees = { attribute_value : build_tree_id3(subset, new_candidates)
                 for attribute_value, subset in partitions.items() }

    subtrees[None] = num_trues > num_falses      # default case for
                                                 # unseen attribute values
    return (best_attribute, subtrees)
# build a tree from the full dataset and try it on new candidates
tree = build_tree_id3(inputs)

classify(tree, { "level" : "Junior",
                 "lang" : "Java",
                 "tweets" : "yes",
                 "phd" : "no"} )        # True

classify(tree, { "level" : "Junior",
                 "lang" : "Java",
                 "tweets" : "yes",
                 "phd" : "yes"} )       # False

# unseen / missing attribute values fall through to the None subtree
classify(tree, { "level" : "Intern" } ) # True
classify(tree, { "level" : "Senior" } ) # False
# Random Forests
def forest_classify(trees, input):
    """classify input by majority vote over an ensemble of decision trees"""
    votes = [classify(tree, input) for tree in trees]
    vote_counts = Counter(votes)   # bug fix: vote_counts was never defined
    return vote_counts.most_common(1)[0][0]
# NOTE(review): excerpt from inside a random-forest tree builder's method —
# `self`, `inputs`, and `split_candidates` come from the enclosing (unseen)
# scope, so this fragment does not run on its own.
# if there's already few enough split candidates, look at all of them
if len(split_candidates) <= self.num_split_candidates:
    sampled_split_candidates = split_candidates
# otherwise pick a random sample
else:
    sampled_split_candidates = random.sample(split_candidates,
                                             self.num_split_candidates)

# now choose the best attribute only from those candidates
best_attribute = min(sampled_split_candidates,
                     key=partial(partition_entropy_by, inputs))

partitions = partition_by(inputs, best_attribute)

# For Further Exploration

§ scikit-learn has many Decision Tree models. It also has an ensemble module that includes a RandomForestClassifier as well as other ensemble methods.

§ We barely scratched the surface of decision trees and their algorithms. Wikipedia is a good starting point for broader exploration.

﻿