-
Notifications
You must be signed in to change notification settings - Fork 122
/
Copy pathid3.py
126 lines (111 loc) · 4.67 KB
/
id3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from __future__ import print_function
import numpy as np
import pandas as pd
class TreeNode(object):
    """A single node of an ID3 decision tree.

    Internal nodes carry ``split_attribute`` and ``order`` (the attribute
    values mapped, position by position, onto ``children``); leaf nodes
    carry a ``label``.
    """

    def __init__(self, ids=None, children=None, entropy=0, depth=0):
        # Bug fix: the original signature used a mutable default
        # (children=[]), which is evaluated once and shared by every
        # TreeNode constructed without an explicit children argument —
        # appending to one node's children mutated them all.
        self.ids = ids                # indices of training points in this node
        self.entropy = entropy        # entropy of the node, filled by caller
        self.depth = depth            # distance from the root node
        self.split_attribute = None   # which attribute is chosen, if non-leaf
        self.children = children if children is not None else []  # child nodes
        self.order = None             # values of split_attribute per child
        self.label = None             # label of node if it is a leaf

    def set_properties(self, split_attribute, order):
        """Record the splitting attribute and its child-value order."""
        self.split_attribute = split_attribute
        self.order = order

    def set_label(self, label):
        """Mark this node as a leaf with the given label."""
        self.label = label
def entropy(freq):
    """Return the Shannon entropy (natural log) of a vector of counts.

    :param freq: 1-D sequence of non-negative counts (ndarray, list, tuple).
        Generalized from the original, which fancy-indexed ``freq`` directly
        and therefore crashed on plain Python sequences.
    :return: entropy of the normalized distribution, as a numpy float.

    Zero counts are removed before normalizing, so 0 * log(0) is treated
    as 0 (the standard convention).
    """
    counts = np.asarray(freq, dtype=float)
    # keep only the non-zero counts before normalizing
    counts = counts[counts.nonzero()[0]]
    probs = counts / counts.sum()
    return -np.sum(probs * np.log(probs))
class DecisionTreeID3(object):
    """ID3 decision tree classifier for categorical attributes.

    Trains on a pandas DataFrame of attributes and a pandas Series of
    labels.  Splits are chosen greedily by maximum information gain
    (natural-log entropy).

    NOTE(review): the code converts 0-based positions to pandas index
    labels with ``i + 1`` throughout — it assumes the DataFrame/Series
    integer index starts at 1 (as in the bundled weather.csv).  Confirm
    before using with other datasets.
    """

    def __init__(self, max_depth=10, min_samples_split=2, min_gain=1e-4):
        # max_depth: nodes at this depth are not split further
        # min_samples_split: reject a split if any child would receive
        #   fewer points than this
        # min_gain: minimum information gain required to accept a split
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.Ntrain = 0  # number of training points, set in fit()
        self.min_gain = min_gain

    def fit(self, data, target):
        """Grow the tree from `data` (DataFrame) and `target` (Series).

        Processes a stack of open nodes; a node becomes a leaf (labeled
        by majority vote) when _split() finds no acceptable split.
        """
        self.Ntrain = data.count()[0]
        self.data = data
        self.attributes = list(data)  # attribute (column) names
        self.target = target
        self.labels = target.unique()

        ids = range(self.Ntrain)  # 0-based positions of all training points
        self.root = TreeNode(ids=ids, entropy=self._entropy(ids), depth=0)
        queue = [self.root]
        while queue:
            node = queue.pop()
            # NOTE(review): `or node.entropy < self.min_gain` looks inverted —
            # it *allows* splitting nearly-pure nodes rather than stopping
            # them.  _split() independently rejects low-gain splits, which
            # masks this, but confirm the intended condition.
            if node.depth < self.max_depth or node.entropy < self.min_gain:
                node.children = self._split(node)
                if not node.children:  # leaf node
                    self._set_label(node)
                queue += node.children
            else:
                self._set_label(node)

    def _entropy(self, ids):
        """Entropy of the label distribution over the points in `ids`.

        `ids` are 0-based positions; they are shifted by +1 to address
        the pandas Series (index assumed to start from 1 — TODO confirm).
        """
        if len(ids) == 0:
            return 0
        ids = [i + 1 for i in ids]  # panda series index starts from 1
        freq = np.array(self.target[ids].value_counts())
        return entropy(freq)

    def _set_label(self, node):
        # Find the label for a leaf node, chosen simply by majority voting.
        target_ids = [i + 1 for i in node.ids]  # target is a series variable
        node.set_label(self.target[target_ids].mode()[0])  # most frequent label

    def _split(self, node):
        """Choose the best attribute to split `node` on.

        Returns a list of child TreeNodes, or [] when no attribute yields
        gain >= min_gain with every child holding at least
        min_samples_split points.  Records the winning attribute and its
        value order on `node` via set_properties.
        """
        ids = node.ids
        best_gain = 0
        best_splits = []
        best_attribute = None
        order = None
        sub_data = self.data.iloc[ids, :]
        for i, att in enumerate(self.attributes):
            values = self.data.iloc[ids, i].unique().tolist()
            if len(values) == 1:
                continue  # only one value -> entropy 0, no gain possible
            splits = []
            for val in values:
                # index labels of the rows taking this attribute value
                sub_ids = sub_data.index[sub_data[att] == val].tolist()
                # convert 1-based index labels back to 0-based positions
                splits.append([sub_id - 1 for sub_id in sub_ids])
            # don't split if a node has too small number of points
            if min(map(len, splits)) < self.min_samples_split:
                continue
            # information gain = H(node) - weighted average child entropy
            HxS = 0
            for split in splits:
                HxS += len(split) * self._entropy(split) / len(ids)
            gain = node.entropy - HxS
            if gain < self.min_gain:
                continue  # stop if small gain
            if gain > best_gain:
                best_gain = gain
                best_splits = splits
                best_attribute = att
                order = values
        node.set_properties(best_attribute, order)
        child_nodes = [TreeNode(ids=split,
                                entropy=self._entropy(split),
                                depth=node.depth + 1) for split in best_splits]
        return child_nodes

    def predict(self, new_data):
        """
        :param new_data: a new dataframe, each row is a datapoint
        :return: predicted labels for each row

        NOTE(review): an attribute value unseen during training raises
        ValueError from order.index(); there is no fallback branch.
        """
        npoints = new_data.count()[0]
        labels = [None] * npoints
        for n in range(npoints):
            x = new_data.iloc[n, :]  # one point
            # start from root and descend until a leaf is reached
            node = self.root
            while node.children:
                # follow the child whose attribute value matches x
                node = node.children[node.order.index(x[node.split_attribute])]
            labels[n] = node.label
        return labels
if __name__ == "__main__":
    # Demo: fit the tree on the classic weather dataset, then echo its
    # predictions for the training points themselves.
    frame = pd.read_csv('weather.csv', index_col=0, parse_dates=True)
    features, labels = frame.iloc[:, :-1], frame.iloc[:, -1]
    model = DecisionTreeID3(max_depth=3, min_samples_split=2)
    model.fit(features, labels)
    print(model.predict(features))