Spaces:
Sleeping
Sleeping
Commit
·
46a9203
1
Parent(s):
67098ed
other file
Browse files
- __pycache__/tree_decision.cpython-39.pyc +0 -0
- tree.pkl +3 -0
- tree_decision.py +82 -0
__pycache__/tree_decision.cpython-39.pyc
ADDED
Binary file (3 kB). View file
|
|
tree.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:008b57f8c2a42cb84690bca33c476c93be99c4c53fc3354fdcbb1a6f7268d7f3
|
3 |
+
size 1092
|
tree_decision.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
def compute_gini(y):
    """Return the Gini impurity of the class labels in *y*.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    samples carrying label ``k``.

    Args:
        y: 1-D array-like of non-negative class labels. Values are cast to
            ``int`` before counting, so float-encoded labels such as ``1.0``
            are accepted.

    Returns:
        The impurity as a float; ``0.0`` for a pure or an empty label set.
    """
    labels = np.asarray(y).astype(int)
    if labels.size == 0:
        # An empty partition contains no mixed labels. Without this guard the
        # formula below evaluates to 1, the worst possible impurity.
        return 0.0
    proportions = np.bincount(labels) / labels.size
    return 1.0 - np.sum(proportions ** 2)
|
7 |
+
|
8 |
+
def split_node(feature, y):
    """Find the threshold on one feature that minimises weighted Gini impurity.

    Candidate thresholds are the midpoints between consecutive *distinct*
    feature values, so repeated values are evaluated once and a threshold
    never coincides with a duplicated sample value.

    Args:
        feature: 1-D array-like of numeric values, one per sample.
        y: 1-D array-like of class labels aligned with *feature*.

    Returns:
        A ``(threshold, gini)`` tuple for the best split, where samples with
        ``feature <= threshold`` form the left partition. ``(None, inf)`` is
        returned when no threshold separates the samples (fewer than two
        distinct feature values).
    """
    feature = np.asarray(feature)
    y = np.asarray(y)
    m = len(y)
    best_gini = float("inf")
    best_average = None

    # np.unique returns the distinct values already sorted.
    distinct_values = np.unique(feature)
    for lower, upper in zip(distinct_values[:-1], distinct_values[1:]):
        average = (lower + upper) / 2
        y_left = y[feature <= average]
        y_right = y[feature > average]
        if len(y_left) == 0 or len(y_right) == 0:
            # A split that leaves one side empty separates nothing; this can
            # only happen when the midpoint is not strictly between the two
            # values (e.g. floating-point rounding or NaN).
            continue
        gini = (
            (len(y_left) / m) * compute_gini(y_left)
            + (len(y_right) / m) * compute_gini(y_right)
        )
        # Strict comparison keeps the lowest threshold among equal scores.
        if gini < best_gini:
            best_gini = gini
            best_average = average
    return best_average, best_gini
|
24 |
+
|
25 |
+
class Node:
    """A node of a binary decision tree.

    An internal node stores the feature it tests and the threshold
    (``branch``) for that test; its first child handles values at or below
    the threshold and its second child handles larger values. A leaf node
    stores the predicted ``value`` and has ``is_leaf`` set to ``True``.
    """

    def __init__(self, feature=None, branch=None, value=None):
        """Create a non-leaf node with no children.

        Args:
            feature: Key used to look the tested feature up in a sample dict.
            branch: Threshold the feature value is compared against.
            value: Prediction returned when the node is a leaf.
        """
        self.feature = feature
        self.branch = branch
        self.node_children = []
        self.is_leaf = False
        self.value = value

    def __str__(self):
        """Return a one-line, human-readable summary of the node."""
        return f"Feature: {self.feature}, Branch: {self.branch}, Value: {self.value}, Leaf: {self.is_leaf}"

    def add_child(self, node):
        """Append *node* as the next child (left first, then right)."""
        self.node_children.append(node)

    def set_leaf(self, value):
        """Set the ``is_leaf`` flag to *value*."""
        self.is_leaf = value

    def search(self, x_dict):
        """Return the prediction for one sample.

        Args:
            x_dict: Mapping from feature name to the sample's feature value.

        Returns:
            The ``value`` of the leaf reached by following the thresholds.
        """
        if self.is_leaf:
            return self.value
        # Use `<=` so a value equal to the threshold goes left, matching the
        # `<=` comparison used when the tree is built; `<` would send such
        # samples down the opposite branch from the one they were trained on.
        child_index = 0 if x_dict[self.feature] <= self.branch else 1
        return self.node_children[child_index].search(x_dict)
|
49 |
+
|
50 |
+
def _majority_leaf(y):
    """Build a leaf that predicts the most frequent (int-cast) label in *y*."""
    leaf = Node(value=np.bincount(y.astype(int)).argmax())
    leaf.set_leaf(True)
    return leaf


def construct_decision_tree(x, y, feature_names):
    """Recursively build a binary decision tree using Gini impurity.

    Args:
        x: 2-D array-like of shape ``(n_samples, n_features)``.
        y: 1-D array-like of non-negative class labels, one per row of *x*.
        feature_names: Sequence (list or array) of feature names, indexed
            like the columns of *x*; the chosen name is stored on each
            internal node.

    Returns:
        The root ``Node`` of the tree. Samples whose feature value is
        ``<=`` a node's threshold are handled by its first child, the rest
        by its second child.

    Raises:
        ValueError: If *y* is empty.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(y) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # Pure node: every sample carries the same label.
    if len(np.unique(y)) == 1:
        leaf = Node(value=y[0])
        leaf.set_leaf(True)
        return leaf
    if len(feature_names) == 0:
        return _majority_leaf(y)

    split_values_gini = [split_node(x[:, i], y) for i in range(x.shape[1])]
    ginis = [g for _, g in split_values_gini]

    # Try features from lowest to highest impurity (stable sort, so ties go
    # to the lowest column index) and take the first split that actually
    # separates the samples. Accepting a split that leaves one side empty
    # would recurse forever on an unchanged dataset.
    for feature_index in np.argsort(ginis, kind="stable"):
        split_value = split_values_gini[feature_index][0]
        if split_value is None:
            continue
        goes_left = x[:, feature_index] <= split_value
        if goes_left.all() or not goes_left.any():
            continue

        node = Node(feature=feature_names[feature_index], branch=split_value)
        node.add_child(
            construct_decision_tree(x[goes_left], y[goes_left], feature_names)
        )
        node.add_child(
            construct_decision_tree(x[~goes_left], y[~goes_left], feature_names)
        )
        return node

    # No feature can separate the remaining samples (e.g. identical rows
    # with different labels): fall back to the majority class.
    return _majority_leaf(y)
|