Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import numpy as np
- # My functions
def variance_reduction(node_var, node_cardinality, child_groups):
    """Return the drop in variance achieved by a candidate split.

    node_var is the parent node's y-variance and node_cardinality its
    row count; child_groups is a (left_y, right_y) tuple holding the
    bucketed y-values produced by the split.
    """
    left_y, right_y = child_groups
    # Size-weighted sum of the child variances.
    weighted_child_var = sum(len(g) * np.var(g) for g in (left_y, right_y))
    return node_var - weighted_child_var / node_cardinality
- # My functions
def generate_test_data(N):
    """Generate N synthetic rows for testing the tree.

    x is (N, 5) standard normal; y is a step function of the signs of
    the first three columns of x, drowned in strong Gaussian noise.
    """
    x = np.random.randn(N, 5)
    signal = (
        np.where(x[:, 0] > 0, 2, 5)
        + np.where(x[:, 1] > 0, -3, 3)
        + np.where(x[:, 2] > 0, 0, 0.5)
    )
    noise = np.random.randn(N) * 100
    return x, signal + noise
class TreeNode:
    """Abstract base for regression-tree nodes.

    Subclasses (branch and leaf nodes) must implement predict and depth.
    Fixes vs. original: predict was declared as predict(x, y) — missing
    self and inconsistent with the subclass signature predict(self, x);
    `assert False` placeholders are stripped under `python -O`, so the
    abstract methods now raise NotImplementedError instead.
    """

    def predict(self, x):
        """Return per-row predictions for x; implemented by subclasses."""
        raise NotImplementedError

    def depth(self):
        """Return the depth of the subtree rooted here; implemented by subclasses."""
        raise NotImplementedError
class BranchNode(TreeNode):
    """Internal node: routes rows left or right on one feature threshold.

    Rows with feature `split_var_index` strictly below `split_var_value`
    go to the left subtree; all others go right.
    """

    def __init__(self, left, right, split_var_index, split_var_value):
        self.left = left
        self.right = right
        self.split_var_index = split_var_index
        self.split_var_value = split_var_value

    def predict(self, x):
        """Predict each row of x by delegating to the matching child."""
        goes_left = x[:, self.split_var_index] < self.split_var_value
        out = np.zeros(x.shape[0])
        out[goes_left] = self.left.predict(x[goes_left])
        out[~goes_left] = self.right.predict(x[~goes_left])
        return out

    def depth(self):
        """One more than the deeper of the two subtrees."""
        return 1 + max(self.left.depth(), self.right.depth())
class LeafNode(TreeNode):
    """Terminal node that predicts a single constant value."""

    def __init__(self, mu):
        # Constant prediction (the mean target of the rows in this leaf).
        self.mu = mu

    def predict(self, x):
        """Every row of x receives the leaf's constant prediction."""
        return np.full(x.shape[0], self.mu)

    def depth(self):
        """A leaf contributes a depth of one."""
        return 1
class RegressionTree:
    """Binary regression tree grown greedily by variance reduction.

    max_depth caps the recursion depth; min_points_in_leaf is the
    smallest child size for which a subtree is grown further (smaller
    children become mean-valued leaves).
    """

    def __init__(self, max_depth, min_points_in_leaf):
        self.max_depth = max_depth
        self.min_points_in_leaf = min_points_in_leaf

    def predict(self, x):
        # Requires fit() to have been called first (otherwise this is an
        # AttributeError on self.fitted, not an assertion failure).
        assert self.fitted
        return self.root.predict(x)

    def fit(self, x, y):
        # NOTE(review): fit_internal concatenates x and y along axis 1,
        # which requires y to be an (N, 1) column vector — but
        # generate_test_data above returns a 1-D y. Confirm callers
        # reshape before fitting.
        self.fitted = True
        self.root = self.fit_internal(x, y, 1)

    def fit_internal(self, x, y, current_depth):
        """Recursively grow and return the subtree for (x, y)."""
        num_features = x.shape[1]
        num_rows = x.shape[0]
        var_orig = np.var(y)
        # Depth cutoff: emit a leaf predicting the mean target.
        if current_depth == self.max_depth:
            return LeafNode(np.mean(y))
        best_variable = None
        # Candidate thresholds are the feature values found at 6 evenly
        # spaced row positions of the (unsorted) data matrix.
        candidate_split_index = np.linspace(0, num_rows-1, 6, dtype = int)
        data = np.concatenate((x,y),axis = 1)
        # Scan every (feature, candidate threshold) pair and keep the one
        # with the largest variance reduction.
        b_attribute_index, b_value, b_vr = None, 999, 0
        for i in range(num_features):
            svar = data[:,i]
            cardinality = len(svar)
            for index in candidate_split_index:
                # Rows strictly below the threshold go left.
                is_left = svar < svar[index]
                left_child = data[is_left]
                right_child = data[~is_left]
                # Column num_features of `data` holds the y-values.
                vr = variance_reduction(var_orig, cardinality, (left_child[:,num_features], right_child[:,num_features]))
                # NOTE(review): the `vr < var_orig` guard rejects splits
                # whose reduction reaches the full node variance —
                # presumably to skip degenerate (one-sided) splits;
                # confirm intent. b_value's 999 sentinel is only used
                # before any split is accepted.
                if vr > b_vr and vr < var_orig:
                    b_attribute_index = i
                    b_value = data[index][i]
                    b_vr = vr
        best_variable = b_attribute_index
        if best_variable is None:
            # No candidate split improved variance: stop with a leaf.
            return LeafNode(np.mean(y))
        else:
            # Re-partition the data on the winning feature/threshold.
            is_left = data[:,best_variable] < b_value
            left_child_data = data[is_left]
            right_child_data = data[~is_left]
            left_number_rows = left_child_data.shape[0]
            right_number_rows = right_child_data.shape[0]
            if current_depth + 1 >= self.max_depth:
                # Children would sit at max depth; emit a leaf instead.
                return LeafNode(np.mean(y))
            else:
                # Case 1: only the left child is large enough to keep
                # growing; the right child becomes a mean-valued leaf.
                if left_number_rows > self.min_points_in_leaf and right_number_rows <= self.min_points_in_leaf:
                    left_child_x = np.reshape(left_child_data[:,0:num_features],(left_number_rows, num_features))
                    left_child_y = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    final_right_child = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
                    return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
                                      LeafNode(np.mean(final_right_child)),
                                      best_variable, b_value)
                # Case 2: only the right child keeps growing.
                if left_number_rows <= self.min_points_in_leaf and right_number_rows > self.min_points_in_leaf:
                    right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
                    right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
                    final_left_child = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    return BranchNode(LeafNode(np.mean(final_left_child)),
                                      self.fit_internal(right_child_x, right_child_y, current_depth + 1),
                                      best_variable, b_value)
                # Case 3: both children are too small — two leaves.
                if left_number_rows <= self.min_points_in_leaf and right_number_rows <= self.min_points_in_leaf:
                    final_left_child = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    final_right_child = np.reshape(right_child_data[:, num_features], (right_number_rows, 1))
                    return BranchNode(LeafNode(np.mean(final_left_child)),LeafNode(np.mean(final_right_child)),
                                      best_variable, b_value)
                # Case 4 (both children large enough): recurse on both.
                else:
                    left_child_x = np.reshape(left_child_data[:,0:num_features],(left_number_rows, num_features))
                    left_child_y = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
                    right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
                    return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
                                      self.fit_internal(right_child_x, right_child_y, current_depth + 1),
                                      best_variable, b_value)

    def depth(self):
        """Depth of the fitted tree (a lone leaf counts as 1)."""
        return self.root.depth()

    def calculate_prediction_error(self, x, observed_y):
        """Return 1 - SS_res/SS_tot (the R^2 score) on (x, observed_y).

        Despite the name, larger is better. observed_y is expected as an
        (N, 1) column so the reshape below aligns the two arrays.
        """
        predicted_y = self.root.predict(x)
        no_rows = predicted_y.shape[0]
        predicted_y = np.reshape(predicted_y, (no_rows, 1))
        return 1- np.sum((predicted_y - observed_y)**2)/np.sum((observed_y - np.mean(observed_y))**2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement