Advertisement
Guest User

Untitled

a guest
Apr 29th, 2017
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.34 KB | None | 0 0
  1. import numpy as np
  2.  
  3. # My functions
  4. def variance_reduction(node_var, node_cardinality, child_groups):
  5. # After we split the data based on the each candidate split-point
  6. # of each attribute, we return the bucketed y-values, as a tuple.
  7.  
  8. left, right = child_groups[0], child_groups[1]
  9. left_var = np.var(left)
  10. left_size = len(left)
  11.  
  12. right_var = np.var(right)
  13. right_size = len(right)
  14.  
  15. vr = node_var - (left_size * left_var + right_size * right_var) / node_cardinality
  16. return vr
  17.  
  18.  
  19.  
  20. # My functions
  21.  
  22.  
  23. def generate_test_data(N):
  24. # np.random.seed(1)
  25. x = np.random.randn(N, 5)
  26. y = np.where(x[:, 0] > 0, 2, 5)
  27. y = y + np.where(x[:, 1] > 0, -3, 3)
  28. y = y + np.where(x[:, 2] > 0, 0, 0.5)
  29. y = y + np.random.randn(N)*100
  30. return x,y
  31.  
  32. class TreeNode:
  33. def predict(x, y):
  34. assert False
  35.  
  36. def depth(self):
  37. assert False
  38.  
  39. class BranchNode(TreeNode):
  40. def __init__(self, left, right, split_var_index, split_var_value):
  41. self.left = left
  42. self.right = right
  43. self.split_var_index = split_var_index
  44. self.split_var_value = split_var_value
  45.  
  46. def predict(self, x):
  47. svar = x[:, self.split_var_index]
  48. is_left = svar < self.split_var_value
  49. leftx = x[is_left]
  50. rightx = x[~is_left]
  51.  
  52. rv = np.zeros(x.shape[0])
  53. rv[is_left] = self.left.predict(leftx)
  54. rv[~is_left] = self.right.predict(rightx)
  55.  
  56. return rv
  57.  
  58. def depth(self):
  59. return 1 + max(self.left.depth(), self.right.depth())
  60.  
  61. class LeafNode(TreeNode):
  62. def __init__(self, mu):
  63. self.mu = mu
  64.  
  65. def predict(self, x):
  66. return np.repeat(self.mu, x.shape[0])
  67.  
  68. def depth(self):
  69. return 1
  70.  
  71. class RegressionTree:
  72. def __init__(self, max_depth, min_points_in_leaf):
  73. self.max_depth = max_depth
  74. self.min_points_in_leaf = min_points_in_leaf
  75.  
  76. def predict(self, x):
  77. assert self.fitted
  78. return self.root.predict(x)
  79.  
  80. def fit(self, x, y):
  81. self.fitted = True
  82. self.root = self.fit_internal(x, y, 1)
  83.  
  84. def fit_internal(self, x, y, current_depth):
  85. # implement this
  86. num_features = x.shape[1]
  87. num_rows = x.shape[0]
  88. var_orig = np.var(y)
  89.  
  90. if current_depth == self.max_depth:
  91. return LeafNode(np.mean(y))
  92.  
  93. best_variable = None
  94.  
  95. candidate_split_index = np.linspace(0, num_rows-1, 6, dtype = int)
  96.  
  97. data = np.concatenate((x,y),axis = 1)
  98.  
  99. # Here, we have to loop over all features and figure out which one
  100. # might be splittable, and if it is, how to split it to maximize Variance Reduction
  101.  
  102. b_attribute_index, b_value, b_vr = None, 999, 0
  103. for i in range(num_features):
  104. # a lot of code goes here
  105. svar = data[:,i]
  106. cardinality = len(svar)
  107.  
  108. for index in candidate_split_index:
  109. is_left = svar < svar[index]
  110. left_child = data[is_left]
  111. right_child = data[~is_left]
  112.  
  113. vr = variance_reduction(var_orig, cardinality, (left_child[:,num_features], right_child[:,num_features]))
  114.  
  115. if vr > b_vr and vr < var_orig:
  116. b_attribute_index = i
  117. b_value = data[index][i]
  118. b_vr = vr
  119.  
  120.  
  121. best_variable = b_attribute_index
  122. if best_variable is None:
  123. return LeafNode(np.mean(y))
  124. else:
  125. is_left = data[:,best_variable] < b_value
  126. left_child_data = data[is_left]
  127. right_child_data = data[~is_left]
  128.  
  129. left_number_rows = left_child_data.shape[0]
  130. right_number_rows = right_child_data.shape[0]
  131.  
  132. if current_depth + 1 >= self.max_depth:
  133. return LeafNode(np.mean(y))
  134. else:
  135. if left_number_rows > self.min_points_in_leaf and right_number_rows <= self.min_points_in_leaf:
  136. #return BranchNode(....) FILL THIS IN
  137. left_child_x = np.reshape(left_child_data[:,0:num_features],(left_number_rows, num_features))
  138. left_child_y = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
  139.  
  140. # right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
  141. # right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
  142.  
  143. # return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
  144. # self.fit_internal(right_child_x, right_child_y, current_depth + 1),
  145. # best_variable, b_value)
  146. final_right_child = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
  147. return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
  148. LeafNode(np.mean(final_right_child)),
  149. best_variable, b_value)
  150. if left_number_rows <= self.min_points_in_leaf and right_number_rows > self.min_points_in_leaf:
  151. right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
  152. right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
  153. final_left_child = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
  154. return BranchNode(LeafNode(np.mean(final_left_child)),
  155. self.fit_internal(right_child_x, right_child_y, current_depth + 1),
  156. best_variable, b_value)
  157. if left_number_rows <= self.min_points_in_leaf and right_number_rows <= self.min_points_in_leaf:
  158. final_left_child = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
  159. final_right_child = np.reshape(right_child_data[:, num_features], (right_number_rows, 1))
  160. return BranchNode(LeafNode(np.mean(final_left_child)),LeafNode(np.mean(final_right_child)),
  161. best_variable, b_value)
  162. else:
  163. left_child_x = np.reshape(left_child_data[:,0:num_features],(left_number_rows, num_features))
  164. left_child_y = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
  165.  
  166. right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
  167. right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
  168.  
  169. return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
  170. self.fit_internal(right_child_x, right_child_y, current_depth + 1),
  171. best_variable, b_value)
  172.  
  173. def depth(self):
  174. return self.root.depth()
  175.  
  176. def calculate_prediction_error(self, x, observed_y):
  177.  
  178. predicted_y = self.root.predict(x)
  179. no_rows = predicted_y.shape[0]
  180. predicted_y = np.reshape(predicted_y, (no_rows, 1))
  181. return 1- np.sum((predicted_y - observed_y)**2)/np.sum((observed_y - np.mean(observed_y))**2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement