import numpy as np
from sklearn.base import BaseEstimator
from tqdm import tqdm
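
# The paste references gini, entropy, variance, mad_median and percentiles
# without defining them. The sketches below are one plausible set of
# definitions (an assumption, not part of the original paste), chosen to
# match how the class calls them: each criterion takes a target vector and
# returns an impurity value; percentiles(n, values) returns candidate
# split thresholds.

def gini(y):
    # Gini impurity: 1 - sum_k p_k^2 over class frequencies p_k
    # (assumes integer class labels).
    p = np.bincount(y) / y.size
    return 1 - np.sum(p ** 2)

def entropy(y):
    # Shannon entropy: -sum_k p_k * log2(p_k), skipping empty classes.
    p = np.bincount(y) / y.size
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def variance(y):
    # Variance criterion for regression.
    return np.var(y)

def mad_median(y):
    # Mean absolute deviation from the median, a robust regression criterion.
    return np.mean(np.abs(y - np.median(y)))

def percentiles(n, values):
    # If the feature takes more than n distinct values, use n evenly spaced
    # percentiles as candidate thresholds; otherwise use the raw values.
    if np.unique(values).size > n:
        return np.percentile(values, np.linspace(0, 100, n))
    return values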


class DecisionTree(BaseEstimator):
    class Tree:
        def __init__(self):
            self.children = [None, None]
            self.feature = None
            self.thres = None
            self.answer = None

        def set_feature(self, feature, thres):
            self.feature = feature
            self.thres = thres

        def set_answer(self, ans, regression, n_classes):
            # A leaf stores the mean target for regression and the vector
            # of class frequencies for classification (predict_proba
            # expects a length-n_classes vector).
            if regression:
                self.answer = ans.mean()
            else:
                self.answer = np.bincount(ans, minlength=n_classes) / ans.size

        def get_answer(self):
            return self.answer

        def get_next(self, obj):
            # Route an object to the left child if its feature value is
            # below the threshold, otherwise to the right child.
            if obj[self.feature] < self.thres:
                return self.children[0]
            return self.children[1]

    def __init__(self, max_depth=np.inf, min_samples_split=2,
                 criterion='gini', debug=False):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.debug = debug
        criteria = {'gini': gini, 'entropy': entropy,
                    'variance': variance, 'mad_median': mad_median}
        self.f_criterion = criteria[criterion]
        # 'variance' and 'mad_median' are regression criteria; the others
        # are classification criteria.
        self.regression = criterion in ('variance', 'mad_median')

    def create_partition(self, sub_sample_indices, depth):
        node = self.Tree()
        # Check the stopping conditions.
        if sub_sample_indices.size < self.min_samples_split or depth > self.max_depth:
            node.set_answer(self.y[sub_sample_indices], self.regression, self.n_classes)
            return node
        # Pick the best split: a feature index and a threshold value.
        best_thres = (0, 0)  # (best_feature, best_feature_value)
        best_Q = 0
        for feature in range(self.n_features):
            # If the feature takes too many distinct values, use 100
            # percentiles as candidate thresholds; with fewer than 100
            # values, iterate over the values themselves.
            feature_values = np.unique(percentiles(100, self.X[sub_sample_indices, feature]))
            for value in feature_values:
                left_indices = sub_sample_indices[self.X[sub_sample_indices, feature] < value]
                right_indices = sub_sample_indices[self.X[sub_sample_indices, feature] >= value]
                new_Q = self.Q(left_indices, right_indices)
                if new_Q > best_Q:
                    best_Q = new_Q
                    best_thres = (feature, value)
        # If no split improves the criterion, make the node a leaf
        # (otherwise the recursion might never terminate when
        # max_depth is infinite).
        if best_Q == 0:
            node.set_answer(self.y[sub_sample_indices], self.regression, self.n_classes)
            return node
        # Split the sample by the threshold and recurse into the left
        # and right children.
        best_feature, best_value = best_thres
        left_indices = sub_sample_indices[self.X[sub_sample_indices, best_feature] < best_value]
        right_indices = sub_sample_indices[self.X[sub_sample_indices, best_feature] >= best_value]
        node.set_feature(best_feature, best_value)
        node.children[0] = self.create_partition(left_indices, depth + 1)
        node.children[1] = self.create_partition(right_indices, depth + 1)
        return node

    def Q(self, left_indices, right_indices):
        # Split quality functional:
        #   Q = F(X) - |X_l| / |X| * F(X_l) - |X_r| / |X| * F(X_r),
        # where F is the impurity criterion.
        if left_indices.size == 0 or right_indices.size == 0:
            return 0  # a degenerate split yields no gain
        F = self.f_criterion
        all_indices = np.concatenate([left_indices, right_indices])
        return (F(self.y[all_indices])
                - left_indices.size / all_indices.size * F(self.y[left_indices])
                - right_indices.size / all_indices.size * F(self.y[right_indices]))

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.n_obj = y.size
        self.y = y
        self.X = X
        # The number of classes is only meaningful for classification;
        # integer labels 0..K-1 are assumed here.
        self.n_classes = None if self.regression else int(y.max()) + 1
        self.root = self.create_partition(np.arange(self.n_obj), 1)
        return self

    def predict_proba(self, X):
        answer = np.zeros((X.shape[0], self.n_classes))
        # Route every object from the root down to a leaf; internal nodes
        # have answer=None, so the descent stops at the first leaf.
        for i, obj in enumerate(tqdm(X)):
            cur_node = self.root
            while cur_node.get_answer() is None:
                cur_node = cur_node.get_next(obj)
            answer[i, :] = cur_node.get_answer()
        return answer
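

# A minimal usage sketch (not part of the original paste): fit the tree on
# a small synthetic classification problem and inspect the predictions.
# The make_classification parameters below are illustrative assumptions.

if __name__ == '__main__':
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=4,
                               n_informative=3, n_redundant=0,
                               random_state=17)
    tree = DecisionTree(max_depth=5, criterion='entropy')
    tree.fit(X, y)
    proba = tree.predict_proba(X)
    pred = proba.argmax(axis=1)
    print('train accuracy:', (pred == y).mean())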