Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
from sklearn.base import BaseEstimator
class DecisionTree(BaseEstimator):
    """CART-style decision tree for classification and regression.

    The tree is grown greedily: at each node every feature is scanned for the
    threshold that maximizes the impurity decrease Q; growth stops when a node
    has fewer than ``min_samples_split`` objects, exceeds ``max_depth``, or no
    split improves the criterion.

    Parameters
    ----------
    max_depth : float, default np.inf
        Maximum tree depth (root has depth 1).
    min_samples_split : int, default 2
        Minimum number of objects required to attempt a split.
    criterion : {'gini', 'entropy', 'variance', 'mad_median'}, default 'gini'
        Impurity measure; 'variance' and 'mad_median' imply regression.
    debug : bool, default False
        Kept for interface compatibility; currently unused.
    """

    class Tree:
        """A single tree node: either an internal split or a leaf."""

        def __init__(self):
            self.children = [None, None]  # [left (< thres), right (>= thres)]
            self.feature = None           # split feature index, internal nodes only
            self.thres = None             # split threshold, internal nodes only
            self.answer = None            # leaf prediction; None marks an internal node

        def set_feature(self, feature, thres):
            """Make this an internal node splitting on ``x[feature] < thres``."""
            self.feature = feature
            self.thres = thres

        def set_answer(self, ans, regression, n_classes):
            """Make this a leaf.

            Regression leaves store the target mean; classification leaves store
            the vector of class frequencies (empirical probabilities) in ``ans``.
            (Bug fix: the original ignored ``regression``/``n_classes`` and always
            stored a scalar mean, which predict_proba then broadcast across all
            class columns.)
            """
            if regression:
                self.answer = ans.mean()
            else:
                self.answer = np.bincount(ans.astype(int), minlength=n_classes) / ans.size

        def get_answer(self):
            return self.answer

        def get_next(self, obj):
            """Route object ``obj`` to the matching child."""
            if obj[self.feature] < self.thres:
                return self.children[0]
            return self.children[1]

    # --- impurity / error criteria -------------------------------------------
    # Defined as static methods so the estimator is self-contained: the original
    # referenced module-level gini/entropy/variance/mad_median that are not
    # defined anywhere in this file. Standard formulas are used.

    @staticmethod
    def _gini(y):
        """Gini impurity: 1 - sum_k p_k^2."""
        p = np.bincount(y.astype(int)) / y.size
        return 1.0 - np.sum(p ** 2)

    @staticmethod
    def _entropy(y):
        """Shannon entropy: -sum_k p_k log2 p_k (0 * log 0 treated as 0)."""
        p = np.bincount(y.astype(int)) / y.size
        p = p[p > 0]
        return -np.sum(p * np.log2(p))

    @staticmethod
    def _variance(y):
        """Variance of the targets (regression criterion)."""
        return np.var(y)

    @staticmethod
    def _mad_median(y):
        """Mean absolute deviation from the median (robust regression criterion)."""
        return np.mean(np.abs(y - np.median(y)))

    def __init__(self, max_depth=np.inf, min_samples_split=2,
                 criterion='gini', debug=False):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Store constructor args unchanged so BaseEstimator.get_params()/clone()
        # work (the original discarded criterion and debug).
        self.criterion = criterion
        self.debug = debug
        criteria = {'gini': self._gini, 'entropy': self._entropy,
                    'variance': self._variance, 'mad_median': self._mad_median}
        self.f_criterion = criteria[criterion]
        # variance / mad_median criteria imply a regression task
        self.regression = criterion in ('variance', 'mad_median')

    def create_partition(self, sub_sample_indices, depth):
        """Recursively grow the subtree for the objects in ``sub_sample_indices``.

        Returns the root Tree node of the grown subtree.
        """
        node = self.Tree()
        # Stopping conditions: too few samples or maximum depth exceeded.
        if sub_sample_indices.size < self.min_samples_split or depth > self.max_depth:
            node.set_answer(self.y[sub_sample_indices], self.regression, self.n_classes)
            return node
        # Pick the best (feature, threshold) pair by impurity decrease Q.
        best_thres = None  # (best_feature, best_feature_value)
        best_Q = 0.0
        sub_X = self.X[sub_sample_indices]
        for feature in range(self.n_features):
            column = sub_X[:, feature]
            # If the feature has many distinct values, examine at most ~100
            # quantile-based thresholds; otherwise try every distinct value.
            # (Bug fix: the original called an undefined `percentiles` helper.)
            feature_values = np.unique(column)
            if feature_values.size > 100:
                feature_values = np.unique(np.percentile(column, np.arange(1, 100)))
            for value in feature_values:
                mask = column < value
                left_indices = sub_sample_indices[mask]
                right_indices = sub_sample_indices[~mask]
                # A split with an empty side is useless and breaks the criteria.
                if left_indices.size == 0 or right_indices.size == 0:
                    continue
                new_Q = self.Q(left_indices, right_indices)
                if new_Q > best_Q:
                    best_Q = new_Q
                    best_thres = (feature, value)
        # Bug fix: if no split improves the criterion (e.g. a pure node), the
        # original recursed with best_thres=(0, 0), which could loop forever
        # with the default max_depth=np.inf. Make the node a leaf instead.
        if best_thres is None:
            node.set_answer(self.y[sub_sample_indices], self.regression, self.n_classes)
            return node
        best_feature, best_value = best_thres
        mask = sub_X[:, best_feature] < best_value
        node.set_feature(best_feature, best_value)
        node.children[0] = self.create_partition(sub_sample_indices[mask], depth + 1)
        node.children[1] = self.create_partition(sub_sample_indices[~mask], depth + 1)
        return node

    def Q(self, left_indices, right_indices):
        """Impurity decrease of splitting the union into left/right parts."""
        F = self.f_criterion
        all_indices = np.concatenate([left_indices, right_indices])
        return (F(self.y[all_indices])
                - left_indices.size / all_indices.size * F(self.y[left_indices])
                - right_indices.size / all_indices.size * F(self.y[right_indices]))

    def fit(self, X, y):
        """Grow the tree on objects ``X`` (n_obj x n_features) and targets ``y``."""
        X = np.asarray(X)
        y = np.asarray(y)
        self.n_features = X.shape[1]
        self.n_obj = y.size
        self.X = X
        self.y = y
        # Bug fix: n_classes was never set, so every leaf construction crashed
        # with AttributeError. Class labels are assumed to be 0..n_classes-1.
        self.n_classes = 0 if self.regression else int(y.max()) + 1
        self.root = self.create_partition(np.arange(self.n_obj), 1)
        return self

    def predict_proba(self, X):
        """Return an (n_obj, n_classes) array of leaf class frequencies.

        Classification only; each object is routed from the root to a leaf.
        (The original iterated via `tqdm`, which was never imported.)
        """
        X = np.asarray(X)
        answer = np.zeros((X.shape[0], self.n_classes))
        for i, obj in enumerate(X):
            cur_node = self.root
            # Internal nodes have answer None; descend until a leaf is reached.
            while cur_node.get_answer() is None:
                cur_node = cur_node.get_next(obj)
            answer[i, :] = cur_node.get_answer()
        return answer

    def predict(self, X):
        """Predicted target: most probable class, or the leaf mean for regression."""
        X = np.asarray(X)
        if not self.regression:
            return np.argmax(self.predict_proba(X), axis=1)
        answer = np.zeros(X.shape[0])
        for i, obj in enumerate(X):
            cur_node = self.root
            while cur_node.get_answer() is None:
                cur_node = cur_node.get_next(obj)
            answer[i] = cur_node.get_answer()
        return answer
Add Comment
Please, Sign In to add comment