Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import numpy as np
- # My functions
def variance_reduction(node_var, node_cardinality, child_groups):
    """Return the drop in variance achieved by a candidate split.

    node_var is the parent node's y-variance and node_cardinality its
    row count; child_groups is a (left_y, right_y) tuple holding the
    bucketed y-values produced by the split.
    """
    left_y, right_y = child_groups
    # Size-weighted sum of the child variances.
    weighted_child_var = sum(len(g) * np.var(g) for g in (left_y, right_y))
    return node_var - weighted_child_var / node_cardinality
- # My functions
def generate_test_data(N):
    """Generate N synthetic rows for testing the tree.

    x is (N, 5) standard normal; y is a step function of the signs of
    the first three columns of x, drowned in strong Gaussian noise.
    """
    x = np.random.randn(N, 5)
    signal = (
        np.where(x[:, 0] > 0, 2, 5)
        + np.where(x[:, 1] > 0, -3, 3)
        + np.where(x[:, 2] > 0, 0, 0.5)
    )
    noise = np.random.randn(N) * 100
    return x, signal + noise
class TreeNode:
    """Abstract base for regression-tree nodes.

    Subclasses (branch and leaf nodes) must implement predict and depth.
    Fixes vs. original: predict was declared as predict(x, y) — missing
    self and inconsistent with the subclass signature predict(self, x);
    `assert False` placeholders are stripped under `python -O`, so the
    abstract methods now raise NotImplementedError instead.
    """

    def predict(self, x):
        """Return per-row predictions for x; implemented by subclasses."""
        raise NotImplementedError

    def depth(self):
        """Return the depth of the subtree rooted here; implemented by subclasses."""
        raise NotImplementedError
class BranchNode(TreeNode):
    """Internal node: routes rows left or right on one feature threshold.

    Rows with feature `split_var_index` strictly below `split_var_value`
    go to the left subtree; all others go right.
    """

    def __init__(self, left, right, split_var_index, split_var_value):
        self.left = left
        self.right = right
        self.split_var_index = split_var_index
        self.split_var_value = split_var_value

    def predict(self, x):
        """Predict each row of x by delegating to the matching child."""
        goes_left = x[:, self.split_var_index] < self.split_var_value
        out = np.zeros(x.shape[0])
        out[goes_left] = self.left.predict(x[goes_left])
        out[~goes_left] = self.right.predict(x[~goes_left])
        return out

    def depth(self):
        """One more than the deeper of the two subtrees."""
        return 1 + max(self.left.depth(), self.right.depth())
class LeafNode(TreeNode):
    """Terminal node that predicts a single constant value."""

    def __init__(self, mu):
        # Constant prediction (the mean target of the rows in this leaf).
        self.mu = mu

    def predict(self, x):
        """Every row of x receives the leaf's constant prediction."""
        return np.full(x.shape[0], self.mu)

    def depth(self):
        """A leaf contributes a depth of one."""
        return 1
class RegressionTree:
    """Binary regression tree grown greedily by variance reduction.

    max_depth caps the recursion depth; min_points_in_leaf is the
    smallest child size for which a subtree is grown further (smaller
    children become mean-valued leaves).
    """

    def __init__(self, max_depth, min_points_in_leaf):
        self.max_depth = max_depth
        self.min_points_in_leaf = min_points_in_leaf

    def predict(self, x):
        # Requires fit() to have been called first (otherwise this is an
        # AttributeError on self.fitted, not an assertion failure).
        assert self.fitted
        return self.root.predict(x)

    def fit(self, x, y):
        # NOTE(review): fit_internal concatenates x and y along axis 1,
        # which requires y to be an (N, 1) column vector — but
        # generate_test_data above returns a 1-D y. Confirm callers
        # reshape before fitting.
        self.fitted = True
        self.root = self.fit_internal(x, y, 1)

    def fit_internal(self, x, y, current_depth):
        """Recursively grow and return the subtree for (x, y)."""
        num_features = x.shape[1]
        num_rows = x.shape[0]
        var_orig = np.var(y)
        # Depth cutoff: emit a leaf predicting the mean target.
        if current_depth == self.max_depth:
            return LeafNode(np.mean(y))
        best_variable = None
        # Candidate thresholds are the feature values found at 6 evenly
        # spaced row positions of the (unsorted) data matrix.
        candidate_split_index = np.linspace(0, num_rows-1, 6, dtype = int)
        data = np.concatenate((x,y),axis = 1)
        # Scan every (feature, candidate threshold) pair and keep the one
        # with the largest variance reduction.
        b_attribute_index, b_value, b_vr = None, 999, 0
        for i in range(num_features):
            svar = data[:,i]
            cardinality = len(svar)
            for index in candidate_split_index:
                # Rows strictly below the threshold go left.
                is_left = svar < svar[index]
                left_child = data[is_left]
                right_child = data[~is_left]
                # Column num_features of `data` holds the y-values.
                vr = variance_reduction(var_orig, cardinality, (left_child[:,num_features], right_child[:,num_features]))
                # NOTE(review): the `vr < var_orig` guard rejects splits
                # whose reduction reaches the full node variance —
                # presumably to skip degenerate (one-sided) splits;
                # confirm intent. b_value's 999 sentinel is only used
                # before any split is accepted.
                if vr > b_vr and vr < var_orig:
                    b_attribute_index = i
                    b_value = data[index][i]
                    b_vr = vr
        best_variable = b_attribute_index
        if best_variable is None:
            # No candidate split improved variance: stop with a leaf.
            return LeafNode(np.mean(y))
        else:
            # Re-partition the data on the winning feature/threshold.
            is_left = data[:,best_variable] < b_value
            left_child_data = data[is_left]
            right_child_data = data[~is_left]
            left_number_rows = left_child_data.shape[0]
            right_number_rows = right_child_data.shape[0]
            if current_depth + 1 >= self.max_depth:
                # Children would sit at max depth; emit a leaf instead.
                return LeafNode(np.mean(y))
            else:
                # Case 1: only the left child is large enough to keep
                # growing; the right child becomes a mean-valued leaf.
                if left_number_rows > self.min_points_in_leaf and right_number_rows <= self.min_points_in_leaf:
                    left_child_x = np.reshape(left_child_data[:,0:num_features],(left_number_rows, num_features))
                    left_child_y = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    final_right_child = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
                    return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
                                      LeafNode(np.mean(final_right_child)),
                                      best_variable, b_value)
                # Case 2: only the right child keeps growing.
                if left_number_rows <= self.min_points_in_leaf and right_number_rows > self.min_points_in_leaf:
                    right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
                    right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
                    final_left_child = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    return BranchNode(LeafNode(np.mean(final_left_child)),
                                      self.fit_internal(right_child_x, right_child_y, current_depth + 1),
                                      best_variable, b_value)
                # Case 3: both children are too small — two leaves.
                if left_number_rows <= self.min_points_in_leaf and right_number_rows <= self.min_points_in_leaf:
                    final_left_child = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    final_right_child = np.reshape(right_child_data[:, num_features], (right_number_rows, 1))
                    return BranchNode(LeafNode(np.mean(final_left_child)),LeafNode(np.mean(final_right_child)),
                                      best_variable, b_value)
                # Case 4 (both children large enough): recurse on both.
                else:
                    left_child_x = np.reshape(left_child_data[:,0:num_features],(left_number_rows, num_features))
                    left_child_y = np.reshape(left_child_data[:,num_features],(left_number_rows,1))
                    right_child_x = np.reshape(right_child_data[:,0:num_features],(right_number_rows, num_features))
                    right_child_y = np.reshape(right_child_data[:,num_features],(right_number_rows,1))
                    return BranchNode(self.fit_internal(left_child_x, left_child_y, current_depth + 1),
                                      self.fit_internal(right_child_x, right_child_y, current_depth + 1),
                                      best_variable, b_value)

    def depth(self):
        """Depth of the fitted tree (a lone leaf counts as 1)."""
        return self.root.depth()

    def calculate_prediction_error(self, x, observed_y):
        """Return 1 - SS_res/SS_tot (the R^2 score) on (x, observed_y).

        Despite the name, larger is better. observed_y is expected as an
        (N, 1) column so the reshape below aligns the two arrays.
        """
        predicted_y = self.root.predict(x)
        no_rows = predicted_y.shape[0]
        predicted_y = np.reshape(predicted_y, (no_rows, 1))
        return 1- np.sum((predicted_y - observed_y)**2)/np.sum((observed_y - np.mean(observed_y))**2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement