# Untitled: regression decision tree built with NumPy/SciPy, plus demo plots.

import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import minimize_scalar

from collections import namedtuple


# Internal decision node: `feature` is the column index that is tested and
# `value` the threshold; `left` / `right` are the child subtrees and
# `impurity` is the score of the split that produced them.
Node = namedtuple('Node', 'feature value impurity left right')
# Terminal node: `value` is the prediction, `x` / `y` are the samples stored
# with the leaf.
Leaf = namedtuple('Leaf', 'value x y')
# Example:
#   Node(feature=0, value=2, impurity=0.1,
#        left=Leaf(value=0.0, x=..., y=...), right=Leaf(value=1.0, x=..., y=...))


def partition(x, y, feature, value):
    """Split the samples on one feature column at a threshold.

    Rows whose ``x[:, feature]`` entry is ``>= value`` form the right part;
    every other row forms the left part.

    Returns ``((x_left, y_left), (x_right, y_right))``.
    """
    right_mask = x[:, feature] >= value
    # The left side is defined as the complement of the right side, so a row
    # whose comparison evaluates to False always lands on the left.
    left_mask = ~right_mask
    left_part = (x[left_mask], y[left_mask])
    right_part = (x[right_mask], y[right_mask])
    return left_part, right_part

def criteria(y):
    """Return the population variance of the target values ``y``.

    This is the per-node impurity measure: it is 0 when every target in the
    node is identical and grows as the targets spread out.

    For an empty ``y`` NumPy returns ``nan`` (and emits a RuntimeWarning).
    """
    # np.var computes the variance directly. Squaring np.std would take a
    # square root only to undo it again, adding needless rounding error.
    return np.var(y)


def impurity(y_left, y_right):
    """Return the size-weighted mean of the two sides' ``criteria`` values.

    Parameters
    ----------
    y_left, y_right : arrays of target values on each side of a split.

    Returns
    -------
    ``(n_left * criteria(y_left) + n_right * criteria(y_right)) / (n_left + n_right)``.
    An empty side carries zero weight and is skipped. If both sides are
    empty the result is undefined and ``nan`` is returned.
    """
    size = y_left.size + y_right.size
    if size == 0:
        return np.nan
    # An empty side must be skipped explicitly: its criteria is nan and
    # 0 * nan is still nan, which would poison the whole sum.
    weighted = sum(
        part.size * criteria(part)
        for part in (y_left, y_right)
        if part.size
    )
    # NOTE: by the variance decomposition this equals
    #   var(y) - n_left * n_right * (mean_left - mean_right)**2 / size**2
    # so minimising it maximises the separation of the two side means.
    return weighted / size


def f(value, feature, x, y):
    """Objective for the threshold search.

    Splits ``(x, y)`` on ``feature`` at ``value`` and returns the impurity of
    the resulting pair of target arrays. ``value`` comes first so the
    function can be handed to a scalar minimiser with the rest as ``args``.
    """
    left_part, right_part = partition(x, y, feature, value)
    # Only the targets matter for the score; the feature rows are dropped.
    return impurity(left_part[1], right_part[1])


def find_best_split(x, y):
    """Search every feature for the threshold with the lowest split impurity.

    For each feature column a bounded scalar minimisation of ``f`` is run
    over the open interval between the column's smallest and largest value.
    Any threshold strictly inside that interval leaves at least one sample
    on each side of the split.

    Parameters
    ----------
    x : 2-D array, one row per sample and one column per feature.
    y : 1-D array of targets, one per row of ``x``.

    Returns
    -------
    ``(feature, value, impurity)`` of the best split found.

    Raises
    ------
    ValueError
        If no feature can be split (no samples, no features, or every
        feature is constant or contains NaN).
    RuntimeError
        If the scalar optimiser reports failure for a feature.

    NOTE: the objective is piecewise constant in the threshold, because the
    partition only changes when the threshold crosses a sample value. The
    optimiser therefore returns a good threshold, not a guaranteed optimum.
    """
    best_feature, best_value, best_impurity = None, None, np.inf
    n_samples = x.shape[0]
    for feature in range(x.shape[1]):
        if n_samples == 0:
            break
        column = x[:, feature]
        lower, upper = column.min(), column.max()
        # A constant column (or one containing NaN) offers no threshold that
        # separates the samples, so it is skipped instead of being optimised
        # over a degenerate interval.
        if not lower < upper:
            continue
        result = minimize_scalar(
            f,
            args=(feature, x, y),
            method='bounded',
            bounds=(lower, upper),
        )
        # Raise explicitly; an assert would be stripped under ``python -O``.
        if not result.success:
            raise RuntimeError(
                'threshold search failed for feature %d: %s'
                % (feature, getattr(result, 'message', 'unknown error'))
            )
        if result.fun < best_impurity:
            best_feature, best_value, best_impurity = (
                feature, result.x, result.fun)
    if best_feature is None:
        raise ValueError('no feature can be split: every feature is constant '
                         'or there are no samples')
    return best_feature, best_value, best_impurity


def build_tree(x, y, depth=1, max_depth=np.inf):
    """Recursively grow a regression tree on ``(x, y)``.

    Parameters
    ----------
    x : 2-D array, one row per sample and one column per feature.
    y : 1-D array of targets, one per row of ``x``.
    depth : depth of the node being built (the root is 1).
    max_depth : nodes at this depth or deeper become leaves.

    Returns
    -------
    A ``Leaf`` holding the mean target and the node's samples, or a ``Node``
    holding the chosen split and its two subtrees.

    A node becomes a leaf when any of these holds:

    * the depth limit is reached,
    * the targets are (almost) constant, i.e. ``criteria(y) < 1e-6``,
    * no split is possible (fewer than two samples, or no feature whose
      values differ),
    * the chosen split leaves one side empty.
    """
    if depth >= max_depth or criteria(y) < 1e-6:
        return Leaf(np.mean(y), x, y)

    # Identical feature rows with differing targets cannot be separated by
    # any threshold, so stop here instead of searching for a split.
    can_split = x.shape[0] > 1 and bool(np.any(x.max(axis=0) > x.min(axis=0)))
    if not can_split:
        return Leaf(np.mean(y), x, y)

    feature, value, split_impurity = find_best_split(x, y)
    (x_left, y_left), (x_right, y_right) = partition(x, y, feature, value)

    # Safety net: with an empty side the other child would receive the very
    # same data again and the recursion would never terminate.
    if y_left.size == 0 or y_right.size == 0:
        return Leaf(np.mean(y), x, y)

    left = build_tree(x_left, y_left, depth + 1, max_depth)
    right = build_tree(x_right, y_right, depth + 1, max_depth)

    return Node(feature, value, split_impurity, left, right)

def predict(tree, x):
    """Return the tree's prediction for every row of ``x``.

    Each row is routed from the root down to a terminal node: at a ``Node``
    the row goes right when ``row[node.feature] >= node.value`` and left
    otherwise. The ``value`` of the terminal node is the prediction.

    The result is a float array with one entry per row of ``x``.
    """
    def terminal_for(sample):
        # Walk down until the current object is no longer an internal Node.
        current = tree
        while isinstance(current, Node):
            goes_right = sample[current.feature] >= current.value
            current = current.right if goes_right else current.left
        return current

    predictions = np.empty(x.shape[0])
    for position, sample in enumerate(x):
        predictions[position] = terminal_for(sample).value
    return predictions

""" y = 2*x0 +1 """
# Experiment 1: noisy linear target that depends on the first feature only.
n = 100

# Training data: standard-normal features, target plus Gaussian noise.
x = np.random.normal(loc=0, scale=1, size=(n, 2))
y_true = 1 + 2 * x[:, 0]
y = y_true + np.random.normal(loc=0, scale=0.5, size=n)

# Independent test data with noise-free targets.
x_test = np.random.normal(loc=0, scale=1, size=(n, 2))
y_test = 1 + 2 * x_test[:, 0]

# Fit the tree on the noisy training set and evaluate it on the test set.
tree = build_tree(x, y)
y_pred = predict(tree, x_test)

# Circles: noisy training targets vs. true values.
# Triangles: predictions vs. true test values.
plt.plot(y_true, y, 'o')
plt.plot(y_test, y_pred, 'v')


"""  y = 1 + 2*x0 + 3*x1**2     """

# Experiment 2: noisy quadratic target.
# The target fitted here is y = 2*x0**2 + x1**2 + 1 (the banner string
# above lists a different formula).
plt.figure()
plt.xscale('log')
plt.yscale('log')
n = 10000
rs = np.random.RandomState(1)
x = rs.normal(0, 1, size=(n, 2))
y_true = 2 * x[:, 0]**2 + 1 + x[:, 1]**2
# Draw the noise from the seeded generator too, so that the whole
# experiment is reproducible rather than only the features.
y = y_true + rs.normal(0, 0.5, n)
tree = build_tree(x, y)
plt.plot(y_true, y, 'o')
# NOTE(review): the test features are centred at 2 while the training
# features are centred at 0 -- confirm that this shift is intended.
x_test = rs.normal(2, 1, size=(n, 2))
y_test = 2 * x_test[:, 0]**2 + 1 + x_test[:, 1]**2
y_pred = predict(tree, x_test)
plt.plot(y_test, y_pred, 'v')
# Spread of the prediction residuals. np.std's second positional parameter
# is `axis`, so the two arrays must be combined before the call.
print(np.std(y_test - y_pred))
# Identity line for reference: perfect predictions would lie on it.
plt.plot(plt.xlim(), plt.xlim(), 'k', lw=0.5)


#
## straight line: classes separated by x0 > 0
#n=1000
#x = np.random.normal(0, 1, size=(n, 2))
#y = np.asarray(x[:, 0] > 0, dtype=int)
#tree = build_tree(x, y)
#COLORS = np.array([[1.,0.,0.], [0.,0.,1.]])
#plt.scatter(*x.T, color=COLORS[y])
#x_test = np.random.normal(0, 1, size=(n, 2))
#y_pred = predict(tree, x_test).astype(int)
##plt.scatter(*x_test.T, color=COLORS[y_pred], marker='v', s=50)
#
#
## circle: class 1 lies inside the unit circle
#plt.figure(figsize=(5,5))
#plt.xlim([-2,2])
#plt.ylim([-2,2])
#n=1000
#x = np.random.normal(0, 1, size=(n, 2))
#y = np.asarray(x[:,0]**2 + x[:,1]**2 <= 1, dtype=int)
#tree = build_tree(x, y)
#COLORS = np.array([[1.,0.,0.], [0.,0.,1.]])
#plt.scatter(*x.T, color=COLORS[y])
#x_test = np.random.normal(0, 1, size=(n, 2))
#y_pred = predict(tree, x_test).astype(int)
##plt.scatter(*x_test.T, color=COLORS[y_pred], marker='v', s=50)