__author__ = 'Pavel Yurgin'

import numpy as np


def read_data(path):
    # Load a CSV whose last column is the class label; the first column of X is
    # a constant -1 bias feature, and the 0/1 labels are remapped to -1/+1.
    Xy = np.genfromtxt(path, delimiter=',')
    X = np.empty(Xy.shape)
    X[:, 0] = -1
    X[:, 1:] = Xy[:, :-1]
    y = Xy[:, -1]
    y[y == 0] -= 1
    return X, y


def normalize(X):
    X = (X - np.mean(X)) / np.std(X)
    return X


def log_loss(M):
    # Logistic loss of the margin M = y * <w, x> and its derivative w.r.t. M.
    return np.log2(1 + np.exp(-M)), -1 / ((np.exp(M) + 1) * np.log(2))


def sigmoid_loss(M):
    # Sigmoid loss of the margin and its derivative w.r.t. M.
    return 2 / (1 + np.exp(M)), -2 * np.exp(M) / (np.exp(M) + 1) ** 2


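# Worked example (added for illustration, not part of the original paste): both
# loss functions take a vector of margins M = y * <w, x> and return the pair
# (loss values, derivatives). For margins [-1, 0, 1] the logistic loss is
# roughly [1.89, 1.00, 0.45], i.e. largest for the misclassified object:
#
#     loss, dloss = log_loss(np.array([-1.0, 0.0, 1.0]))
#     # loss  ~ [ 1.89,  1.00,  0.45]
#     # dloss ~ [-1.06, -0.72, -0.39]

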
def euclid_distance(X, Y):
    dist = np.sqrt(np.sum((X - Y) ** 2))
    return dist


class GradientDescent:
    def __init__(self, *, alpha, threshold=1e-3, loss=sigmoid_loss):
        self.weights = []
        if alpha <= 0:
            raise ValueError("alpha should be positive")
        if threshold <= 0:
            raise ValueError("threshold should be positive")
        self.alpha = alpha
        self.threshold = threshold
        self.loss = loss

    def fit(self, X, y):
        n = X.shape[1]
        self.weights = np.random.uniform(-1 / (2 * n), 1 / (2 * n), size=n)
        errors = []
        it = 0
        while True:
            M = np.dot(X, self.weights) * y               # margins of all objects
            loss, derivative = self.loss(M)
            grad_q = np.sum((derivative.T * (X.T * y)).T, axis=0)
            tmp = self.weights - self.alpha * grad_q      # full gradient step
            errors.append(np.sum(loss))
            if euclid_distance(tmp, self.weights) < self.threshold:
                break
            self.weights = tmp
            it += 1
            if it % 10000 == 0:
                print(it, np.sum(loss))
        return errors

    def predict(self, X):
        return np.sign(np.dot(X, self.weights))


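# Minimal usage sketch (assumed, not in the original paste): GradientDescent
# does full-batch gradient descent and stops once the weight update is smaller
# than `threshold`.
#
#     clf = GradientDescent(alpha=1e-2, threshold=1e-3, loss=sigmoid_loss)
#     history = clf.fit(X, y)      # summed loss per iteration
#     labels = clf.predict(X)      # predictions in {-1, +1}

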
class SGD:
    def __init__(self, *, alpha, loss=log_loss, k=1, n_iter=100):
        if alpha <= 0:
            raise ValueError("alpha should be positive")
        if k <= 0 or not isinstance(k, int):
            raise ValueError("k should be a positive integer")
        if n_iter <= 0 or not isinstance(n_iter, int):
            raise ValueError("n_iter should be a positive integer")
        self.k = k
        self.n_iter = n_iter
        self.alpha = alpha
        self.loss = loss

    def fit(self, X, y):
        n = X.shape[1]
        self.weights = np.random.uniform(-1 / (2 * n), 1 / (2 * n), size=n)
        errors = []
        eta = 1 / len(X)
        q = self.loss(np.dot(X, self.weights) * y)[0].sum()  # initial estimate of the loss functional
        for i in range(self.n_iter):
            idx = np.random.choice(X.shape[0], size=self.k)  # random mini-batch of size k
            x = X[idx]
            sub_y = y[idx]
            M = np.dot(x, self.weights) * sub_y
            loss, derivative = self.loss(M)
            grad_q = np.sum((derivative.T * (x.T * sub_y)).T, axis=0)
            self.weights = self.weights - self.alpha * grad_q
            q = (1 - eta) * q + eta * np.sum(loss)           # exponential moving average of the loss
            errors.append(q)
        return errors

    def predict(self, X):
        return np.sign(np.dot(X, self.weights))


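# Usage sketch (illustrative, not in the original paste): with k=1 this is plain
# stochastic gradient descent; larger k averages the gradient over a random
# mini-batch, and the returned list holds the smoothed loss per iteration.
#
#     clf = SGD(alpha=1e-2, loss=log_loss, k=10, n_iter=1000)
#     history = clf.fit(X, y)
#     labels = clf.predict(X)

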
def get_precision_and_recall(y_pred, y_test, c):
    TP = len([True for i, j in zip(y_pred, y_test) if i == j and j == c])
    FP = len([True for i, j in zip(y_pred, y_test) if i == c and j != c])
    FN = len([True for i, j in zip(y_pred, y_test) if i != c and j == c])

    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / (TP + FP)

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / (TP + FN)

    return precision, recall


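# Worked example (added for illustration): for y_pred = [1, 1, -1, -1] and
# y_test = [1, -1, -1, 1] with c = 1 there is one true positive, one false
# positive and one false negative, so precision = recall = 1/2:
#
#     get_precision_and_recall([1, 1, -1, -1], [1, -1, -1, 1], 1)  # -> (0.5, 0.5)

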
def print_precision_recall(y_pred, y_test):
    classes = np.unique(y_test)
    for c in classes:
        precision, recall = get_precision_and_recall(y_pred, y_test, c)
        print(c, precision, recall)


def train_test_split(X, y, ratio):
    # Sort by label and cut off the first `ratio` fraction as the test set.
    sorted_args = np.argsort(y)
    X = X[sorted_args]
    y = y[sorted_args]
    bound = int(len(X) * ratio)
    X_test, y_test = X[:bound], y[:bound]
    X_train, y_train = X[bound:], y[bound:]
    return X_test, y_test, X_train, y_train

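# Hypothetical end-to-end sketch (not in the original script): evaluating on a
# held-out split produced by train_test_split.
#
#     X_test, y_test, X_train, y_train = train_test_split(X, y, ratio=0.3)
#     clf = GradientDescent(alpha=1e-2)
#     clf.fit(X_train, y_train)
#     print_precision_recall(clf.predict(X_test), y_test)

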
def test_algorithm(X, y):
    alpha = [1e-6, 1e-4, 1e-2, 1]
    loss_functions = [log_loss, sigmoid_loss]
    for loss in loss_functions:
        for a in alpha:
            gd = GradientDescent(alpha=a, loss=loss)
            q = gd.fit(X, y)


if __name__ == '__main__':
    X, y = read_data("pima-indians-diabetes.csv")
    X = normalize(X)
    gd = GradientDescent(alpha=1e-2, threshold=1e-3, loss=sigmoid_loss)
    # err = gd.fit(X, y)
    # print(err)
    # pred = gd.predict(X)
    # print(pred)
    # print(y)
    # err = gd.fit(X, y)
    # print(err)
    # pred = gd.predict(X)
    # print(pred)
    # print(get_precision_and_recall(pred, y, 1))
    # print(get_precision_and_recall(pred, y, -1))

    sgd = SGD(alpha=1e-2, k=10, n_iter=1000)
    err = sgd.fit(X, y)
    print(err)
    pred = sgd.predict(X)
    print(get_precision_and_recall(pred, y, 1))
    print(get_precision_and_recall(pred, y, -1))