Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import numpy as np
from scipy import optimize
def read_data(dataset='learn.txt', max_lines=150):
    """Load a whitespace-separated dataset of floats.

    Per line: column 0 is skipped (presumably a row id — TODO confirm
    against the data files), column 1 is the label, columns 2+ are the
    features.

    Args:
        dataset: path to the data file.
        max_lines: cap on rows read. The original checked ``lines > 150``
            after the increment and therefore read 151 rows — an
            off-by-one; the cap is now exact.

    Returns:
        (x, y): features as an (m, cols) array and labels as (m, 1).
    """
    features = []
    labels = []
    rows = 0
    cols = 0
    with open(dataset) as f:
        for line in f:
            values = [float(v) for v in line.split()]
            if not values:
                continue  # skip blank lines (original raised IndexError)
            labels.append(values[1])
            row = values[2:]
            features.extend(row)
            cols = len(row)
            rows += 1
            if rows >= max_lines:
                break
    return np.reshape(features, (rows, cols)), np.reshape(labels, (rows, 1))
def target(theta, x, y, sign=1.0):
    """SVM dual objective, optionally negated.

    Computes sign * (sum_i theta_i - 1/2 * sum_{i,j} theta_i theta_j
    y_i y_j <x_i, x_j>).  Called with sign=-1.0 so that minimizing this
    maximizes the dual.

    Fix: the original assigned ``res = ...`` inside the double loop,
    overwriting the accumulator and keeping only the (m-1, m-1) term;
    ``jacobian`` accumulates the matching sum, confirming ``+=`` is the
    intended behavior.
    """
    m = len(y)
    res = 0.0
    for i in range(m):
        for j in range(m):
            res += theta[i] * theta[j] * y[i] * y[j] * (np.dot(x[i].T, x[j]))
    return sign * (np.sum(theta) - res / 2.0)
def jacobian(theta, x, y, sign=1.0):
    """Gradient of the dual objective w.r.t. theta.

    Component i is sign * (1 - sum_j theta_j y_i y_j <x_i, x_j>);
    samples with label 0 contribute a constant partial of 1.
    """
    m = len(y)
    grad = []
    for i in range(m):
        partial = 1.0
        if y[i] != 0:
            for j in range(m):
                partial -= theta[j] * y[i] * y[j] * np.dot(x[i].T, x[j])
        grad.append(sign * partial)
    return np.array(grad)
def constrains(y, c):
    """SLSQP equality constraint for the dual: sum_i theta_i y_i == 0.

    The constraint jacobian of theta·y is simply y.T.  ``c`` is accepted
    for signature symmetry with ``bounds`` but unused here.
    """
    return {
        'type': 'eq',
        'fun': lambda theta: np.dot(theta.T, y),
        'jac': lambda theta: y.T,
    }
def bounds(m, c):
    """Box constraints 0 <= theta_i <= c for each of the m multipliers."""
    return tuple((0, c) for _ in range(m))
def optimize_dual(x, y, c):
    """Maximize the SVM dual with SLSQP (by minimizing its negation).

    Returns the multipliers theta reshaped to an (m, 1) column.
    """
    m = len(y)
    print("Optimization begin")
    result = optimize.minimize(
        target,
        np.zeros((m, 1)),
        args=(x, y, -1.0),  # sign=-1.0 turns the maximization into a minimization
        method='SLSQP',
        jac=jacobian,
        bounds=bounds(m, c),
        constraints=constrains(y, c),
        options={'disp': True},
    )
    return np.reshape(result.x, (len(result.x), 1))
def solve_straight(theta, x, y):
    """Recover the primal solution (intercept, weights) from the dual.

    w = sum_i theta_i y_i x_i; the intercept comes from the first
    sample: b0 = y[0] - w·x[0].

    Fix: the original computed ``b0 = w·x0 - y0`` — the negation of the
    standard intercept — flipping the decision function's offset.
    NOTE(review): strictly, b0 should be taken from a sample with
    0 < theta_i < c (an on-margin support vector); using x[0]
    unconditionally is an assumption to confirm.

    Returns:
        (n+1, 1) column: intercept stacked on top of the weights.
    """
    m = len(y)
    w = np.zeros((1, x.shape[1]))
    for i in range(m):
        w += theta[i] * y[i] * x[i]
    b0 = y[0] - np.dot(w, x[0].T)  # was np.dot(...) - y[0]: wrong sign
    return np.vstack((b0, w.T))
def svm(x, y, c):
    """Train a binary SVM: solve the dual, then recover (intercept, weights)."""
    multipliers = optimize_dual(x, y, c)
    return solve_straight(multipliers, x, y)
def svm_hypothesis(beta, x):
    """Sign of the decision value x·beta: +1.0 if strictly positive, else -1.0."""
    return 1.0 if np.dot(x, beta) > 0 else -1.0
# one vs all
def multi_svm(x, y, c):
    """Train one one-vs-all binary SVM per class; returns a (classes, n) matrix.

    Fix: samples outside class i were labelled 0.0, but the dual needs
    labels in {-1, +1} — with y_j == 0 every term of that sample vanishes
    from the objective and the equality constraint (``jacobian`` even
    special-cases it), so the "rest" never acted as negative examples.
    They are labelled -1.0 now.

    NOTE(review): svm() returns x.shape[1] + 1 coefficients (intercept
    plus one weight per column); only the first x.shape[1] are copied,
    exactly as the original did — confirm whether dropping the trailing
    weight is intentional.
    """
    cnt_classes = count_classes(y)
    betas = np.zeros((cnt_classes, x.shape[1]))
    for i in range(cnt_classes):
        yt = np.zeros(y.shape)
        for j in range(len(y)):
            yt[j] = 1.0 if int(y[j]) == i else -1.0  # was 0.0 — see docstring
        print("Left: ", cnt_classes - i)
        beta = svm(x, yt, c)
        for j in range(len(betas[i])):
            betas[i][j] = beta[j]
    return betas
def predict_multi_svm(x, betas):
    """Predict a class per row of x from the one-vs-all coefficient matrix.

    Fix: the original assigned the LAST class whose sign-hypothesis was
    non-negative (and defaulted to class 0 when none fired), which is
    arbitrary whenever several one-vs-all classifiers claim a sample.
    Standard one-vs-all picks the class with the largest raw decision
    value x[i]·betas[c], so argmax over the margins is used instead.

    Returns:
        1-D float array of length x.shape[0] holding class indices.
    """
    num = x.shape[0]
    p = np.zeros(num)
    for i in range(num):
        scores = np.dot(betas, x[i])  # one decision value per class
        p[i] = int(np.argmax(scores))
    return p
def count_classes(y):
    """Number of classes, assuming labels are 0..max (TODO confirm dense labels).

    Fix: the original returned ``max(y) + 1`` — with a float ndarray that
    is a float (or a 1-element array for the (m, 1) labels produced by
    read_data), which breaks ``range(...)`` in multi_svm and
    ``np.zeros(...)`` in cohen.  Cast to a plain int.
    """
    return int(np.max(y)) + 1
def normalize_params(x):
    """Per-column mean and (population) standard deviation of x."""
    return x.mean(axis=0), x.std(axis=0)
def normalize(x, means, stds):
    """Standardize x column-by-column, in place.

    Columns with zero std are left untouched (the caller drops them
    separately via drop_with_zero_dev).  Returns the mutated x.
    """
    for col in range(x.shape[1]):
        sigma = stds[col]
        if sigma > 0:
            x[:, col] = (x[:, col] - means[col]) / sigma
    return x
def add_ones(x):
    """Prepend a column of ones (bias feature) to x."""
    bias = np.ones((x.shape[0], 1))
    return np.concatenate((bias, x), axis=1)
def drop_with_zero_dev(x, stds):
    """Remove columns of x whose standard deviation is zero (constant features)."""
    zero_cols = [i for i in range(x.shape[1]) if stds[i] == 0]
    if not zero_cols:
        return x  # nothing to drop; keep the original array, as before
    return np.delete(x, zero_cols, 1)
def cohen(p, y):
    """Cohen's kappa agreement between predictions p and true labels y.

    kappa = (pr_a - pr_e) / (1 - pr_e), where pr_a is the observed
    agreement rate and pr_e the chance agreement computed from the two
    marginal class histograms.

    Fix: the histograms were indexed with ``p[i]`` directly — p is
    produced by np.zeros, so its entries are floats, and NumPy rejects
    float indices (IndexError).  Indices are now cast to int.
    NOTE(review): still divides by zero when pr_e == 1 (degenerate
    single-class case) — decide on a sentinel if that can occur.
    """
    cnt = len(p)
    agreement = 0.0
    classes = count_classes(y)
    info_p = np.zeros(classes)
    info_y = np.zeros(classes)
    for i in range(cnt):
        if p[i] == y[i]:
            agreement += 1
        info_p[int(p[i])] += 1  # int(): predictions are stored as floats
        info_y[int(y[i])] += 1
    f_cnt2 = float(cnt) * cnt
    pr_a = agreement / float(cnt)
    pr_e = 0.0
    for i in range(classes):
        pr_e += info_p[i] * info_y[i] / f_cnt2
    return (pr_a - pr_e) / (1 - pr_e)
def summarize_svm(learn_x, learn_y, test_x, test_y):
    """Train one-vs-all SVMs on the learn split and report Cohen's kappa on test."""
    model = multi_svm(learn_x, learn_y, c=1.0)
    preds = predict_multi_svm(test_x, model)
    print("Cohen's kappa: " + str(cohen(preds, test_y)))
# --- Script entry: load data, standardize, train, evaluate. ---
# NOTE(review): runs at import time; consider an `if __name__ == "__main__":` guard.
learn_x, learn_y = read_data()
test_x, test_y = read_data('test.txt')
# Standardize both splits using the LEARN split's statistics only.
means, stds = normalize_params(learn_x)
learn_x = normalize(learn_x, means, stds)
test_x = normalize(test_x, means, stds)
# Constant columns (zero std) were left unscaled by normalize — drop them from both splits.
learn_x = drop_with_zero_dev(learn_x, stds)
test_x = drop_with_zero_dev(test_x, stds)
print('Optimization begin:')
# Bias column is prepended AFTER normalization so it is not centered away.
learn_x = add_ones(learn_x)
test_x = add_ones(test_x)
summarize_svm(learn_x, learn_y, test_x, test_y)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement