import numpy as np
# Constants
FILENAME = 'ex2data1.txt'
# Dataset split ratios
TRAIN_RATIO = 0.7
CROSS_VALIDATION_RATIO = 0.1
# The test set is whatever remains
# Learning rate
ALPHA = 0.001
# Number of iterations
NUM_ITERS = 40000
# The size of a batch (when mini-batch descent is used)
BATCH_SIZE = 40
# Highest polynomial degree to add (1 adds nothing)
POLYNOMIAL = 1
# Lambda for regularization
LAMBDA = 0.5
def get_data():
    # Load the comma-separated file: every column but the last is a feature,
    # the last column is the 0/1 label
    data = np.loadtxt(FILENAME, delimiter=',')
    x = data[:, :-1]
    y = data[:, [-1]]  # fancy indexing keeps y as a column vector
    return (x, y)
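# main() below splits the data with contiguous slices, which assumes the rows
# of the file are already in random order. A minimal sketch of shuffling them
# first; shuffle_data is an assumed helper and is not called by the original flow.
def shuffle_data(x, y):
    order = np.random.permutation(np.shape(x)[0])
    return x[order, :], y[order, :]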
def normalize_data(X):
    # z-score normalization: center each feature column and scale it to unit variance
    for i in range(np.shape(X)[1]):
        std = np.std(X[:, i])
        mean = np.mean(X[:, i])
        X[:, i] = (X[:, i] - mean) / std
    return X
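# Sketch only: the same z-score normalization without the explicit column loop,
# letting numpy compute per-column means and deviations with axis=0.
def normalize_data_vectorized(X):
    return (X - np.mean(X, axis=0)) / np.std(X, axis=0)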
def add_polynomials(X):
    # For every original feature column, append its powers 2..POLYNOMIAL;
    # with POLYNOMIAL = 1 the matrix is returned unchanged
    for i in range(np.shape(X)[1]):
        newX = np.zeros((np.shape(X)[0], POLYNOMIAL - 1))
        for j in range(2, POLYNOMIAL + 1):
            newX[:, j - 2] = X[:, i] ** j
        X = np.concatenate((X, newX), 1)
    return X
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
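# Sketch only: np.exp(-x) overflows and warns for large negative inputs; one
# common guard is to clip the activation first (the bound of 500 here is an
# arbitrary assumption, well inside float64 range).
def sigmoid_stable(x):
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))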
def cost_function(X, y, theta):
    # Mean cross-entropy loss of the logistic model over all examples
    num_examples = np.shape(X)[0]
    h = sigmoid(np.dot(X, np.transpose(theta)))
    cost = -(np.dot(np.transpose(y), np.log(h)) + np.dot(np.transpose(1 - y), np.log(1 - h))) / num_examples
    return cost[0][0]
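# LAMBDA is declared above but never used by the original flow; a minimal
# sketch of an L2-regularized cost built on cost_function. The helper name is
# an assumption, and the bias weight theta[0, 0] is left unpenalized by convention.
def regularized_cost_function(X, y, theta):
    num_examples = np.shape(X)[0]
    penalty = (LAMBDA / (2 * num_examples)) * np.sum(theta[:, 1:] ** 2)
    return cost_function(X, y, theta) + penalty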
# Batch gradient descent, driven by the constants defined above
def batch_gradient(X, y, theta):
    num_examples = np.shape(X)[0]
    for i in range(NUM_ITERS):
        h = sigmoid(np.dot(X, np.transpose(theta)))
        theta = theta - (ALPHA / num_examples) * np.transpose(np.dot(np.transpose(X), (h - y)))
        if i % 10000 == 0:
            print('{}/{}'.format(i, NUM_ITERS))
            print('Cost function value: {:.10f}'.format(cost_function(X, y, theta)))
            test(X, y, theta)
    return theta
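# Sketch only: the same batch update with the L2 penalty from
# regularized_cost_function folded into the gradient (an assumed variant,
# not the original update; the bias weight is excluded from the penalty).
def regularized_batch_gradient(X, y, theta):
    num_examples = np.shape(X)[0]
    for i in range(NUM_ITERS):
        h = sigmoid(np.dot(X, np.transpose(theta)))
        grad = np.transpose(np.dot(np.transpose(X), (h - y))) / num_examples
        reg = (LAMBDA / num_examples) * theta
        reg[0, 0] = 0  # do not penalize the bias term
        theta = theta - ALPHA * (grad + reg)
    return theta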
def mini_batch_gradient(X, y, theta):
    num_examples = np.shape(X)[0]
    # Trailing examples that do not fill a whole batch are ignored
    num_batches = int(np.floor(num_examples / BATCH_SIZE))
    for i in range(NUM_ITERS):
        for j in range(num_batches):
            x = X[(j * BATCH_SIZE):((j + 1) * BATCH_SIZE), :]
            sub_y = y[(j * BATCH_SIZE):((j + 1) * BATCH_SIZE), :]
            h = sigmoid(np.dot(x, np.transpose(theta)))
            # Average the gradient over the batch, not the whole training set
            theta = theta - (ALPHA / BATCH_SIZE) * np.transpose(np.dot(np.transpose(x), (h - sub_y)))
        if i % 100 == 0:
            print('{}/{}'.format(i, NUM_ITERS))
            print('Cost function value: {:.10f}'.format(cost_function(X, y, theta)))
    return theta
def stochastic_gradient(X, y, theta):
    num_examples = np.shape(X)[0]
    for i in range(NUM_ITERS):
        for j in range(num_examples):
            x = X[j, :]
            sub_y = y[j, :]
            h = sigmoid(np.dot(x, np.transpose(theta)))
            # Per-example update: a single example's gradient needs no 1/m averaging
            theta = theta - ALPHA * (x * (h - sub_y))
        if i % 100 == 0:
            print('{}/{}'.format(i, NUM_ITERS))
            print('Cost function value: {:.10f}'.format(cost_function(X, y, theta)))
    return theta
def predict(x, theta):
    pred = sigmoid(np.dot(x, np.transpose(theta)))
    if pred >= 0.5:
        return 1
    else:
        return 0
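# Sketch only: a vectorized variant that classifies every row of X at once
# (an assumed helper, not used by the original flow).
def predict_all(X, theta):
    return (sigmoid(np.dot(X, np.transpose(theta))) >= 0.5).astype(int)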
def train(X, y):
    # Plain batch gradient descent, starting from an all-zero parameter row vector
    return batch_gradient(X, y, np.zeros((1, np.shape(X)[1])))
def test(X, y, theta):
    good = 0
    falseNegative = 0
    falsePositive = 0
    num_examples = np.shape(X)[0]
    for i in range(num_examples):
        pred = predict(X[i, :], theta)
        if pred == y[i][0]:
            good = good + 1
        elif pred == 0 and y[i][0] == 1:
            falseNegative = falseNegative + 1
        elif pred == 1 and y[i][0] == 0:
            falsePositive = falsePositive + 1
    print('Accuracy: {}\nFalse negative rate: {}\nFalse positive rate: {}\n'.format(
        good / num_examples, falseNegative / num_examples, falsePositive / num_examples))
    return good / num_examples
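# Sketch only: precision and recall derived from the same kind of counts,
# assuming the positive class is y == 1 (the helper name is an assumption,
# not part of the original script).
def precision_recall(X, y, theta):
    preds = np.array([predict(X[i, :], theta) for i in range(np.shape(X)[0])])
    labels = y[:, 0]
    true_positive = np.sum((preds == 1) & (labels == 1))
    false_positive = np.sum((preds == 1) & (labels == 0))
    false_negative = np.sum((preds == 0) & (labels == 1))
    precision = true_positive / max(true_positive + false_positive, 1)
    recall = true_positive / max(true_positive + false_negative, 1)
    return precision, recall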
def main():
    # Getting data
    x, y = get_data()
    # Normalizing data
    x = normalize_data(x)
    # Adding polynomials (done after normalization, so the added powers are not rescaled)
    x = add_polynomials(x)
    # Adding a bias column of ones in front of the features
    bias = np.ones((np.shape(x)[0], np.shape(x)[1] + 1))
    bias[:, 1:] = x
    x = bias
    # Creating the sets; the test set is whatever remains after the first two slices
    numTraining = int(np.shape(x)[0] * TRAIN_RATIO)
    numCrossVal = int(np.shape(x)[0] * CROSS_VALIDATION_RATIO)
    trainingSetX = x[0:numTraining, :]
    crossValSetX = x[numTraining:(numTraining + numCrossVal), :]
    testSetX = x[(numTraining + numCrossVal):, :]
    trainingSetY = y[0:numTraining, :]
    crossValSetY = y[numTraining:(numTraining + numCrossVal), :]
    testSetY = y[(numTraining + numCrossVal):, :]
    # Training
    thetas = train(trainingSetX, trainingSetY)
    # Testing on all three sets
    pourcentSuccessTrain = test(trainingSetX, trainingSetY, thetas)
    pourcentSuccessCrossValidation = test(crossValSetX, crossValSetY, thetas)
    pourcentSuccessTest = test(testSetX, testSetY, thetas)
    print('Success training set: {:.2f} %'.format(pourcentSuccessTrain * 100))
    print('Success crossvalidation set: {:.2f} %'.format(pourcentSuccessCrossValidation * 100))
    print('Success test set: {:.2f} %'.format(pourcentSuccessTest * 100))
if __name__ == '__main__':
    main()