# SVM / Gaussian Naive Bayes error-curve experiments (Pastebin boilerplate removed).
import re

import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# ---------------------------------------------------------------------------
# Error-curve experiments: polynomial-kernel SVM and Gaussian Naive Bayes.
#
# Expects two data files, ``positive.dat`` and ``negative.dat``, with one
# example per line in a libSVM-like format: a label character followed by
# " index:value" pairs.  Depending on ``option`` the script plots:
#   0 - test error vs. the SVM ``gamma`` parameter, one curve per train size
#   1 - test error vs. training-set size at a fixed gamma
#   2 - polynomial SVM vs. Gaussian Naive Bayes across training sizes
# ---------------------------------------------------------------------------

option = 2  # 0: error vs gamma, 1: error vs size, 2: bayesian


def parse_line(line):
    """Turn one raw data line into a list of float feature values.

    ``line[1:-2]`` drops the leading label character and the two trailing
    characters (space + newline); the remainder splits on " index:" markers.
    """
    return [float(tok) for tok in re.split(r' [0-9]+\:', line[1:-2])[1:]]


def load_vectors(path):
    """Read every example in *path* as a list of float feature vectors."""
    # ``with`` closes the handle (the original leaked open files).
    with open(path) as handle:
        return [parse_line(line) for line in handle]


def error_rate(clf, pos_test, neg_test):
    """Return the fraction of held-out examples *clf* misclassifies.

    Positives are labelled +1 and negatives -1, so a -1 prediction on a
    positive example is a false negative and a +1 prediction on a negative
    example is a false positive.

    NOTE: the original computed ``(FN / n + FP) / n`` -- a misplaced
    parenthesis that effectively reported only the false-positive rate.
    """
    false_neg = int(np.sum(clf.predict(pos_test) == -1))
    false_pos = int(np.sum(clf.predict(neg_test) == 1))
    return (false_neg + false_pos) / float(len(pos_test) + len(neg_test))


def make_split(positive, negative, size):
    """Use the first ``size // 2`` examples of each class for training.

    Returns ``(X, y, pos_test, neg_test)``; the test sets are the remaining
    examples of each class.
    """
    half = size // 2  # explicit integer division (Python 2 relied on ``/``)
    X = np.concatenate((positive[:half], negative[:half]))
    y = np.concatenate((np.full(half, 1), np.full(half, -1)))
    return X, y, positive[half:], negative[half:]


def main():
    positive = load_vectors("positive.dat")
    negative = load_vectors("negative.dat")

    if option == 0:
        # One error-vs-gamma curve per training-set size.
        sizes = [1000, 2000, 3000, 4000, 5000]
        colors = ['b-', 'g-', 'r-', 'c-', 'm-']
        gammas = np.arange(0.000001, 0.0005, 0.00002)
        for size, color in zip(sizes, colors):
            X, y, pos_test, neg_test = make_split(positive, negative, size)
            errors = []
            for gamma in gammas:
                clf = SVC(kernel='poly', gamma=gamma)
                clf.fit(X, y)
                errors.append(error_rate(clf, pos_test, neg_test))
            plt.plot(gammas, errors, color,
                     label=str(size) + ' min: ' + str(min(errors))
                           + ' at gamma: ' + str(gammas[np.argmin(errors)]))
        plt.xlabel('Gamma')
        plt.ylabel('Error')
        plt.title('Error vs training set size and gamma')
        plt.legend()
        plt.show()
    elif option == 1:
        # Error vs training-set size at a fixed gamma.
        sizes = np.arange(1000, 6000, 1000)
        errors = []
        for size in sizes:
            X, y, pos_test, neg_test = make_split(positive, negative, size)
            clf = SVC(kernel='poly', gamma=0.0001)
            clf.fit(X, y)
            errors.append(error_rate(clf, pos_test, neg_test))
        plt.plot(sizes, errors, 'b-')
        plt.plot([1000, 5000], [min(errors), min(errors)], 'r--',
                 label="Asymptotic Error: " + str(min(errors)))
        plt.xlabel('Training Size')
        plt.ylabel('Error')
        plt.title('Error vs training set at gamma = 0.0001')
        plt.legend()
        plt.show()
    elif option == 2:
        # Polynomial SVM vs Gaussian Naive Bayes on the same splits.
        sizes = np.arange(1000, 6000, 1000)
        errors_svm = []
        errors_nb = []
        for size in sizes:
            X, y, pos_test, neg_test = make_split(positive, negative, size)
            svm = SVC(kernel='poly', gamma=0.0001)
            svm.fit(X, y)
            errors_svm.append(error_rate(svm, pos_test, neg_test))
            nb = GaussianNB()
            nb.fit(X, y)
            errors_nb.append(error_rate(nb, pos_test, neg_test))
        plt.plot(sizes, errors_svm, 'b-', label="SVM")
        plt.plot(sizes, errors_nb, 'r-', label="NB")
        plt.xlabel('Training Size')
        plt.ylabel('Error')
        plt.title('Error vs training set size, Poly SVM at gamma = 0.0001'
                  ' and Gaussian NB')
        plt.legend()
        plt.show()


if __name__ == "__main__":
    main()