# Untitled

import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC

'''
    Load and process the data
        > Separate the positive and negative examples
        > Split each of the two classes in half
        > Stack the halves to form a training set and a test set with
          matching pos/neg proportions
        > Standardize the data using the mean and standard deviation of
          the training set
        > Shuffle the rows of the training and test sets
'''
# Column holding the class label: rows are split below on this column being
# 1 (positive) or 0 (negative).
LABEL_COL = 57

# Single-argument print(...) and `//` behave the same on Python 2 and
# Python 3, so the script runs unchanged on either interpreter.
print('Loading data...')
data = np.loadtxt('spambase.txt', delimiter=',')
print('Processing data...')

# Separate the examples by class.
pos = data[data[:, LABEL_COL] == 1]
neg = data[data[:, LABEL_COL] == 0]

# The first half of each class goes to the training set and the second half
# to the test set. Floor division keeps the slice bounds integral.
pos_mid = len(pos) // 2
neg_mid = len(neg) // 2
pos1, pos2 = pos[:pos_mid], pos[pos_mid:]
neg1, neg2 = neg[:neg_mid], neg[neg_mid:]
raw_train = np.vstack((pos1, neg1))
raw_test = np.vstack((pos2, neg2))

# Fit the scaler on the training rows only, then apply the same transform to
# both sets so the test set is scaled with training-set statistics.
scaler = preprocessing.StandardScaler().fit(raw_train)
train_set = scaler.transform(raw_train)
test_set = scaler.transform(raw_test)

# transform() rescales every column, including the label column. Copy the
# original 0/1 labels back so column LABEL_COL still holds class labels
# rather than standardized floats.
train_set[:, LABEL_COL] = raw_train[:, LABEL_COL]
test_set[:, LABEL_COL] = raw_test[:, LABEL_COL]

# Shuffle rows in place so the classes are no longer grouped together.
# NOTE(review): no random seed is set, so the row order differs between runs.
np.random.shuffle(train_set)
np.random.shuffle(test_set)