Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""
Load and process the data from 'spambase.txt'.

Steps:
  > Separate positive and negative examples
  > Cut the two new sets in half
  > Concatenate to make two sets of equal pos/neg distributions
  > Standardize the data based off mean and std deviation of the train set
  > Shuffle the training and test sets

Results are left in the module-level arrays ``train_set`` and ``test_set``.
"""
import numpy as np
from sklearn import preprocessing
from sklearn.svm import SVC

# Column holding the class label: 1 marks a positive example, 0 a negative.
LABEL_COLUMN = 57

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Loading data...')
data = np.loadtxt('spambase.txt', delimiter=',')

print('Processing data...')
pos = data[data[:, LABEL_COLUMN] == 1]
neg = data[data[:, LABEL_COLUMN] == 0]

# Floor division keeps the midpoints integral; plain `/` yields a float on
# Python 3 and a float is not a valid slice index.
pos_mid = len(pos) // 2
neg_mid = len(neg) // 2

pos1 = pos[:pos_mid]
pos2 = pos[pos_mid:]
neg1 = neg[:neg_mid]
neg2 = neg[neg_mid:]

# Each set gets half of the positives and half of the negatives, so both
# keep the class ratio of the full data set (up to one row of rounding).
train_set = np.vstack((pos1, neg1))
test_set = np.vstack((pos2, neg2))

# The scaler is fitted on the training rows only and then applied to both
# sets, so test-set statistics do not influence the transform.
# NOTE(review): the fit covers every column, including LABEL_COLUMN, so the
# labels are standardized as well and are no longer 0/1 afterwards. Confirm
# that the code consuming train_set/test_set expects this; otherwise fit and
# transform only the feature columns.
scaler = preprocessing.StandardScaler().fit(train_set)
train_set = scaler.transform(train_set)
test_set = scaler.transform(test_set)

# In-place shuffle of the rows; each row (features + label) stays intact.
# No seed is set in this section, so the order differs between runs.
np.random.shuffle(train_set)
np.random.shuffle(test_set)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement