# PrepData.py

#!/usr/bin/python
#import cgitb
#This script will extract the data from the .arff files and put the features in dataX and the score or the value to be predicted in dataY.
import numpy as np
import scipy
#import arff
import os
#from matplotlib import pyplot as plt
from sklearn.svm import NuSVR
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from time import time,clock
from time import gmtime, strftime
import shlex

# Path of the data file to load. To switch datasets, comment out the active
# assignment below and uncomment the alternative.
#fname = './Vowels_To_UnvoicedFricatives.arff'
fname = './Vowels_To_Nasals.arff'

# Number of leading lines discarded before the data rows are parsed.
# NOTE(review): presumably the ARFF header (@relation/@attribute/@data);
# confirm against the data files if their layout changes.
HEADER_LINES = 42

# Column holding the value to be predicted (stored in dataY).
TARGET_COLUMN = 38

# Columns removed from the feature matrix (the target plus its neighbours).
# NOTE(review): the meaning of columns 36, 37 and 39 is not visible here;
# verify against the attribute list of the ARFF file.
NON_FEATURE_COLUMNS = [36, 37, 38, 39]

# 'with' guarantees the file handle is released even if reading fails.
with open(fname, 'r') as f:
    lines = f.readlines()[HEADER_LINES:]

# Split every data row into its tokens. Blank lines (e.g. a trailing newline
# at the end of the file) are skipped: they would otherwise yield an empty
# row and make the resulting array ragged.
floats = [shlex.split(line) for line in lines if line.strip()]

# Convert the string tokens to floating point. The builtin 'float' is used
# because the 'np.float' alias was removed from recent NumPy releases; both
# denote the same 64-bit float dtype.
array = np.asarray(floats).astype(float)

# Remove NaNs from the data by replacing them with 0. Masking after the
# conversion is vectorised and catches every token that parses to NaN
# ('NaN', 'nan', ...), not just one exact spelling.
array[np.isnan(array)] = 0

#scale = StandardScaler()
#array = scale.fit_transform(array) # A scaling of all the data takes place here

# dataY: the value to be predicted; dataX: everything except the
# non-feature columns.
dataY = array[:, TARGET_COLUMN]
dataX = np.delete(array, NON_FEATURE_COLUMNS, 1)

# Single-argument parenthesised print behaves identically on Python 2 and 3.
print('dataX size')
print(np.shape(dataX))

print('\ndataY size')
print(np.shape(dataY))