Guest User

SVM train

a guest
Sep 7th, 2017
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.44 KB | None | 0 0
import csv
import random
from csv import DictReader

# The star import is kept for backward compatibility; the explicit
# ``import pandas`` is what makes ``pandas.read_csv`` / ``pandas.concat``
# resolvable (the star import does not reliably bind the name ``pandas``).
from pandas import *
import numpy
import pandas
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVC

  10. list_of_features = [16, 17, 19, 23, 24, 25, 26, 27, 28, 35, 36, 37, 38, 39, 40,
  11.                     89, 90, 91, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
  12.                     143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
  13.                     156, 157, 158, 159, 160, 161, 162, 163]
  14. target_values_train = []
  15. target_values_validation = []
  16. id= []
  17. predictions = []
  18. list_of_df=[]
  19. list_of_targets=[]
  20. #Gettin the target values from train file and put them in a column numpy array
  21. with open('train.txt') as f:
  22.     reader = DictReader(f, delimiter='\t')
  23.     for row in reader:
  24.         target_values_train.append(int(row['human-generated']))
  25. y_train = numpy.asarray(target_values_train)
  26. y_train = y_train[:, numpy.newaxis]
  27.  
  28. scaler = StandardScaler()
  29. clf = NuSVC(probability= True) #SVM classifier
  30. #Reading the file with reader bench indexes( just the best 50 features)
  31. csv_reader = pandas.read_csv('train-indices.csv', iterator=True, chunksize=500000, delimiter=';', skiprows=1, usecols=list_of_features)
  32. df_train = pandas.concat(csv_reader, ignore_index=True) #Creating the dataframe
  33. df_train.info()
  34.  
  35. #Getting a random 100k lines from dataset which will represent out train matrix for SVM
  36. shuffledRange = list(range(len(df_train)))
  37. random.shuffle(shuffledRange)
  38. df_train = numpy.nan_to_num(df_train)
  39. for i in range(0, 100000):
  40.     list_of_df.append(df_train[shuffledRange[i]])
  41.     list_of_targets.append(y_train[shuffledRange[i]])
  42.  
  43. df_train_batch = numpy.array(list_of_df)
  44. y_train = numpy.array(list_of_targets)
  45.  
  46. print(df_train_batch.shape)
  47. print(y_train.shape)
  48. scaler.fit(df_train)
  49. df_train_scaled = scaler.transform(df_train_batch) #Scaled data for traing
  50. clf.fit(df_train_scaled, y_train)
  51. '''for n in range(num_iter):
  52.    random.shuffle(shuffledRange) # shuffle the indexes list
  53.    #Shuffling the data
  54.    shuffledX = [df_train_new[i] for i in shuffledRange]
  55.    shuffledY = [y_train[i] for i in shuffledRange]
  56.    print(len(shuffledX))
  57.    print(len(shuffledY))
  58.    clf.fit(shuffledX, shuffledY) # fitting the entire shuffled dataset'''
  59.  
  60. with open('validation.txt') as f:
  61.     reader = DictReader(f, delimiter='\t')
  62.     for row in reader:
  63.         target_values_validation.append(int(row['human-generated']))
  64.  
  65. y_validation = numpy.asarray(target_values_validation)
  66. y_validation = y_validation[:, numpy.newaxis]
  67.  
  68. csv_reader_validation = read_csv('dev-indices.csv', iterator=True, chunksize=100000, delimiter=';', usecols=list_of_features)
  69. df_validation = concat(csv_reader_validation, ignore_index=True)
  70. df_validation = numpy.nan_to_num(df_validation)
  71. df_validation_scaled = scaler.transform(df_validation) # Scaled validation data
  72.  
  73. predicted = clf.predict_proba(df_validation_scaled) #Getting probabilities for each response to be human or machine
  74. print(predicted.shape)
  75. print(clf.classes_)
  76. #Getting the probabilities for response to be human (1) and write them in the csv submit file
  77. m, n = predicted.shape
  78. for j in range(0, m):
  79.     predictions.append(predicted[j][1])
  80.  
  81. for k in range(0, len(predictions)):
  82.     id.append(k)
  83.  
  84. with open("submit.csv", "a") as f:
  85.     writer = csv.writer(f)
  86.     for row in zip(id, predictions):
  87.         writer.writerow(row)
  88.  
  89. auc = metrics.roc_auc_score(y_validation, predictions)
  90. print(auc)
Advertisement
Add Comment
Please, Sign In to add comment