Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- from pandas import *
- from csv import DictReader
- import numpy
- from sklearn.preprocessing import StandardScaler
- from sklearn.svm import NuSVC
- from sklearn import metrics
- import random
- import csv
- list_of_features = [16, 17, 19, 23, 24, 25, 26, 27, 28, 35, 36, 37, 38, 39, 40,
- 89, 90, 91, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
- 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
- 156, 157, 158, 159, 160, 161, 162, 163]
- target_values_train = []
- target_values_validation = []
- id= []
- predictions = []
- list_of_df=[]
- list_of_targets=[]
- #Gettin the target values from train file and put them in a column numpy array
- with open('train.txt') as f:
- reader = DictReader(f, delimiter='\t')
- for row in reader:
- target_values_train.append(int(row['human-generated']))
- y_train = numpy.asarray(target_values_train)
- y_train = y_train[:, numpy.newaxis]
- scaler = StandardScaler()
- clf = NuSVC(probability= True) #SVM classifier
- #Reading the file with reader bench indexes( just the best 50 features)
- csv_reader = pandas.read_csv('train-indices.csv', iterator=True, chunksize=500000, delimiter=';', skiprows=1, usecols=list_of_features)
- df_train = pandas.concat(csv_reader, ignore_index=True) #Creating the dataframe
- df_train.info()
- #Getting a random 100k lines from dataset which will represent out train matrix for SVM
- shuffledRange = list(range(len(df_train)))
- random.shuffle(shuffledRange)
- df_train = numpy.nan_to_num(df_train)
- for i in range(0, 100000):
- list_of_df.append(df_train[shuffledRange[i]])
- list_of_targets.append(y_train[shuffledRange[i]])
- df_train_batch = numpy.array(list_of_df)
- y_train = numpy.array(list_of_targets)
- print(df_train_batch.shape)
- print(y_train.shape)
- scaler.fit(df_train)
- df_train_scaled = scaler.transform(df_train_batch) #Scaled data for traing
- clf.fit(df_train_scaled, y_train)
- '''for n in range(num_iter):
- random.shuffle(shuffledRange) # shuffle the indexes list
- #Shuffling the data
- shuffledX = [df_train_new[i] for i in shuffledRange]
- shuffledY = [y_train[i] for i in shuffledRange]
- print(len(shuffledX))
- print(len(shuffledY))
- clf.fit(shuffledX, shuffledY) # fitting the entire shuffled dataset'''
- with open('validation.txt') as f:
- reader = DictReader(f, delimiter='\t')
- for row in reader:
- target_values_validation.append(int(row['human-generated']))
- y_validation = numpy.asarray(target_values_validation)
- y_validation = y_validation[:, numpy.newaxis]
- csv_reader_validation = read_csv('dev-indices.csv', iterator=True, chunksize=100000, delimiter=';', usecols=list_of_features)
- df_validation = concat(csv_reader_validation, ignore_index=True)
- df_validation = numpy.nan_to_num(df_validation)
- df_validation_scaled = scaler.transform(df_validation) # Scaled validation data
- predicted = clf.predict_proba(df_validation_scaled) #Getting probabilities for each response to be human or machine
- print(predicted.shape)
- print(clf.classes_)
- #Getting the probabilities for response to be human (1) and write them in the csv submit file
- m, n = predicted.shape
- for j in range(0, m):
- predictions.append(predicted[j][1])
- for k in range(0, len(predictions)):
- id.append(k)
- with open("submit.csv", "a") as f:
- writer = csv.writer(f)
- for row in zip(id, predictions):
- writer.writerow(row)
- auc = metrics.roc_auc_score(y_validation, predictions)
- print(auc)
Advertisement
Add Comment
Please sign in to add a comment.