Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- from sklearn.ensemble import RandomForestClassifier, \
- GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics import r2_score, accuracy_score, f1_score
- from sklearn.model_selection import train_test_split
- from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
- from CustomTokenizer import CustomTokenizer
# Degree of parallelism handed to the ensemble classifiers (n_jobs).
jobz = 1
# Global results file; outputData() writes here and the script closes it at
# the very end. NOTE(review): opened without a context manager, so it leaks
# if the script dies mid-run — confirm whether that matters for this tool.
file = open('dupa_scared_asf', 'w')
def getTrees():
    """Build a fresh batch of the five tree-based classifiers under test.

    A new list of unfitted estimators is returned on every call so each
    experiment run starts from scratch.
    """
    plain_tree = DecisionTreeClassifier(max_depth=1000, max_features='sqrt')
    forest = RandomForestClassifier(
        max_features=100, min_samples_leaf=1, min_samples_split=2,
        max_depth=90, n_estimators=100, n_jobs=jobz)
    extra_forest = ExtraTreesClassifier(
        max_features=200, min_samples_leaf=2, min_samples_split=5,
        max_depth=75, n_estimators=100, n_jobs=jobz)
    boosted = AdaBoostClassifier(
        n_estimators=125, learning_rate=0.01,
        base_estimator=DecisionTreeClassifier(max_depth=200))
    gradient = GradientBoostingClassifier(
        learning_rate=0.1, loss='deviance', max_depth=200,
        max_features='log2', max_leaf_nodes=300, min_samples_leaf=3,
        min_samples_split=2, n_estimators=350, subsample=0.7,
        warm_start=False, tol=1e-3)
    return [plain_tree, forest, extra_forest, boosted, gradient]
def getVec1():
    """Return a fresh unigram TF-IDF vectorizer using the project tokenizer."""
    common = dict(lowercase=True, tokenizer=CustomTokenizer(), analyzer='word',
                  stop_words=None, use_idf=True, smooth_idf=True,
                  sublinear_tf=False)
    return TfidfVectorizer(ngram_range=(1, 1), **common)
def getVec2():
    """Return a fresh uni+bigram TF-IDF vectorizer using the project tokenizer."""
    common = dict(lowercase=True, tokenizer=CustomTokenizer(), analyzer='word',
                  stop_words=None, use_idf=True, smooth_idf=True,
                  sublinear_tf=False)
    return TfidfVectorizer(ngram_range=(1, 2), **common)
def outputData(acc, f1, out=None):
    """Append one accuracy/F1 result section to the results stream.

    Parameters
    ----------
    acc : list of float
        Per-classifier averaged accuracy scores.
    f1 : list of float
        Per-classifier averaged F1 scores.
    out : writable text stream, optional
        Destination for the report. Defaults to the module-level ``file``
        handle, which keeps the original call sites working unchanged.
    """
    if out is None:
        out = file  # module-level results file opened at script start
    # One write, byte-identical to the original five sequential writes.
    out.write(f'ACCURACY:\n{acc}\nF1:\n{f1}\n')
# ---------------------------------------------------------------------------
# Experiment driver: load the data once, then average each getTrees()
# classifier's accuracy/F1 over several random splits, under two TF-IDF
# configurations (unigrams, then uni+bigrams).  Previously this was two
# copy-pasted loops with score arrays hard-coded to length 5 and averages
# divided by a hard-coded 3.0; both are now derived from the actual model
# count and run count, with behavior unchanged.
# ---------------------------------------------------------------------------
science = (pd.read_csv("./../computed_science_data.csv", encoding='ISO-8859-1')
           .sample(frac=1)           # shuffle all rows
           .reset_index(drop=True)
           .head(100))               # NOTE(review): only 100 rows are used — confirm intentional
data_x = science['body']
data_y = science['removed']


def _run_experiment(make_vectorizer, runs=3):
    """Average accuracy and F1 of every getTrees() classifier over `runs` splits.

    Parameters
    ----------
    make_vectorizer : callable
        Zero-argument factory returning a fresh TfidfVectorizer.
    runs : int
        Number of independent train/test splits to average over.

    Returns
    -------
    (list of float, list of float)
        Mean accuracy and mean F1, one entry per classifier.
    """
    n_models = len(getTrees())
    acc_totals = [0.0] * n_models
    f1_totals = [0.0] * n_models
    for _ in range(runs):
        models = getTrees()          # fresh, unfitted estimators each run
        vec = make_vectorizer()
        x_train, x_test, y_train, y_test = train_test_split(
            data_x, data_y, test_size=0.33)
        train_trans = vec.fit_transform(x_train)
        test_trans = vec.transform(x_test)
        for idx, model in enumerate(models):
            model.fit(train_trans, y_train)
            predicted = model.predict(test_trans)
            acc_totals[idx] += accuracy_score(y_test, predicted)
            f1_totals[idx] += f1_score(y_test, predicted)
    return ([total / runs for total in acc_totals],
            [total / runs for total in f1_totals])


# Unigram features, then unigram+bigram features; results go to `file`.
accuracy_array, f1_array = _run_experiment(getVec1)
outputData(accuracy_array, f1_array)
accuracy_array2, f1_array2 = _run_experiment(getVec2)
outputData(accuracy_array2, f1_array2)
file.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement