import pandas as pd
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, RandomForestClassifier)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from CustomTokenizer import CustomTokenizer
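# CustomTokenizer is a separate local module that is not included in this paste, so
# the script will not run without it. Assuming only that it is a callable mapping a
# document string to a list of token strings (which is all that TfidfVectorizer's
# tokenizer= parameter requires), a minimal stand-in could look like:
#
#     class CustomTokenizer:
#         def __call__(self, doc):
#             return doc.split()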

jobz = 1
file = open('dupa_scared_asf', 'w')


def getTrees():
    """Return a fresh list of the five tree-based classifiers being compared."""
    return [
        DecisionTreeClassifier(max_depth=1000, max_features='sqrt'),
        RandomForestClassifier(max_features=100, min_samples_leaf=1, min_samples_split=2,
                               max_depth=90, n_estimators=100, n_jobs=jobz),
        ExtraTreesClassifier(max_features=200, min_samples_leaf=2, min_samples_split=5,
                             max_depth=75, n_estimators=100, n_jobs=jobz),
        AdaBoostClassifier(n_estimators=125, learning_rate=0.01,
                           base_estimator=DecisionTreeClassifier(max_depth=200)),
        GradientBoostingClassifier(learning_rate=0.1, loss='deviance', max_depth=200,
                                   max_features='log2', max_leaf_nodes=300,
                                   min_samples_leaf=3, min_samples_split=2,
                                   n_estimators=350, subsample=0.7,
                                   warm_start=False, tol=1e-3),
    ]


def getVec1():
    """TF-IDF vectorizer over unigrams only."""
    return TfidfVectorizer(lowercase=True, tokenizer=CustomTokenizer(), analyzer='word',
                           stop_words=None, ngram_range=(1, 1),
                           use_idf=True, smooth_idf=True, sublinear_tf=False)


def getVec2():
    """TF-IDF vectorizer over unigrams and bigrams."""
    return TfidfVectorizer(lowercase=True, tokenizer=CustomTokenizer(), analyzer='word',
                           stop_words=None, ngram_range=(1, 2),
                           use_idf=True, smooth_idf=True, sublinear_tf=False)


def outputData(acc, f1):
    """Append the averaged accuracy and F1 scores to the results file."""
    file.write('ACCURACY:\n')
    file.write(str(acc))
    file.write('\nF1:\n')
    file.write(str(f1))
    file.write('\n')


# Shuffle the dataset and keep the first 100 rows for a quick run.
science = pd.read_csv("./../computed_science_data.csv",
                      encoding='ISO-8859-1').sample(frac=1).reset_index(drop=True).head(100)
data_x = science['body']
data_y = science['removed']

# Experiment 1: unigram TF-IDF features; per-classifier scores are summed over
# 3 random splits, then averaged below.
accuracy_array = [0.0, 0.0, 0.0, 0.0, 0.0]
f1_array = [0.0, 0.0, 0.0, 0.0, 0.0]
for i in range(3):
    tree_array = getTrees()
    vec = getVec1()
    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.33)
    train_trans = vec.fit_transform(x_train)
    test_trans = vec.transform(x_test)
    for j in range(len(tree_array)):
        tree = tree_array[j]
        tree.fit(train_trans, y_train)
        predicted_values = tree.predict(test_trans)
        accuracy_array[j] += accuracy_score(y_test, predicted_values)
        f1_array[j] += f1_score(y_test, predicted_values)

for i in range(5):
    accuracy_array[i] /= 3.0
    f1_array[i] /= 3.0
outputData(accuracy_array, f1_array)

# Experiment 2: the same classifiers on unigram + bigram TF-IDF features.
accuracy_array2 = [0.0, 0.0, 0.0, 0.0, 0.0]
f1_array2 = [0.0, 0.0, 0.0, 0.0, 0.0]
for i in range(3):
    tree_array = getTrees()
    vec = getVec2()
    x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.33)
    train_trans = vec.fit_transform(x_train)
    test_trans = vec.transform(x_test)
    for j in range(len(tree_array)):
        tree = tree_array[j]
        tree.fit(train_trans, y_train)
        predicted_values = tree.predict(test_trans)
        accuracy_array2[j] += accuracy_score(y_test, predicted_values)
        f1_array2[j] += f1_score(y_test, predicted_values)

for i in range(5):
    accuracy_array2[i] /= 3.0
    f1_array2[i] /= 3.0
outputData(accuracy_array2, f1_array2)

file.close()