import csv
import re  # used only by the commented-out text cleanup below

import numpy as np
from sklearn import linear_model, svm  # svm kept for the commented-out LinearSVC experiment
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score


def read_lines(path):
    # Read all lines of a UTF-8 text file; 'with' closes it automatically.
    with open(path, "r", encoding='utf-8') as f:
        return f.readlines()


train_samples = read_lines("train_samples.txt")
train_labels = read_lines("train_labels.txt")
validation_samples = read_lines("validation_samples.txt")
validation_labels = read_lines("validation_labels.txt")
test_samples = read_lines("test_samples.txt")

id_list = []

# (cuvinte_stop, a stop-word list, is never defined in this script)
# for idx in range(len(train_samples)):
#     train_samples[idx] = train_samples[idx].replace('$NE$', '')
#     # train_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', train_samples[idx])
#     # train_samples[idx] = train_samples[idx].split()
#     # train_samples[idx] = list(
#     #     filter(lambda x: x != '.' and not x.isdigit() and x not in cuvinte_stop, train_samples[idx]))

# train_labels parsing: each line looks like "<id> <label>"; drop the id.
for idx in range(len(train_labels)):
    train_labels[idx] = train_labels[idx].split()
    train_labels[idx].pop(0)

# for idx in range(len(validation_samples)):
#     validation_samples[idx] = validation_samples[idx].replace('$NE$', '')
#     # validation_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', validation_samples[idx])
#     # validation_samples[idx] = validation_samples[idx].split()
#     # validation_samples[idx] = list(filter(lambda x: x != '.' and not x.isdigit() and x not in cuvinte_stop, validation_samples[idx]))


# validation_labels parsing: same "<id> <label>" format as train_labels.
for idx in range(len(validation_labels)):
    validation_labels[idx] = validation_labels[idx].split()
    validation_labels[idx].pop(0)

# test_samples parsing: remember each line's id for the submission file.
print(len(test_samples))
for idx in range(len(test_samples)):
    # test_samples[idx] = test_samples[idx].replace('$NE$', '')
    line = test_samples[idx].split()
    id_list.append(line[0])
    # test_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', test_samples[idx])
    # test_samples[idx] = test_samples[idx].split()
    # test_samples[idx] = list(filter(lambda x: x != '-' and x != '.' and not x.isdigit() and x not in cuvinte_stop, test_samples[idx]))

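# Sketch (not in the original script): before the validation split is merged
# into train below, one could estimate accuracy on it; the names val_vec,
# X_tr, X_val and val_clf are introduced here for illustration only.
#
# val_vec = TfidfVectorizer(ngram_range=(1, 3))
# X_tr = val_vec.fit_transform(train_samples)
# X_val = val_vec.transform(validation_samples)
# val_clf = linear_model.SGDClassifier()
# val_clf.fit(X_tr, np.ravel(train_labels))
# print(accuracy_score(np.ravel(validation_labels), val_clf.predict(X_val)))
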
# Merge the validation split into the training data for the final model.
train_samples.extend(validation_samples)
train_labels.extend(validation_labels)

# TF-IDF features over word 1- to 3-grams; fit on train (+ validation), then
# applied to the test set with the same vocabulary.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
train_samples_tfidf = tf_vectorizer.fit_transform(train_samples)
print(train_samples_tfidf.shape)
# validation_samples_tfidf = tf_vectorizer.transform(validation_samples)
test_samples_tfidf = tf_vectorizer.transform(test_samples)

# print(accuracy_score(predicted_labels, validation_labels))
print("done")  # data preparation finished

# print(f1_score(predicted_labels, validation_labels))

C_param = 1

# svm_model = svm.LinearSVC(C=C_param)  # linear kernel
# svm_model.fit(train_samples_tfidf, np.ravel(train_labels))  # train

# Hyperparameter grids for SGDClassifier; values commented out were left out
# of the final search.
Loss_param = ['hinge']  # , 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'
Penalty_param = ['l2', 'l1', 'elasticnet']
Alpha_param = [0.0001, 0.001, 0.00001, 0.000001, 0.000005]
Fit_intercept_param = [True]  # , False
Max_iter_param = [1000]
Tol_param = [1e-3, 1e-4, 1e-2]
Shuffle_param = [True]  # , False
Verbose_param = [0]
Epsilon_param = [0.1, 0.5, 0.05, 0.005]
Learning_rate_param = ['optimal']
Eta0_param = [0, 1, 5, 20]
Power_t_param = [0.5, 0.1, 0.7, 1]
Average_param = [False]  # , True, 5, 10, 100
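
# Sketch (added illustration): the full search space is the product of the
# grid sizes; with the lists above that is 2880 combinations, well under the
# 10000-iteration cap used below.
import math
print(math.prod(map(len, [Loss_param, Penalty_param, Alpha_param,
                          Fit_intercept_param, Max_iter_param, Tol_param,
                          Shuffle_param, Verbose_param, Epsilon_param,
                          Learning_rate_param, Eta0_param, Power_t_param,
                          Average_param])))  # 1*3*5*1*1*3*1*1*4*1*4*4*1 = 2880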

# Best validation score and the hyperparameters that produced it.
maxim = 0
clf_maxim = 0
loss_max = 0
penalty_max = 0
alpha_max = 0
fit_intercept_max = 0
max_iter_max = 0
tol_max = 0
shuffle_max = 0
verbose_max = 0
epsilon_max = 0
learning_rate_max = 0
eta0_max = 0
power_t_max = 0
average_max = 0

nr_iteratii = 0  # iteration counter, capped at 10000 in the search below

# for loss_param in Loss_param:
#     if nr_iteratii == 10000:
#         break
#     for penalty_param in Penalty_param:
#         if nr_iteratii == 10000:
#             break
#         for alpha_param in Alpha_param:
#             if nr_iteratii == 10000:
#                 break
#             for fit_intercept_param in Fit_intercept_param:
#                 if nr_iteratii == 10000:
#                     break
#                 for max_iter_param in Max_iter_param:
#                     if nr_iteratii == 10000:
#                         break
#                     for tol_param in Tol_param:
#                         if nr_iteratii == 10000:
#                             break
#                         for shuffle_param in Shuffle_param:
#                             if nr_iteratii == 10000:
#                                 break
#                             for verbose_param in Verbose_param:
#                                 if nr_iteratii == 10000:
#                                     break
#                                 for epsilon_param in Epsilon_param:
#                                     if nr_iteratii == 10000:
#                                         break
#                                     for learning_rate_param in Learning_rate_param:
#                                         if nr_iteratii == 10000:
#                                             break
#                                         for eta0_param in Eta0_param:
#                                             if nr_iteratii == 10000:
#                                                 break
#                                             for power_t_param in Power_t_param:
#                                                 if nr_iteratii == 10000:
#                                                     break
#                                                 for average_param in Average_param:
#                                                     if nr_iteratii == 10000:
#                                                         break
#                                                     clf = linear_model.SGDClassifier(
#                                                         loss=loss_param,
#                                                         penalty=penalty_param,
#                                                         alpha=alpha_param,
#                                                         fit_intercept=fit_intercept_param,
#                                                         max_iter=max_iter_param,
#                                                         tol=tol_param,
#                                                         shuffle=shuffle_param,
#                                                         verbose=verbose_param,
#                                                         epsilon=epsilon_param,
#                                                         learning_rate=learning_rate_param,
#                                                         eta0=eta0_param,
#                                                         power_t=power_t_param,
#                                                         average=average_param)
#                                                     clf.fit(train_samples_tfidf, np.ravel(train_labels))
#                                                     predicted_labels = clf.predict(validation_samples_tfidf)
#                                                     nr_iteratii += 1
#                                                     if nr_iteratii % 1000 == 0:
#                                                         print(nr_iteratii)
#                                                     if accuracy_score(predicted_labels, validation_labels) > maxim:
#                                                         maxim = accuracy_score(predicted_labels, validation_labels)
#                                                         clf_maxim = clf
#                                                         loss_max = loss_param
#                                                         penalty_max = penalty_param
#                                                         alpha_max = alpha_param
#                                                         fit_intercept_max = fit_intercept_param
#                                                         max_iter_max = max_iter_param
#                                                         tol_max = tol_param
#                                                         shuffle_max = shuffle_param
#                                                         verbose_max = verbose_param
#                                                         epsilon_max = epsilon_param
#                                                         learning_rate_max = learning_rate_param
#                                                         eta0_max = eta0_param
#                                                         power_t_max = power_t_param
#                                                         average_max = average_param
#                                                         print(maxim)

# print(maxim)
# print(loss_max)
# print(penalty_max)
# print(alpha_max)
# print(fit_intercept_max)
# print(max_iter_max)
# print(tol_max)
# print(shuffle_max)
# print(verbose_max)
# print(epsilon_max)
# print(learning_rate_max)
# print(eta0_max)
# print(power_t_max)
# print(average_max)
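
# Sketch (an alternative, not the original approach): itertools.product can
# replace the thirteen nested loops above; param_grid, keys, values, params,
# score and best_params are names introduced here for illustration. It assumes
# validation_samples_tfidf from the commented-out transform earlier.
#
# import itertools
# param_grid = {
#     'loss': Loss_param, 'penalty': Penalty_param, 'alpha': Alpha_param,
#     'fit_intercept': Fit_intercept_param, 'max_iter': Max_iter_param,
#     'tol': Tol_param, 'shuffle': Shuffle_param, 'verbose': Verbose_param,
#     'epsilon': Epsilon_param, 'learning_rate': Learning_rate_param,
#     'eta0': Eta0_param, 'power_t': Power_t_param, 'average': Average_param,
# }
# keys = list(param_grid)
# for values in itertools.product(*param_grid.values()):
#     params = dict(zip(keys, values))
#     clf = linear_model.SGDClassifier(**params)
#     clf.fit(train_samples_tfidf, np.ravel(train_labels))
#     score = accuracy_score(np.ravel(validation_labels),
#                            clf.predict(validation_samples_tfidf))
#     if score > maxim:
#         maxim, clf_maxim, best_params = score, clf, params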

# clf = linear_model.SGDClassifier(loss='hinge', penalty='elasticnet', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.01, shuffle=True, verbose=0, epsilon=0.005, learning_rate='optimal', eta0=0, power_t=0.1, average=False)
# clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.0001, shuffle=True, verbose=0, epsilon=0.005, learning_rate='optimal', eta0=0, power_t=0.5, average=False)
# clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.0001, shuffle=True, verbose=0, epsilon=0.5, learning_rate='optimal', eta0=0, power_t=0.5, average=False)

# Final model: a default SGDClassifier (hinge loss, i.e. a linear SVM),
# trained on the combined train + validation data.
clf = linear_model.SGDClassifier()
clf.fit(train_samples_tfidf, np.ravel(train_labels))

# Predict the test set and write the submission file, one "id,label" per row.
predicted_labels = clf.predict(test_samples_tfidf)
# print(accuracy_score(predicted_labels, validation_labels))
with open('sample_submission.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["id", "label"])
    for idx in range(len(predicted_labels)):
        writer.writerow([id_list[idx], predicted_labels[idx]])
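
# Sketch (extra check, not in the original script): confirm the file has one
# row per prediction plus the header; n_rows is a name introduced here.
with open('sample_submission.csv', newline='') as file:
    n_rows = sum(1 for _ in csv.reader(file))
print(n_rows, len(predicted_labels) + 1)  # the two numbers should match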