import csv
import re  # kept for the commented-out text-cleaning experiments below

import numpy as np
from sklearn import linear_model, svm  # svm is referenced only in commented-out code
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score  # used by the commented-out evaluation code


def read_lines(path):
    """Read every line of a text file into a list (trailing newlines kept)."""
    with open(path, "r", encoding="utf-8") as f:
        return [line for line in f if len(line) != 0]


train_samples = read_lines("train_samples.txt")
train_labels = read_lines("train_labels.txt")
validation_samples = read_lines("validation_samples.txt")
validation_labels = read_lines("validation_labels.txt")
test_samples = read_lines("test_samples.txt")

id_list = []  # test-sample ids, filled in below and reused for the submission file

# Earlier text-cleaning experiments, kept for reference:
# for idx in range(len(train_samples)):
#     train_samples[idx] = train_samples[idx].replace('$NE$', '')
#     # train_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', train_samples[idx])
#     # train_samples[idx] = train_samples[idx].split()
#     # train_samples[idx] = list(
#     #     filter(lambda x: x != '.' and x.isdigit() == False and x not in cuvinte_stop, train_samples[idx]))

# train_labels parsing: each line is "<id> <label>"; drop the id, keep the label
for idx in range(len(train_labels)):
    train_labels[idx] = train_labels[idx].split()
    train_labels[idx].pop(0)

# Matching cleaning experiments for the validation split, kept for reference:
# for idx in range(len(validation_samples)):
#     validation_samples[idx] = validation_samples[idx].replace('$NE$', '')
#     # validation_target_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', validation_target_samples[idx])
#     # validation_target_samples[idx] = validation_target_samples[idx].split()
#     # validation_target_samples[idx] = list(filter(lambda x: x != '.' and x.isdigit() == False and x not in cuvinte_stop, validation_target_samples[idx]))

# validation_labels parsing: same "<id> <label>" format as the training labels
for idx in range(len(validation_labels)):
    validation_labels[idx] = validation_labels[idx].split()
    validation_labels[idx].pop(0)

print(len(test_samples))
# Record each test sample's id (its first whitespace-separated token) for the submission file.
for idx in range(len(test_samples)):
    # test_samples[idx] = test_samples[idx].replace('$NE$', '')
    line = test_samples[idx].split()
    id_list.append(line[0])
    # test_samples[idx] = re.sub('[0123456789!@+%#?,.:„”“";()]', '', test_samples[idx])
    # test_samples[idx] = test_samples[idx].split()
    # test_samples[idx] = list(filter(lambda x: x != '-' and x != '.' and x.isdigit() == False and x not in cuvinte_stop, test_samples[idx]))

# Fold the validation split into the training data for the final model.
train_samples.extend(validation_samples)
train_labels.extend(validation_labels)


# TF-IDF features over word 1- to 3-grams; fit on train, then reuse the same
# vocabulary to transform the test set.
tf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
train_samples_tfidf = tf_vectorizer.fit_transform(train_samples)
print(train_samples_tfidf.shape)
# validation_samples_tfidf = tf_vectorizer.transform(validation_samples)
test_samples_tfidf = tf_vectorizer.transform(test_samples)

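# A minimal held-out evaluation sketch (not from the original paste): it only
# makes sense if the validation split is NOT merged into training above, so it
# is left commented out here.
# validation_samples_tfidf = tf_vectorizer.transform(validation_samples)
# clf_dev = linear_model.SGDClassifier()
# clf_dev.fit(train_samples_tfidf, np.ravel(train_labels))
# print(accuracy_score(np.ravel(validation_labels), clf_dev.predict(validation_samples_tfidf)))
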
# print(accuracy_score(predicted_labels, validation_source_labels))
print("done")  # progress marker: feature extraction finished

# print(f1_score(predicted_labels, validation_target_labels))


C_param = 1  # unused; left over from the commented-out LinearSVC experiment below

# svm_model = svm.LinearSVC(C=C_param)  # linear kernel
# svm_model.fit(train_samples_tfidf, np.ravel(train_labels))  # train

# Hyperparameter grids for the (commented-out) SGDClassifier search below;
# pruned alternatives remain in the trailing comments.
Loss_param = ['hinge']  # , 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'
Penalty_param = ['l2', 'l1', 'elasticnet']
Alpha_param = [0.0001, 0.001, 0.00001, 0.000001, 0.000005]
Fit_intercept_param = [True]  # , False
Max_iter_param = [1000]
Tol_param = [1e-3, 1e-4, 1e-2]
Shuffle_param = [True]  # , False
Verbose_param = [0]
Epsilon_param = [0.1, 0.5, 0.05, 0.005]
Learning_rate_param = ['optimal']
Eta0_param = [0, 1, 5, 20]
Power_t_param = [0.5, 0.1, 0.7, 1]
Average_param = [False]  # True, 5, 10, 100

# Best-so-far trackers for the grid search ("maxim" = best accuracy seen).
maxim = 0
clf_maxim = 0
loss_max = 0
penalty_max = 0
alpha_max = 0
fit_intercept_max = 0
max_iter_max = 0
tol_max = 0
shuffle_max = 0
verbose_max = 0
epsilon_max = 0
learning_rate_max = 0
eta0_max = 0
power_t_max = 0
average_max = 0

nr_iteratii = 0  # number of grid-search fits so far ("nr_iteratii" = iteration count)

# Grid search over every combination of the hyperparameter lists above,
# capped at 10000 fits:
# from itertools import product
# for (loss_param, penalty_param, alpha_param, fit_intercept_param,
#      max_iter_param, tol_param, shuffle_param, verbose_param,
#      epsilon_param, learning_rate_param, eta0_param, power_t_param,
#      average_param) in product(Loss_param, Penalty_param, Alpha_param,
#                                Fit_intercept_param, Max_iter_param, Tol_param,
#                                Shuffle_param, Verbose_param, Epsilon_param,
#                                Learning_rate_param, Eta0_param, Power_t_param,
#                                Average_param):
#     if nr_iteratii == 10000:
#         break
#     clf = linear_model.SGDClassifier(loss=loss_param,
#                                      penalty=penalty_param,
#                                      alpha=alpha_param,
#                                      fit_intercept=fit_intercept_param,
#                                      max_iter=max_iter_param,
#                                      tol=tol_param,
#                                      shuffle=shuffle_param,
#                                      verbose=verbose_param,
#                                      epsilon=epsilon_param,
#                                      learning_rate=learning_rate_param,
#                                      eta0=eta0_param,
#                                      power_t=power_t_param,
#                                      average=average_param)
#     clf.fit(train_samples_tfidf, np.ravel(train_labels))
#     predicted_labels = clf.predict(validation_samples_tfidf)
#     nr_iteratii += 1
#     if nr_iteratii % 1000 == 0:
#         print(nr_iteratii)
#     if accuracy_score(predicted_labels, validation_labels) > maxim:
#         maxim = accuracy_score(predicted_labels, validation_labels)
#         clf_maxim = clf
#         loss_max = loss_param
#         penalty_max = penalty_param
#         alpha_max = alpha_param
#         fit_intercept_max = fit_intercept_param
#         max_iter_max = max_iter_param
#         tol_max = tol_param
#         shuffle_max = shuffle_param
#         verbose_max = verbose_param
#         epsilon_max = epsilon_param
#         learning_rate_max = learning_rate_param
#         eta0_max = eta0_param
#         power_t_max = power_t_param
#         average_max = average_param
#         print(maxim)

  232.  
  233. # print(maxim)
  234. # print(loss_max)
  235. # print(penalty_max)
  236. # print(alpha_max)
  237. # print(fit_intercept_max)
  238. # print(max_iter_max)
  239. # print(tol_max)
  240. # print(shuffle_max)
  241. # print(verbose_max)
  242. # print(epsilon_max)
  243. # print(learning_rate_max)
  244. # print(eta0_max)
  245. # print(power_t_max)
  246. # print(average_max)
  247.  
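# scikit-learn's GridSearchCV can replace the hand-rolled search above; this is
# a minimal sketch (not part of the original paste) over a reduced grid, left
# commented out because it refits many models.
# from sklearn.model_selection import GridSearchCV
# param_grid = {'loss': ['hinge'], 'penalty': ['l2', 'l1', 'elasticnet'],
#               'alpha': [1e-4, 1e-5, 1e-6]}
# search = GridSearchCV(linear_model.SGDClassifier(), param_grid,
#                       scoring='accuracy', cv=3, n_jobs=-1)
# search.fit(train_samples_tfidf, np.ravel(train_labels))
# print(search.best_params_, search.best_score_)
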
# Candidate configurations kept from earlier runs:
# clf = linear_model.SGDClassifier(loss='hinge', penalty='elasticnet', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.01, shuffle=True, verbose=0, epsilon=0.005, learning_rate='optimal', eta0=0, power_t=0.1, average=False)
# clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.0001, shuffle=True, verbose=0, epsilon=0.005, learning_rate='optimal', eta0=0, power_t=0.5, average=False)
# clf = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-05, fit_intercept=True, max_iter=1000, tol=0.0001, shuffle=True, verbose=0, epsilon=0.5, learning_rate='optimal', eta0=0, power_t=0.5, average=False)

# Final model: SGDClassifier with default hyperparameters (hinge loss, i.e. a linear SVM).
clf = linear_model.SGDClassifier()
clf.fit(train_samples_tfidf, np.ravel(train_labels))

predicted_labels = clf.predict(test_samples_tfidf)
# print(accuracy_score(predicted_labels, validation_labels))

# Write the submission file: one "id,label" row per test sample.
with open('sample_submission.csv', 'w', newline='') as file:
    writer = csv.writer(file, delimiter=',')
    writer.writerow(["id", "label"])
    for idx in range(len(predicted_labels)):
        writer.writerow([id_list[idx], predicted_labels[idx]])
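
# Quick sanity check (an added sketch, not in the original paste): the file
# just written should have one row per prediction plus the header.
with open('sample_submission.csv', newline='') as file:
    n_rows = sum(1 for _ in csv.reader(file))
print(n_rows, "rows written, expected", len(predicted_labels) + 1)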