import re
import fastText
import morfeusz2
import pandas as pd
import numpy as np
from tqdm import tqdm
import unicodedata
from skmultilearn.adapt import MLkNN  # used only by the alternative TF-IDF pipeline sketched below
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csc_matrix
import os.path
import operator
import time
import io


def dummy_fun(doc):
    # Identity tokenizer/preprocessor: lets TfidfVectorizer consume
    # pre-tokenized documents unchanged.
    return doc


def main():
    # End-to-end run: build the fastText train/test files, train the model,
    # predict labels for the test ads, and dump a submission CSV.
    prepare_train_data()
    prepare_test_data()
    train_model()
    test = get_test_data()
    test_data = get_test_data_ft()
    predicted = predict_test(test_data)
    # for pred in predicted:
    #     print(pred)

    with io.open('eggs.csv', 'w', encoding="utf-8") as file:
        for ad_id, labels in zip(test["id"], predicted):
            labels = labels_to_string(labels)
            file.write(str(ad_id) + "," + labels + "\n")
            print(str(ad_id) + "," + labels)
    # for labels in enumerate(results):
    #     print(labels_to_string(labels))
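    # Note (assumed submission format): eggs.csv ends up with one
    # "id,label1 label2 ..." row per test ad, without a header line.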


def get_stopwords():
    # The stopword list is Polish, so read it explicitly as UTF-8.
    with open('data/stopwords.txt', encoding='utf-8') as f:
        stopwords = f.readlines()
    stopwords = [x.strip() for x in stopwords]
    return stopwords


def get_train_data():
    train_data = pd.read_csv('data/dataninja2019_ads_train.csv')
    return train_data


def get_test_data():
    test_data = pd.read_csv('data/dataninja2019_ads_test.csv')
    return test_data


def get_test_data_ft(file_name='data/FastText_test.txt'):
    with open(file_name, 'r', encoding='utf8') as f:
        return f.readlines()


def train_model():
    # Supervised fastText with word bigrams and hierarchical softmax loss.
    model = fastText.train_supervised('data/FastText_train.txt', epoch=50, lr=0.02,
                                      wordNgrams=2, minCount=2,
                                      label='__label__', loss='hs')
    model.save_model('model.bin')
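

# A minimal evaluation sketch, assuming a held-out file in the same
# __label__ format exists (the 'data/FastText_valid.txt' path is
# hypothetical; this script never creates it). fastText's test() reports
# precision/recall at k. Not called from main().
def evaluate_model(valid_path='data/FastText_valid.txt', k=5):
    model = fastText.load_model('model.bin')
    n, precision, recall = model.test(valid_path, k)
    print('samples:', n, 'P@%d:' % k, precision, 'R@%d:' % k, recall)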


def prepare_train_data():
    # Convert the training CSV into fastText's supervised format:
    # "__label__tag ,  <tokenized title> <tokenized description>".
    stopwords = get_stopwords()
    morfeusz = morfeusz2.Morfeusz()
    X = get_train_data()[['title', 'description', 'labels']]
    with open("data/FastText_train.txt", "w", encoding='utf-8') as file:
        for index, row in tqdm(X.iterrows()):
            text = []
            # pd.notna() is the reliable missing-value check here: pandas
            # stores empty CSV fields as NaN, which is not None.
            if pd.notna(row['labels']):
                for label in str(row['labels']).split():
                    text.append(''.join(['__label__', str(label), ' , ']))
            if pd.notna(row['title']):
                text.append(tokenize(row['title'], morfeusz, stopwords))
            if pd.notna(row['description']):
                text.append(tokenize(row['description'], morfeusz, stopwords))
            text.append('\n')
            file.write(" ".join(text))
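

# Example line in data/FastText_train.txt (made-up ids and text, for
# illustration only):
#     __label__42 ,  __label__7 ,  sprzedac rower gorski stan idealny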


def prepare_test_data():
    # Same tokenization as prepare_train_data(), but without labels.
    stopwords = get_stopwords()
    morfeusz = morfeusz2.Morfeusz()
    X = get_test_data()[['title', 'description']]
    with open("data/FastText_test.txt", "w", encoding='utf8') as file:
        for index, row in tqdm(X.iterrows()):
            text = []
            if pd.notna(row['title']):
                text.append(tokenize(row['title'], morfeusz, stopwords))
            if pd.notna(row['description']):
                text.append(tokenize(row['description'], morfeusz, stopwords))
            text.append('\n')
            file.write(" ".join(text))


def predict_test(test_texts):
    model = fastText.load_model('model.bin')
    results = []
    print('test text len ', len(test_texts))
    for text in tqdm(test_texts):
        # predict() returns a (labels, probabilities) pair; keep labels only.
        tmp = model.predict(str(text).replace('\n', ''), k=5)[0]
        results.append(tmp)
    return results
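

# Variant sketch: keep the probabilities too and drop low-confidence tags.
# The 0.1 threshold is an assumption, not a value tuned in this script.
def predict_test_with_threshold(test_texts, threshold=0.1):
    model = fastText.load_model('model.bin')
    results = []
    for text in tqdm(test_texts):
        labels, probs = model.predict(str(text).replace('\n', ''), k=5)
        results.append([l for l, p in zip(labels, probs) if p >= threshold])
    return results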


def labels_to_string(labels):
    # Strip the '__label__' prefix and join the predicted tags with spaces.
    new_labels = []
    for label in labels:
        new_labels.append(str(label).replace('__label__', ''))
    return " ".join(new_labels)


def labelise(dataset):
    labelByAd = {}
    label_dict = {}
    listOfLabels = []
    iterator = 0
    # For each value with the tag 'labels' an entry is added
    # to the dictionary with the number of occurrences of the given tag.
    # If the entry contains more than one label then the string is split.
    for label in tqdm(dataset['labels'].values):
        label = str(label)  # Coerce everything (including NaN) to a string.
        # Split the string into individual labels.
        single_labels = label.split()
        listOfLabels.append(single_labels)
        labelByAd[iterator] = single_labels
        iterator = iterator + 1
        # Count occurrences; split() already covers both the single-label
        # and the multi-label case, so no separate branches are needed.
        for single in single_labels:
            if single in label_dict:
                label_dict[single] += 1
            else:
                label_dict[single] = 1

    return label_dict, labelByAd, listOfLabels
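

# Usage sketch (nothing in main() calls labelise): inspect the most
# frequent tags in the training data.
#     label_dict, label_by_ad, list_of_labels = labelise(get_train_data())
#     for tag, count in sorted(label_dict.items(),
#                              key=operator.itemgetter(1), reverse=True)[:10]:
#         print(tag, count)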


def generateY(dataset, labels):
    # Build a sparse binary indicator matrix: entry (i, j) is 1 when
    # ad i carries label j.
    X_s = []
    Y_s = []

    for i, word in tqdm(enumerate(dataset['labels'])):
        word = str(word).split()
        for j, label in enumerate(labels):
            if label in word:
                X_s.append(i)
                Y_s.append(j)
    Y = csc_matrix((np.ones(len(Y_s)), (X_s, Y_s)), dtype=np.int_)
    print(Y)
    return Y
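

# Toy example: for dataset['labels'] = ['a b', 'b'] and labels = ['a', 'b'],
# generateY produces the 2x2 indicator matrix [[1, 1], [0, 1]].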


def normalize_dataset(dataset):
    # Alternative preprocessing path for the TF-IDF pipeline: lowercase,
    # lemmatise, drop stopwords, and keep each description as a token list
    # (dummy_fun in transform_to_tfidf expects pre-tokenized input; the
    # original code also called tokenize() here with the wrong arity,
    # which would have raised a TypeError).
    stopwords = lemmatise_stopwords(get_stopwords())
    for i in tqdm(dataset.index):
        value = dataset.at[i, 'description']
        description_array = re.split(r'\W+', str(value).lower())
        description_array = lemmatise(description_array)
        description_array = remove_stopwords(stopwords, description_array)
        dataset.at[i, 'description'] = description_array
    return dataset['description']


def tokenize(text, morfeusz, stopwords):
    # Lowercase, split on non-word characters, lemmatise with Morfeusz,
    # strip Polish diacritics, and drop stopwords.
    words = re.split(r'\W+', str(text).lower())
    tokens = []
    for word in words:
        if word != ' ' and word != '':
            # [0][2][1] picks the lemma of the first Morfeusz interpretation.
            word = morfeusz.analyse(word)[0][2][1]
            word = unicodedata.normalize('NFKD', word).replace(u'ł', 'l').encode('ascii', 'ignore').decode("utf-8")
            if word not in stopwords and word != '':
                tokens.append(word)
    return " ".join(tokens)
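

# Example (behaviour assumed from Morfeusz's first-interpretation lemma):
# tokenize('Sprzedam rower górski', morfeusz, stopwords) lemmatises each
# word and strips diacritics, yielding something like 'sprzedac rower gorski'.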


# Module-level analyzer shared by lemmatise(); constructing Morfeusz is
# relatively expensive, so do it once.
morf = morfeusz2.Morfeusz()


def lemmatise(description_array):
    # Replace every word with the lemma of its first Morfeusz interpretation
    # and drop words Morfeusz cannot analyse. Building a new list avoids the
    # original's remove-while-iterating bug, and the condition now checks
    # len(result) rather than len(word), matching lemmatise_stopwords().
    lemmas = []
    for word in description_array:
        result = morf.analyse(word)
        if len(result) > 0:
            lemmas.append(result[0][2][1])
    return lemmas


def remove_stopwords(stopwords, description_array):
    # Filter with a comprehension; the original remove-while-iterating
    # loop silently skipped elements.
    return [word for word in description_array if word not in stopwords]


def lemmatise_stopwords(stopwords):
    # Lemmatise the stopword list itself so it matches lemmatised text,
    # dropping unanalysable words, then deduplicate and sort. A new list
    # again avoids mutating the input while iterating over it.
    morpheus = morfeusz2.Morfeusz()
    lemmatised = []
    for word in stopwords:
        result = morpheus.analyse(word)
        if len(result) > 0:
            lemmatised.append(result[0][2][1])
    lemmatised = sorted(set(lemmatised))
    print(lemmatised)
    return lemmatised


def transform_to_tfidf(X):
    # Vectorize pre-tokenized documents with a fixed, cached vocabulary.
    tfidf = TfidfVectorizer(
        analyzer='word',
        token_pattern=None,
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        vocabulary=get_tfidf_vocabulary(X, 1000))
    tfidf.fit(X)
    X = tfidf.transform(X)
    return X
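

# Hedged sketch of the alternative multi-label pipeline that the otherwise
# unused MLkNN import points at; nothing in main() runs this, and k=3 is an
# assumed hyperparameter, not a value tuned anywhere in this script.
def train_mlknn_sketch():
    train = get_train_data()
    label_dict, _, _ = labelise(train)
    Y = generateY(train, list(label_dict.keys()))
    X = transform_to_tfidf(normalize_dataset(train))
    classifier = MLkNN(k=3)
    classifier.fit(X, Y)  # expects sparse feature and indicator matrices
    return classifier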


def get_tfidf_vocabulary(X, max_features):
    # Build (or load a cached copy of) a max_features-term vocabulary.
    filename = "vocabulary.npy"
    filename = "".join([str(max_features), "_", filename])
    path = "/".join(['data', filename])
    if os.path.isfile(path):
        return np.load(path)
    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=dummy_fun,
        preprocessor=dummy_fun,
        token_pattern=None)
    tfidf.fit(X)
    # Note: vocabulary_ maps term -> column index, and sklearn assigns
    # indices in alphabetical order, so this keeps the alphabetically last
    # terms rather than the most frequent ones.
    sorted_vocabulary = sorted(tfidf.vocabulary_.items(), key=operator.itemgetter(1), reverse=True)
    vocabulary = []
    for i, (term, _) in enumerate(sorted_vocabulary):
        vocabulary.append(str(term))
        if i + 1 >= max_features:
            break
    np.save(path, vocabulary)
    return vocabulary


if __name__ == "__main__":
    start_time = time.time()
    main()
    print("Time: ", time.time() - start_time)