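"""BM25-based title classifier with query expansion.

Each training category's titles are concatenated into a single BM25 document;
test titles are then classified by retrieving the best-matching category
document with BM25Plus. Tokens come from spaCy (pt_core_news_sm, so the data
is presumably Portuguese product titles) and are augmented with word bigrams.
Most test queries are expanded with the tokens of their nearest-neighbour test
queries (cosine similarity over BM25 score vectors) before retrieval; a
handful of hand-written rules override the expansion for known-ambiguous
titles.
"""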
import os
import re

import numpy as np
import pandas as pd
import spacy
from loguru import logger
from nltk.util import ngrams
from rank_bm25 import BM25Plus
from sklearn.metrics.pairwise import cosine_similarity


def preprocessing(labels=True, spacy_lang='pt_core_news_sm', input_file='train.csv'):
    """Tokenize titles and augment each document with its word bigrams.

    With labels=True, training titles are first concatenated per category so
    that each BM25 document represents one category; returns (tokens, labels).
    With labels=False, the rows of input_file are tokenized as-is.
    """
    nlp = spacy.load(spacy_lang)
    tokenizer = nlp.tokenizer
    df = pd.read_csv(input_file)
    token_labels = None
    if labels:
        # Collapse all training titles of a category into one document.
        df = df.groupby('category', as_index=False).agg({'title': ' '.join})
        token_labels = df['category'].values.tolist()
    tokens = []
    for item in df['title'].values.tolist():
        # Replace runs of digits with a single placeholder token.
        processed_item = re.sub('[0-9]+', '__label_DIGIT__', item.lower())
        tmp = tokenizer(processed_item)
        tokens.append([str(x) for x in tmp if not (x.is_punct or x.is_stop)])
    for idx, doc_tokens in enumerate(tokens):
        # Append space-joined bigrams so two-word phrases can match as units.
        token_ngrams = [' '.join(x) for x in ngrams(doc_tokens, 2)]
        tokens[idx] = doc_tokens + token_ngrams
    if labels:
        return tokens, token_labels
    return tokens

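# Illustrative sketch only: for a title like "escova dental 3 unidades",
# preprocessing() would produce roughly the following (the exact output
# depends on the spaCy model's stop-word and punctuation flags):
#   unigrams: ['escova', 'dental', '__label_DIGIT__', 'unidades']
#   bigrams:  ['escova dental', 'dental __label_DIGIT__',
#              '__label_DIGIT__ unidades']
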
class Expansion:
    """Query-to-query similarity index built from BM25 score vectors."""

    def __init__(self):
        self.matrix = None

    def run(self, matrix):
        # Each row of `matrix` holds one query's BM25 scores over all
        # documents; cosine similarity between rows compares queries.
        self.matrix = cosine_similarity(matrix)

    def __call__(self, query_id, k=5):
        # Indices of the k most similar queries, excluding the query itself.
        return np.argsort(self.matrix[query_id])[::-1][1:k + 1]

    def save(self, target):
        np.save(target, self.matrix)

    def load(self, target):
        self.matrix = np.load(target)

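# Minimal sketch of how Expansion behaves, on a made-up 3x4 score matrix
# (illustrative values only, not part of the pipeline):
#
#   scores = np.array([[1.0, 0.0, 0.5, 0.0],
#                      [0.9, 0.1, 0.4, 0.0],
#                      [0.0, 1.0, 0.0, 0.7]])
#   exp = Expansion()
#   exp.run(scores)  # 3x3 cosine-similarity matrix between the rows
#   exp(0, k=1)      # -> array([1]): row 1 is the nearest neighbour of row 0
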
def calc_query(submission):
    # Unused leftover, presumably meant for a multiprocessing pool; relies
    # on the module-level bm25 and corpus set up in __main__.
    return bm25.get_top_n(submission, corpus, n=1)[0]


def load_vectors(cache=True, name='vectors.npy'):
    """Score every test query against the BM25 index, caching the matrix.

    Relies on the module-level bm25 and test_tokens set up in __main__.
    """
    if cache and os.path.exists(name):
        return np.load(name)
    expansion_matrix = np.array([bm25.get_scores(item) for item in test_tokens])
    np.save(name, expansion_matrix)
    return expansion_matrix


def load_expansion(expansion_matrix, cache=True, name='expansion.npy'):
    """Build the query-to-query similarity index, or load it from cache."""
    expansion = Expansion()
    if cache and os.path.exists(name):
        expansion.load(name)
    else:
        expansion.run(expansion_matrix)
        expansion.save(name)
    return expansion

if __name__ == '__main__':
    tokens, labels = preprocessing()
    # Map each category document (its space-joined tokens) back to its label.
    label_dict = {' '.join(i): v for i, v in zip(tokens, labels)}
    bm25 = BM25Plus(tokens, k1=4.5)
    test_tokens = preprocessing(labels=False, input_file='test.csv')

    corpus = [' '.join(x) for x in tokens]

    expansion_matrix = load_vectors(cache=False)

    logger.debug('Calculating cosine similarity matrix.')
    expansion = load_expansion(expansion_matrix, cache=False)
    logger.debug('Starting queries...')

    # Hand-tuned rule: these first tokens identify the category on their
    # own, so the rest of the title only adds noise.
    first_word_list = ['antena', 'receptor', 'conversor', 'estante',
                       'comando', 'retentor', 'radiador', 'receiver']

    answers = []
    model_output = []
    for tokenized_query_idx, tokenized_query in enumerate(test_tokens):
        submission = list(tokenized_query)
        if tokenized_query[0] in first_word_list:
            submission = [tokenized_query[0]]
        elif tokenized_query[0] == 'mangueira' and 'radiador' in tokenized_query:
            # Radiator hoses: drop 'radiador' so 'mangueira' drives the match.
            submission = [x for x in tokenized_query if x != 'radiador']
        elif 'escova dente' in tokenized_query or 'escova dental' in tokenized_query:
            # Bigram tokens catch toothbrushes directly (stop-word removal
            # turns 'escova de dente' into the bigram 'escova dente').
            submission = ['escova dental']
        else:
            # Pseudo-relevance feedback: pad the query with the tokens of its
            # 10 nearest-neighbour test queries before retrieval.
            for idx in expansion(tokenized_query_idx, k=10):
                submission.extend(test_tokens[idx])
        answers.append(bm25.get_top_n(submission, corpus, n=1)[0])
        if not tokenized_query_idx % 100:
            logger.debug('{0} / {1} ({2:.1%})'.format(
                tokenized_query_idx, len(test_tokens),
                tokenized_query_idx / len(test_tokens)))

    # Translate each retrieved category document back into its label.
    for answer in answers:
        model_output.append(label_dict[answer])

    final_df = [{'id': idx, 'category': item}
                for idx, item in enumerate(model_output)]
    pd.DataFrame(final_df).to_csv('submission.csv', index=False)
    logger.debug('End of queries.')
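
# Expected file layout, inferred from the code above rather than any spec:
#   train.csv       -> columns 'category' and 'title'
#   test.csv        -> column 'title'
#   submission.csv  <- columns 'id' and 'category', one row per test title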