Advertisement
Guest User

Untitled

a guest
Jul 17th, 2019
117
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.26 KB | None | 0 0
  1. class MeanEmbeddingVectorizer(object):
  2.  
  3. def __init__(self, word_model):
  4. self.word_model = word_model
  5. self.vector_size = word_model.wv.vector_size
  6.  
  7. def fit(self): # comply with scikit-learn transformer requirement
  8. return self
  9.  
  10. def transform(self, docs): # comply with scikit-learn transformer requirement
  11. doc_word_vector = self.word_average_list(docs)
  12. return doc_word_vector
  13.  
  14. def word_average(self, sent):
  15. """
  16. Compute average word vector for a single doc/sentence.
  17.  
  18.  
  19. :param sent: list of sentence tokens
  20. :return:
  21. mean: float of averaging word vectors
  22. """
  23. mean = []
  24. for word in sent:
  25. if word in self.word_model.wv.vocab:
  26. mean.append(self.word_model.wv.get_vector(word))
  27.  
  28. if not mean: # empty words
  29. # If a text is empty, return a vector of zeros.
  30. logging.warning("cannot compute average owing to no vector for {}".format(sent))
  31. return np.zeros(self.vector_size)
  32. else:
  33. mean = np.array(mean).mean(axis=0)
  34. return mean
  35.  
  36.  
  37. def word_average_list(self, docs):
  38. """
  39. Compute average word vector for multiple docs, where docs had been tokenized.
  40.  
  41. :param docs: list of sentence in list of separated tokens
  42. :return:
  43. array of average word vector in shape (len(docs),)
  44. """
  45. return np.vstack([self.word_average(sent) for sent in docs])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement