Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class MeanEmbeddingVectorizer(object):
    """Represent tokenized documents by the mean of their word vectors.

    The wrapped ``word_model`` must expose a ``wv`` attribute providing
    ``vector_size``, ``get_vector(word)`` and a vocabulary mapping
    (``key_to_index`` as in gensim >= 4.0, or ``vocab`` as in older
    releases).
    """

    def __init__(self, word_model):
        """Store the embedding model and remember its vector dimensionality.

        :param word_model: trained embedding model with a ``wv`` attribute
        """
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def _vocabulary(self):
        """Return the container used to test whether a word has a vector."""
        keyed_vectors = self.word_model.wv
        # gensim 4.0 replaced ``wv.vocab`` with ``wv.key_to_index``; probe the
        # new name first because touching ``vocab`` on 4.x raises.
        vocabulary = getattr(keyed_vectors, "key_to_index", None)
        if vocabulary is None:
            vocabulary = keyed_vectors.vocab
        return vocabulary

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``.

        ``X`` and ``y`` are accepted (and ignored) so that scikit-learn style
        callers can invoke ``fit(X, y)``; calling ``fit()`` with no arguments
        keeps working.

        :return: this vectorizer
        """
        return self

    def transform(self, docs):
        """Convert tokenized documents to their averaged word vectors.

        :param docs: iterable of documents, each an iterable of tokens
        :return: array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def word_average(self, sent):
        """Compute the average word vector for a single doc/sentence.

        Tokens without a vector in the model are skipped.

        :param sent: list of sentence tokens
        :return: array of shape (vector_size,); all zeros (with a logged
            warning) when no token of ``sent`` has a vector
        """
        keyed_vectors = self.word_model.wv
        vocabulary = self._vocabulary()
        vectors = [
            keyed_vectors.get_vector(word)
            for word in sent
            if word in vocabulary
        ]
        if not vectors:
            # Empty or fully out-of-vocabulary text: fall back to zeros so the
            # output keeps one row per document.
            logging.warning(
                "cannot compute average owing to no vector for %s", sent
            )
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def word_average_list(self, docs):
        """Compute average word vectors for multiple tokenized docs.

        :param docs: iterable of documents, each a list of tokens
        :return: array of shape (number of docs, vector_size); an empty
            ``docs`` yields an array of shape (0, vector_size)
        """
        averages = [self.word_average(sent) for sent in docs]
        if not averages:
            # np.vstack raises ValueError on an empty sequence.
            return np.zeros((0, self.vector_size))
        return np.vstack(averages)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement