Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from revscoring.features import wikitext
- from revscoring.features.modifiers import max, sub
- from revscoring.languages import english
- from revscoring import Feature
- from revscoring.datasources import Datasource, revision_oriented
- from revscoring.dependencies import solve
- from gensim.models.keyedvectors import KeyedVectors
- import numpy as np
- import pdb
# Module-level cache for the word-vector lookup. It is filled by the first
# call to load_word2vec() and returned unchanged by every later call.
word2vec = None

# Number of components in every word vector handled by this module.
VECTORS_DIM = 300


def load_word2vec(filepath):
    """Return the word-vector lookup, creating it on first use.

    The result is cached in the module-level ``word2vec`` variable, so only
    the first call does any work; later calls return the cached object and
    ignore ``filepath``.

    Args:
        filepath: Path to a binary word2vec file. When empty (``''`` or
            ``None``), no file is read and a random placeholder is used
            instead.

    Returns:
        The gensim ``KeyedVectors`` loaded from ``filepath``, or, when no
        path is given, a 1-D numpy array of ``VECTORS_DIM`` values drawn
        uniformly from [0, 1).
    """
    global word2vec
    if word2vec is not None:
        return word2vec

    if filepath:
        # Only the first 20000 vectors of the file are read.
        word2vec = KeyedVectors.load_word2vec_format(
            filepath, binary=True, limit=20000)
    else:
        # No model file supplied: keep the stand-in used for dry runs.
        # NOTE(review): this array cannot be indexed by token strings, so
        # token lookups against it always fail.
        word2vec = np.random.uniform(0, 1, size=VECTORS_DIM)
    return word2vec
def get_word_vectors(non_stop_tokens):
    """Average the word vectors of the given tokens.

    Each token is looked up in the table returned by ``load_word2vec('')``.
    Tokens that the table cannot resolve are skipped.

    Args:
        non_stop_tokens: Iterable of tokens to look up.

    Returns:
        A numpy array of shape ``(1, VECTORS_DIM)`` holding the mean of the
        vectors that were found. When no token could be resolved (including
        an empty input) the array is all zeros rather than NaN.
    """
    lookup = load_word2vec('')
    total = np.zeros((1, VECTORS_DIM))
    words_added = 0
    for tok in non_stop_tokens:
        try:
            vec = lookup[tok]
        except (KeyError, IndexError):
            # Token is not in the table: a mapping-style lookup raises
            # KeyError, a numpy array indexed by a string raises IndexError.
            continue
        total += vec
        words_added += 1

    if words_added == 0:
        # Nothing matched: avoid the 0/0 division that would yield NaNs.
        return total
    return total / words_added
# Datasource named "word_vectors": its value is produced by
# get_word_vectors() from the English non-stopword tokens of the revision.
word_vectors = Datasource(
    "word_vectors",
    get_word_vectors,
    depends_on=[
        english.stopwords.revision.datasources.non_stopwords,
    ],
)
def get_feature_at(idx):
    """Build the Feature that exposes one component of the word vector.

    Args:
        idx: Zero-based position of the component to expose.

    Returns:
        A ``Feature`` named ``"word_vector_<idx>"`` that depends on the
        ``word_vectors`` datasource and returns a ``float``.
    """
    def component(vec):
        # get_word_vectors builds its result from np.zeros((1, VECTORS_DIM)),
        # so plain vec[idx] would select a whole row (idx 0) or fail
        # (idx >= 1). Flatten first so idx addresses a single component.
        return float(np.ravel(vec)[idx])

    return Feature(
        "word_vector_{}".format(idx),
        component,
        depends_on=[word_vectors],
        returns=float,
    )
# One Feature per word-vector component; sized from VECTORS_DIM so the
# feature count cannot drift from the vector dimensionality.
features = [get_feature_at(i) for i in range(VECTORS_DIM)]

# Solve every feature against a fixed sample text, injected through the
# cache in place of the revision text.
vectors = solve(
    features,
    cache={revision_oriented.revision.text: 'Hey ther, how are you?'},
)
Add Comment
Please, Sign In to add comment