Advertisement
Guest User

Untitled

a guest
Aug 20th, 2019
102
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.38 KB | None | 0 0
  1. def clean_text(text):
  2.  
  3. text = text.translate(string.punctuation)
  4.  
  5. text = text.lower().split()
  6.  
  7. stops = set(stopwords.words("english"))
  8. text = [w for w in text if not w in stops]
  9.  
  10. text = " ".join(text)
  11. text = re.sub(r"[^ws]", " ",text)
  12. text = re.sub(r"[^A-Za-z0-9^,!./'+-=]", " ",text)
  13.  
  14. text = text.split()
  15. lemmatizer = WordNetLemmatizer()
  16. lemmatized_words = [lemmatizer.lemmatize(w) for w in text]
  17. text = " ".join(lemmatized_words)
  18.  
  19.  
  20. return text
  21.  
  22. data['text'] = data['text'].map(lambda x: clean_text(x))
  23.  
  24. def build_corpus(data):
  25. "Creates a list of lists containing words from each sentence"
  26. corpus = []
  27. for col in ['text']:
  28. for sentence in data[col].iteritems():
  29. word_list = sentence[1].split(" ")
  30. corpus.append(word_list)
  31. return corpus
  32.  
# Build the per-document word lists from the cleaned DataFrame.
corpus = build_corpus(data)

from gensim.models import word2vec
# Train skip-gram (sg=1) embeddings: 100-dim vectors, 20-word context
# window, words seen fewer than 20 times are dropped from the vocabulary.
# NOTE(review): `size=` and `model.wv.vocab` below are gensim < 4.0 API;
# gensim >= 4.0 renamed them to `vector_size=` / `key_to_index` — confirm
# the installed gensim version.
model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=20, workers=12, sg=1)

# Every word retained in the trained embedding vocabulary.
words = list(model.wv.vocab)
  39.  
# Tokenizer/pad_sequences are presumably keras.preprocessing.text /
# keras.preprocessing.sequence — imports are not visible in this chunk; verify.
tokenizer = Tokenizer()
X = corpus
# Learn the word -> integer-index mapping from the corpus.
tokenizer.fit_on_texts(X)
# Convert each document to its sequence of word indices.
sequences = tokenizer.texts_to_sequences(X)
# Pad/truncate every sequence to a fixed length of 10000 tokens
# (zero-padded on the left by default).
X = pad_sequences(sequences, maxlen=10000)
  45.  
  46. embedding_vector_size=100
  47.  
  48. vocab_size = len(tokenizer.word_index) + 1
  49. embedding_matrix = np.zeros((vocab_size, embedding_vector_size))
  50. for word, i in tokenizer.word_index.items():
  51. embedding_vector = model.wv[word]
  52. if embedding_vector is not None:
  53. embedding_matrix[i] = embedding_vector
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement