Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def clean_text(text):
    """Normalize a raw text string for embedding training.

    Steps: strip punctuation, lowercase, drop English stopwords,
    remove non-word characters, then lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw input sentence/document.

    Returns
    -------
    str
        Space-joined cleaned, lemmatized tokens.
    """
    # str.translate() needs a translation table, not a plain string —
    # the original passed string.punctuation directly, which (mis)maps
    # control characters instead of deleting punctuation.
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    # The original pattern r"[^ws]" had its backslashes eaten (it removed
    # every char except literal 'w'/'s'); restore the intended \w / \s.
    text = re.sub(r"[^\w\s]", " ", text)
    # Keep only a conservative charset; '+', '-', '=' are escaped-safe at
    # the end of the class (the original "+-=" formed an accidental range).
    text = re.sub(r"[^A-Za-z0-9^,!./'=+-]", " ", text)
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in text]
    return " ".join(lemmatized_words)
- data['text'] = data['text'].map(lambda x: clean_text(x))
def build_corpus(data):
    """Create a list of lists containing words from each sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'text' column of whitespace-joined token strings.

    Returns
    -------
    list[list[str]]
        One word list per row of the 'text' column.
    """
    corpus = []
    for col in ['text']:
        # Series.iteritems() was removed in pandas 2.0; items() yields the
        # same (index, value) pairs and works on all supported versions.
        for _, sentence in data[col].items():
            corpus.append(sentence.split(" "))
    return corpus
corpus = build_corpus(data)

from gensim.models import word2vec

# Train a skip-gram (sg=1) Word2Vec model on the tokenized corpus.
# NOTE: gensim >= 4.0 renamed the `size` kwarg to `vector_size` and removed
# `model.wv.vocab` (replaced by `model.wv.key_to_index`) — the original
# spelling crashes on any current gensim release.
model = word2vec.Word2Vec(
    corpus,
    vector_size=100,   # embedding dimensionality (was `size` pre-4.0)
    window=20,
    min_count=20,      # words rarer than this are dropped from the vocab
    workers=12,
    sg=1,
)
words = list(model.wv.key_to_index)
tokenizer = Tokenizer()
X = corpus
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
# maxlen=10000 pads/truncates every sequence to 10k tokens — presumably
# intentional for this corpus, but verify against typical document length.
X = pad_sequences(sequences, maxlen=10000)

embedding_vector_size = 100  # must match the Word2Vec vector_size above
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding
embedding_matrix = np.zeros((vocab_size, embedding_vector_size))
for word, i in tokenizer.word_index.items():
    # Words below Word2Vec's min_count are absent from the trained vectors;
    # indexing model.wv[word] unconditionally raises KeyError for them (the
    # original `is not None` check could never fire). Guard membership first;
    # out-of-vocabulary rows stay all-zero.
    if word in model.wv:
        embedding_matrix[i] = model.wv[word]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement