Advertisement
Not a member of Pastebin yet?
Sign Up — it unlocks many cool features!
import gensim
from gensim import corpora, similarities, models
import os
"""
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in documents]
# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once]
for text in texts]
"""

# Directory of pre-filtered hashtag documents; each line of each file is
# treated as one whitespace-separated document.
DATA_DIR = "/home/ayush/MajorProject/hashtagspace/filtered_unique"

# Tokenize every line of every file under DATA_DIR into a list of word lists.
final_text = []
for fname in os.listdir(DATA_DIR):
    # 'with' closes each file handle; the original opened one file per
    # iteration and never closed it, leaking a descriptor per input file.
    with open(os.path.join(DATA_DIR, fname)) as fh:
        for line in fh:
            final_text.append(line.split())

# Guard against an empty corpus before peeking at the first document
# (the original raised IndexError when DATA_DIR had no content).
# Parenthesized print works identically on Python 2 and Python 3.
if final_text:
    print(final_text[0])

"""
dictionary = corpora.Dictionary(texts)
dictionary.save('questions.dict');
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('questions.mm', corpus)
mm = corpora.MmCorpus('questions.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=100, update_every=0, chunksize=19188, passes=20)
print lda.print_topics
"""
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement