Advertisement
Guest User

Untitled

a guest
Oct 31st, 2014
138
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.17 KB | None | 0 0
  1. import gensim
  2. from gensim import corpora, similarities, models
  3. import os
  4.  
  5.  
  6. """
  7. # remove common words and tokenize
  8. stoplist = set('for a of the and to in'.split())
  9. texts = [[word for word in document.lower().split() if word not in stoplist]
  10. for document in documents]
  11.  
  12. # remove words that appear only once
  13. all_tokens = sum(texts, [])
  14. tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
  15. texts = [[word for word in text if word not in tokens_once]
  16. for text in texts]
  17. """
  18. final_text = []
  19. for fname in os.listdir("/home/ayush/MajorProject/hashtagspace/filtered_unique"):
  20. for line in open(os.path.join("/home/ayush/MajorProject/hashtagspace/filtered_unique", fname)):
  21. final_text.append(line.split())
  22.  
  23. print final_text[0]
  24.  
  25. """
  26. dictionary = corpora.Dictionary(texts)
  27. dictionary.save('questions.dict');
  28. corpus = [dictionary.doc2bow(text) for text in texts]
  29. corpora.MmCorpus.serialize('questions.mm', corpus)
  30.  
  31. mm = corpora.MmCorpus('questions.mm')
  32. lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=100, update_every=0, chunksize=19188, passes=20)
  33.  
  34. print lda.print_topics
  35. """
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement