Advertisement
Not a member of Pastebin yet?
Sign Up — it unlocks many cool features!
import gensim
from gensim import corpora, similarities, models
import os
"""
# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
for document in documents]
# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once]
for text in texts]
"""

# Directory of pre-filtered hashtag documents; each line of each file is
# treated as one whitespace-separated document.
DATA_DIR = "/home/ayush/MajorProject/hashtagspace/filtered_unique"

# Tokenize every line of every file under DATA_DIR into a list of word lists.
final_text = []
for fname in os.listdir(DATA_DIR):
    # 'with' closes each file handle; the original opened one file per
    # iteration and never closed it, leaking a descriptor per input file.
    with open(os.path.join(DATA_DIR, fname)) as fh:
        for line in fh:
            final_text.append(line.split())

# Guard against an empty corpus before peeking at the first document
# (the original raised IndexError when DATA_DIR had no content).
# Parenthesized print works identically on Python 2 and Python 3.
if final_text:
    print(final_text[0])

"""
dictionary = corpora.Dictionary(texts)
dictionary.save('questions.dict');
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('questions.mm', corpus)
mm = corpora.MmCorpus('questions.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=100, update_every=0, chunksize=19188, passes=20)
print lda.print_topics
"""
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement