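# LDA topic extraction over TED talk descriptions and transcripts
# (TED_dataset/ted_talks-10-Sep-2012.json dump).
#
# Assumed setup (a sketch, not pinned versions):
#   pip install gensim nltk stop-words
#   python -c "import nltk; nltk.download('wordnet'); nltk.download('omw-1.4')"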
import json

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words

import gensim
from gensim import corpora
def main():
    with open('TED_dataset/ted_talks-10-Sep-2012.json') as f:
        talks = json.loads(f.read())

    tag_corpus = []
    desc_corpus = []
    for i, talk in enumerate(talks):
        if i == 1:  # only look at the first talk for now
            break
        # Combine the description and transcript into one document.
        desc_corpus.append(talk['description'] + ' ' + talk['transcript'])
        for tag in talk['related_tags']:
            tag_corpus.append(tag)
    # print(len(tag_corpus))
    print("Data Fetched")
    tag_corpus = list(set(tag_corpus))  # remove duplicate tags
    tag_corpus = [x.lower() for x in tag_corpus]
    tag_corpus = lemmatize(tag_corpus)
    tag_corpus = list(set(tag_corpus))  # remove duplicates introduced by lemmatizing
    print("Passing Control to LDAMatrix")
    LDAMatrix(desc_corpus, tag_corpus)
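# Each record in the dataset dump is assumed to carry at least these fields
# (inferred from the lookups above; the exact schema is not confirmed):
#   {"description": "...", "transcript": "...", "related_tags": ["tag1", "tag2", ...]}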
def lemmatize(words):
    # Return the WordNet lemma of each word in the list.
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(word) for word in words]
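# For example (assuming the 'wordnet' corpus is installed):
#   lemmatize(['technologies', 'ideas']) -> ['technology', 'idea']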
def LDAMatrix(desc_corpus, tag_corpus):
    tokenizer = RegexpTokenizer(r'\w+')
    print("Tag Corpus:\n")
    print(tag_corpus)
    print("_____________________________")
    # Create the English stop-words list.
    en_stop = get_stop_words('en')
    lmtzr = WordNetLemmatizer()
    for doc in desc_corpus:
        raw = doc.lower()
        print(len(raw))
        tokens = tokenizer.tokenize(raw)  # clean and tokenize the document string
        stopped_tokens = [t for t in tokens if t not in en_stop]  # remove stop words
        print(len(stopped_tokens))
        stemmed_tokens = lemmatize(stopped_tokens)
        texts = [stemmed_tokens]  # one tokenized document per model
        # Turn the tokenized document into an id <-> term dictionary.
        dictionary = corpora.Dictionary(texts)
        # print(dictionary)
        # Convert the tokenized document into a document-term (bag-of-words) matrix.
        corpus = [dictionary.doc2bow(text) for text in texts]
        print(corpus)
        # Generate the LDA model for this document.
        ldamodel = gensim.models.ldamodel.LdaModel(
            corpus, num_topics=500, id2word=dictionary, passes=300)
        # print_topics returns (topic_id, topic_string) pairs.
        LDA_Row = ldamodel.print_topics(num_topics=1, num_words=500)
        print("Data Parsed and Split")
        for term in LDA_Row[0][1].split('+'):
            w = term.split('*')[1].strip().strip('"')
            word = lmtzr.lemmatize(w)
            wordv = lmtzr.lemmatize(w, 'v')
            print(wordv, word)
            # if word is not wordv:
            #     print(word, wordv)
            # print("Nothing to Add")
        print("________________________________________________")
if __name__ == '__main__':
    main()
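# A (hypothetical) topic row from print_topics looks like:
#   (0, '0.013*"technology" + 0.011*"world" + 0.009*"people" + ...')
# which is why LDAMatrix splits each row on '+' and then on '*' to recover the words.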