import json
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora
import gensim

def main():
    # Load the TED talks dump; collect description+transcript text and related tags.
    t = open('TED_dataset/ted_talks-10-Sep-2012.json')
    talks = json.loads(t.read())
    i = 1
    tag_corpus = []
    desc_corpus = []
    for j in talks:
        if i == 2:  # only process the first talk for now
            break
        else:
            desc_corpus.append(j['description'][0] + j['transcript'])
            i = i + 1
            for tag in j['related_tags']:
                tag_corpus.append(tag)
    # print len(tag_corpus)
    print "Data Fetched"
    tag_corpus = list(set(tag_corpus))  # remove duplicate tags
    tag_corpus = [x.lower() for x in tag_corpus]
    tag_corpus = lemmatize(tag_corpus)
    tag_corpus = list(set(tag_corpus))  # remove duplicates again after lemmatizing
    print "Passing Control to LDAMatrix"
    LDAMatrix(desc_corpus, tag_corpus)

def lemmatize(tag_corpus):
    # Return the WordNet (noun) lemma of every word in the list.
    lmtzr = WordNetLemmatizer()
    parameters = []
    for tag in tag_corpus:
        word = lmtzr.lemmatize(tag)
        parameters.append(word)
    return parameters


def LDAMatrix(desc_corpus, tag_corpus):
    tokenizer = RegexpTokenizer(r'\w+')
    print "Tag Corpus:\n"
    print tag_corpus
    print "_____________________________"
    # create English stop words list
    en_stop = get_stop_words('en')
    for doc in desc_corpus:
        texts = []
        raw = doc.lower()
        print len(raw)
        tokens = tokenizer.tokenize(raw)  # clean and tokenize document string
        stopped_tokens = [tok for tok in tokens if tok not in en_stop]  # remove stop words from tokens
        print len(stopped_tokens)
        stemmed_tokens = lemmatize(stopped_tokens)
        texts.append(stemmed_tokens)  # one-document corpus for this talk
        dictionary = corpora.Dictionary(texts)  # turn the tokenized document into an id <-> term dictionary
        # print dictionary
        corpus = [dictionary.doc2bow(text) for text in texts]  # convert tokenized documents into a document-term matrix
        print corpus
        # generate an LDA model for this document
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=500, id2word=dictionary, passes=300)
        LDA_Row = ldamodel.print_topics(num_topics=1, num_words=500)
        lmtzr = WordNetLemmatizer()
        print "Data Parsed and Split"
        # each topic string looks like "weight*word + weight*word + ..."
        for l in LDA_Row[0].split('+'):
            w = l.split('*')[1].strip()
            word = lmtzr.lemmatize(w)        # noun lemma
            wordv = lmtzr.lemmatize(w, 'v')  # verb lemma
            print wordv, word
            # if word != wordv:
            #     print word, wordv
            # print "Nothing to Add"
        print "________________________________________________"


if __name__ == '__main__':
    main()
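
Note: the topic-parsing loop in LDAMatrix assumes the older gensim behaviour where print_topics() returns plain strings of the form "weight*word + weight*word + ..." (newer releases return (topic_id, string) tuples and quote each word, so the split indices would need adjusting). A minimal sketch of that parsing, using a made-up example string:

# Hypothetical topic string in the old print_topics() format.
sample_topic = "0.046*music + 0.040*technology + 0.038*people"
for term in sample_topic.split('+'):
    weight, word = term.split('*')
    print word.strip(), float(weight)

The script also assumes the WordNet data for NLTK is already installed (nltk.download('wordnet')).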