Advertisement
Guest User

LDA Gensim

a guest
Jan 10th, 2018
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.21 KB | None | 0 0
  1. from urllib2 import Request as request
  2. import urllib2
  3. from nltk.tokenize import RegexpTokenizer
  4. from nltk.corpus import stopwords
  5. from nltk.stem.porter import PorterStemmer
  6. from gensim import corpora, models
  7. from bs4 import BeautifulSoup
  8. import gensim
  9.  
  10. url = "http://www.planejamento.gov.br/noticias/governo-desbloqueia-r-5-bilhoes-do-orcamento-de-2017"
  11. html = urllib2.urlopen(url).read().decode('utf8')
  12.  
  13. tokenizer = RegexpTokenizer(r'\w+')
  14.  
  15. pt_stop = stopwords.words('portuguese')
  16.  
  17. p_stemmer = PorterStemmer()
  18.  
  19. raw = BeautifulSoup(html, "html.parser")
  20. raw = raw.find(id='content')
  21. for script in raw(["script", "style"]):
  22. script.decompose() # retira todos os scripts
  23.  
  24. doc_set = [raw.get_text()]
  25.  
  26. texts = []
  27.  
  28. for token in doc_set:
  29.  
  30. tokens = tokenizer.tokenize(token.lower())
  31. stopped_tokens = [i for i in tokens if not i in pt_stop]
  32.  
  33. stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
  34. texts.append(stemmed_tokens)
  35.  
  36. dictionary = corpora.Dictionary(texts)
  37. corpus = [dictionary.doc2bow(text) for text in texts]
  38.  
  39. ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word = dictionary, passes=20)
  40.  
  41. print('Topicos')
  42. print(ldamodel.print_topics())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement