Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- from urllib2 import Request as request
- import urllib2
- from nltk.tokenize import RegexpTokenizer
- from nltk.corpus import stopwords
- from nltk.stem.porter import PorterStemmer
- from gensim import corpora, models
- from bs4 import BeautifulSoup
- import gensim
# Fetch a Brazilian government news article, strip page boilerplate, and run
# LDA topic modelling (gensim) over the Portuguese body text.
# NOTE(review): this is a Python 2 script (urllib2); porting to Python 3 would
# swap urllib2 for urllib.request.
url = "http://www.planejamento.gov.br/noticias/governo-desbloqueia-r-5-bilhoes-do-orcamento-de-2017"
html = urllib2.urlopen(url).read().decode('utf8')

tokenizer = RegexpTokenizer(r'\w+')           # word-character runs only; drops punctuation
pt_stop = set(stopwords.words('portuguese'))  # set: O(1) membership test in the filter below
p_stemmer = PorterStemmer()                   # NOTE(review): Porter targets English; NLTK's RSLP stemmer is the Portuguese-specific choice

soup = BeautifulSoup(html, "html.parser")
content = soup.find(id='content')             # article body lives inside the #content element
for script in content(["script", "style"]):
    script.decompose()                        # drop all script/style nodes so get_text() is clean text only

doc_set = [content.get_text()]                # single-document corpus

# Preprocess each document: lowercase, tokenize, remove stopwords, stem.
texts = []
for doc in doc_set:
    tokens = tokenizer.tokenize(doc.lower())
    stemmed = [p_stemmer.stem(t) for t in tokens if t not in pt_stop]
    texts.append(stemmed)

# Build the bag-of-words corpus and fit LDA: 3 topics, 20 passes over the corpus.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=20)
print('Topicos')
print(ldamodel.print_topics())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement