Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- from krwordrank.hangle import normalize
- from krwordrank.word import KRWordRank
- from konlpy.tag import Hannanum
def parse(url):
    """Fetch the page at *url* and return its extracted keyword list."""
    print('\x1b[0;30;47mURL\x1b[0m : ' + url)
    response = requests.get(url)
    return keywords(response.text)
def parse_all(articles):
    """Extract keywords for every article URL in *articles*.

    Prints a banner line before, between, and after articles, and returns
    a list with one keyword list per URL, in input order.
    """
    # The original used a local named `keywords`, which shadowed the
    # module-level keywords() function — renamed to `results` for clarity.
    results = []
    # Banner is invariant; build it once instead of three times.
    banner = '\x1b[5;30;44m' + '='*30 + '\x1b[0m'
    print(banner)
    for idx, url in enumerate(articles):
        if idx != 0:
            print(banner)
        results.append(parse(url))
    print(banner)
    return results
def getsoup(html):
    """Parse raw *html* into a BeautifulSoup tree and print the page title."""
    parsed = BeautifulSoup(html, 'html.parser')
    title = parsed.title.text
    print('\x1b[0;30;47mTITLE\x1b[0m : ' + title)
    return parsed
def clean(soup):
    """Return the visible text of *soup*'s <body>, with scripts and styles removed."""
    body = soup.find('body')
    # Drop non-content tags in place before collecting text.
    for tag in body(['script', 'style']):
        tag.decompose()
    return ' '.join(body.stripped_strings)
def keywords(html, beta=0.85, max_iter=10, top_n=10):
    """Extract Korean noun keywords from raw article *html*.

    Args:
        html: raw HTML of the page to analyze.
        beta: KRWordRank damping factor (default 0.85, as before).
        max_iter: number of KRWordRank iterations (default 10, as before).
        top_n: how many top-ranked words to keep before noun filtering
            (default 10, as before).

    Returns:
        List of noun strings produced by Hannanum from the top-ranked words.
    """
    # Collapse all whitespace, then split into rough sentences on '.'.
    texts = ' '.join(clean(getsoup(html)).split()).split('.')
    texts = [normalize(text, english=True, number=True) for text in texts]
    wordrank_extractor = KRWordRank(min_count=5, max_length=10, verbose=True)
    # Renamed from `keywords` — the original shadowed this function's own name.
    ranked, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
    # Keep the top_n highest-scoring words, then reduce them to nouns.
    top_words = sorted(ranked.items(), key=lambda item: item[1], reverse=True)[:top_n]
    result = ' '.join(word for word, score in top_words)
    result = Hannanum().nouns(result)
    print('\x1b[5;30;42mKEYWORDS\x1b[0m : ' + ' '.join(result))
    return result
Add Comment
Please sign in to add a comment.