Guest User

Untitled

a guest
Jul 22nd, 2018
87
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.42 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from krwordrank.hangle import normalize
  4. from krwordrank.word import KRWordRank
  5. from konlpy.tag import Hannanum
  6.  
  7. def parse(url):
  8. print('\x1b[0;30;47mURL\x1b[0m : ' + url)
  9. return keywords(requests.get(url).text)
  10.  
  11. def parse_all(articles):
  12. keywords = []
  13. print('\x1b[5;30;44m' + '='*30 + '\x1b[0m')
  14. for idx, url in enumerate(articles):
  15. if not idx == 0:
  16. print('\x1b[5;30;44m' + '='*30 + '\x1b[0m')
  17. keywords.append(parse(url))
  18. print('\x1b[5;30;44m' + '='*30 + '\x1b[0m')
  19. return keywords
  20.  
  21. def getsoup(html):
  22. soup = BeautifulSoup(html, 'html.parser')
  23. print('\x1b[0;30;47mTITLE\x1b[0m : ' + soup.title.text)
  24. return soup
  25.  
  26. def clean(soup):
  27. soup = soup.find('body')
  28. [s.decompose() for s in soup(['script', 'style'])]
  29. return ' '.join(soup.stripped_strings)
  30.  
  31. def keywords(html):
  32. texts = ' '.join(clean(getsoup(html)).split()).split('.')
  33. texts = [normalize(text, english=True, number=True) for text in texts]
  34. wordrank_extractor = KRWordRank(min_count = 5, max_length = 10, verbose = True)
  35. beta = 0.85
  36. max_iter = 10
  37. keywords, rank, graph = wordrank_extractor.extract(texts, beta, max_iter)
  38. result = ' '.join([word[0] for word in sorted(keywords.items(), key=lambda x:x[1], reverse=True)[:10]])
  39. result = Hannanum().nouns(result)
  40. print('\x1b[5;30;42mKEYWORDS\x1b[0m : ' + ' '.join(result))
  41. return result
Add Comment
Please, Sign In to add comment