Advertisement
Guest User

Untitled

a guest
Oct 15th, 2018
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.19 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup, Tag, NavigableString
  3.  
  4.  
  class Article:
      """A candidate article container built around one ``<div>`` element.

      Attributes:
          div: The element passed to the constructor.
          char_scope: Character count used to rank candidates. Starts at 0
              and is increased from outside the class (see ``find_text``).
          header: The ``<h1>`` element selected by ``find_header``; ``None``
              until that method runs, or when no ``<h1>`` was found.
      """

      def __init__(self, div: "Tag"):
          self.div = div
          self.char_scope = 0
          self.header = None

      def get_article_text(self):
          """Return the full text of the wrapped element via ``get_text()``."""
          return self.div.get_text()

      def get_header_text(self):
          """Return the header text, or ``''`` when no header is set."""
          if self.header is None:
              return ''
          return self.header.get_text()

      @staticmethod
      def rec_header_finder(div: "Tag"):
          """Find the nearest ``<h1>`` by walking up from ``div``.

          Each ancestor of ``div`` is asked for an ``<h1>`` with
          ``find('h1')``, starting with the direct parent, and the first hit
          is returned.

          Returns:
              The first ``<h1>`` found, or ``None`` when the top of the tree
              is reached without finding one. The earlier recursive version
              raised ``AttributeError`` in that case, because it called
              ``find`` on a ``None`` parent.
          """
          ancestor = div.parent
          while ancestor is not None:
              h1 = ancestor.find('h1')
              if h1 is not None:
                  return h1
              ancestor = ancestor.parent
          return None

      def find_header(self):
          """Store the nearest ``<h1>`` above ``self.div`` in ``self.header``.

          ``self.header`` is left as ``None`` when no ``<h1>`` is found.
          """
          # The ancestor walk already starts at the direct parent, so the
          # separate first-level lookup this method used to do is redundant.
          self.header = Article.rec_header_finder(self.div)
  32.  
  33.  
  def find_text(article: Article):
      """Add the amount of directly contained text to ``article.char_scope``.

      Two sources are counted:

      * plain strings that are direct children of ``article.div``;
      * plain strings that are direct children of the ``<p>``, ``<b>`` and
        ``<li>`` elements sitting directly under ``article.div``.

      Only objects whose type is exactly ``NavigableString`` are counted, so
      subclasses of it are skipped.
      """
      # TODO: add a check for script content.

      def direct_text_length(node):
          # Sum the lengths of the plain-string children of one node.
          return sum(
              len(child)
              for child in node.contents
              if type(child) is NavigableString
          )

      counted = direct_text_length(article.div)
      for element in article.div.find_all(['p', 'b', 'li'], recursive=False):
          counted += direct_text_length(element)
      article.char_scope += counted
  45.  
  46.  
  def find_divs(soup: BeautifulSoup):
      """Return the result of ``soup.find_all('div')``: every ``<div>`` found."""
      return soup.find_all('div')
  50.  
  51.  
  def sort_by_scope(article: Article):
      """Sort key: the article's accumulated ``char_scope`` value."""
      scope = article.char_scope
      return scope
  54.  
  55.  
  if __name__ == '__main__':
      # Other pages this script has been pointed at:
      #   https://lenta.ru/news/2018/10/15/bashkirov/
      #   https://habr.com/company/ruvds/blog/426413/
      #   https://lenta.ru/news/2018/10/15/tolokonnikova/
      url = 'https://sobesednik.ru/obshchestvo/20120510-pochemu-zastrelilsya-eks-nachalnik-razvedki-leonid-shebarshin'

      # requests applies no timeout unless one is given, so an unresponsive
      # server would otherwise block this script forever.
      response = requests.get(url, timeout=30)
      # Stop on HTTP errors instead of parsing an error page as an article.
      response.raise_for_status()

      soup = BeautifulSoup(response.content, 'html.parser')
      divs = find_divs(soup)
      articles = [Article(div) for div in divs]
      if not articles:
          # Without this guard, articles[0] below raises IndexError.
          raise SystemExit('No <div> elements found on the page')

      for article in articles:
          find_text(article)

      # The div with the largest character count is taken to be the article.
      articles.sort(key=sort_by_scope, reverse=True)
      best = articles[0]
      best.find_header()
      print(best.get_header_text())
      print(best.get_article_text())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement