Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup, Tag, NavigableString
class Article:
    """Candidate article: one element plus its text score and headline.

    Instance fields:
      div        -- the element wrapped by this candidate.
      char_scope -- character count used to rank candidates; starts at 0
                    and is filled in by code outside this class.
      header     -- the ``h1`` element located by ``find_header``, or
                    ``None`` while not searched / not found.
    """

    def __init__(self, div: Tag):
        """Wrap ``div`` with a zero score and no header."""
        self.div = div
        self.char_scope = 0
        self.header = None

    def get_article_text(self):
        """Return ``self.div.get_text()``."""
        return self.div.get_text()

    def get_header_text(self):
        """Return the header's text, or ``''`` when no header is set."""
        if self.header is None:
            return ''
        return self.header.get_text()

    @staticmethod
    def rec_header_finder(div: Tag):
        """Return the ``h1`` reported by the nearest ancestor that has one.

        Starts at ``div.parent`` and moves upward, asking each ancestor
        for ``find('h1')``.

        Returns:
            The first non-``None`` result, or ``None`` when the chain of
            parents ends without any ancestor yielding an ``h1``.
        """
        # Iterative walk with an explicit stop: the previous recursive form
        # dereferenced ``None`` once it climbed past the top-most node.
        ancestor = div.parent
        while ancestor is not None:
            h1 = ancestor.find('h1')
            if h1 is not None:
                return h1
            ancestor = ancestor.parent
        return None

    def find_header(self):
        """Store the nearest enclosing ``h1`` in ``self.header``.

        ``self.header`` is left as ``None`` when no ancestor has one, which
        ``get_header_text`` reports as an empty string.
        """
        self.header = Article.rec_header_finder(self.div)
def find_text(article: Article):
    """Add the length of the article's near-surface text to its score.

    Increases ``article.char_scope`` by the length of every string that is
    a direct child of ``article.div``, and of every string that is a direct
    child of one of its immediate ``p``, ``b`` or ``li`` children. Only
    nodes whose type is exactly ``NavigableString`` are counted; subclasses
    are skipped.
    """
    # TODO: add a check for script content
    containers = [article.div]
    containers.extend(article.div.find_all(['p', 'b', 'li'], recursive=False))
    for container in containers:
        article.char_scope += sum(
            len(node)
            for node in container.contents
            if type(node) is NavigableString
        )
def find_divs(soup: BeautifulSoup):
    """Return the result of ``soup.find_all('div')``."""
    return soup.find_all('div')
def sort_by_scope(article: Article):
    """Sort key: the article's ``char_scope`` value."""
    score = article.char_scope
    return score
if __name__ == '__main__':
    # Demo run: download one page, score every <div> with find_text, and
    # print the header and text of the highest-scoring one.
    #
    # Other pages previously used for manual checks:
    #   https://lenta.ru/news/2018/10/15/bashkirov/
    #   https://habr.com/company/ruvds/blog/426413/
    #   https://lenta.ru/news/2018/10/15/tolokonnikova/
    url = 'https://sobesednik.ru/obshchestvo/20120510-pochemu-zastrelilsya-eks-nachalnik-razvedki-leonid-shebarshin'

    # Bound the wait so an unresponsive server cannot hang the script.
    response = requests.get(url, timeout=30)
    # Fail on HTTP errors instead of scoring the markup of an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    articles = [Article(div) for div in find_divs(soup)]
    if not articles:
        # Without any <div> there is no candidate to pick.
        raise SystemExit('No <div> elements found at ' + url)

    for article in articles:
        find_text(article)

    # max() returns the first highest-scoring candidate, the same element a
    # stable descending sort would place at index 0.
    best = max(articles, key=sort_by_scope)
    best.find_header()
    print(best.get_header_text())
    print(best.get_article_text())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement