Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/local/bin/python3
- # -*- coding: utf-8 -*-
- # simagin.mail@yandex.ru - CS
- from html.parser import HTMLParser
- from urllib.request import quote, urlopen
class Parser(HTMLParser):
    """Collect the text of the ``<h1 id="firstHeading">`` page heading.

    Only text located inside ``<span>`` elements nested in that heading is
    collected.  After parsing, the result is available as ``head_text``
    (an empty string if no such heading/span was encountered).
    """

    def __init__(self):
        super().__init__()
        # True between <h1 id="firstHeading"> and its closing </h1>.
        self.in_head_tag = False
        # True while at least one <span> of the heading is open.
        self.in_span_tag = False
        # Heading text collected so far.
        self.head_text = ''
        # Number of currently open <span> elements inside the heading, so a
        # nested </span> does not stop collection of the outer span's text.
        self._span_depth = 0

    def handle_starttag(self, tag, attrs):
        """Track entry into the target heading and into its spans."""
        if tag == 'h1':
            if dict(attrs).get('id') == 'firstHeading':
                self.in_head_tag = True
                # Start from scratch so a reused parser never mixes headings.
                self.head_text = ''
        elif tag == 'span' and self.in_head_tag:
            self._span_depth += 1
            self.in_span_tag = True

    def handle_endtag(self, tag):
        """Track exit from the heading and from its spans."""
        if not self.in_head_tag:
            return
        if tag == 'h1':
            # Leaving the heading also closes any span left open by
            # malformed markup; otherwise unrelated page text that follows
            # would be mistaken for heading text.
            self.in_head_tag = False
            self.in_span_tag = False
            self._span_depth = 0
        elif tag == 'span' and self._span_depth:
            self._span_depth -= 1
            self.in_span_tag = self._span_depth > 0

    def handle_data(self, data):
        """Append text found inside the heading's span(s)."""
        if self.in_span_tag:
            # The parser delivers text in several chunks when the span
            # contains inline markup, so accumulate instead of overwriting.
            self.head_text += data

    def parse(self, page):
        """Feed *page* (HTML text) to the parser and return the heading text."""
        self.feed(page)
        return self.head_text
# Base URL that article names are appended to.
WIKI_ROOT = 'http://wiki.cs.hse.ru/'


def load(name, timeout=30):
    """Download a wiki article and return its HTML as text.

    Args:
        name: Article name; it is percent-encoded and appended to WIKI_ROOT.
        timeout: Seconds to wait on a blocking network operation before
            giving up, so an unresponsive server cannot hang the caller.

    Returns:
        The response body decoded to ``str``.

    Raises:
        urllib.error.URLError: if the request cannot be completed.  A
            timeout while reading the body may surface as a socket
            timeout error instead.
    """
    with urlopen(WIKI_ROOT + quote(name), timeout=timeout) as response:
        # Honour the charset declared in the Content-Type header and fall
        # back to UTF-8 when the server does not declare one.
        charset = response.headers.get_content_charset('utf-8')
        return response.read().decode(charset)
def get_head(page):
    """Extract the heading text from the HTML source *page*."""
    heading_parser = Parser()
    return heading_parser.parse(page)
def main():
    """Download one fixed wiki article and print its heading."""
    article_html = load('Рекомендательная_система_(семинар)')
    heading = get_head(article_html)
    print('Заголовок: "{head}"'.format(head=heading))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement