Advertisement
Guest User

Parser

a guest
Feb 10th, 2016
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.64 KB | None | 0 0
  1. #!/usr/local/bin/python3
  2. # -*- coding: utf-8 -*-
  3. # simagin.mail@yandex.ru - CS
  4.  
  5. from html.parser import HTMLParser
  6. from urllib.request import quote, urlopen
  7.  
  8.  
  9. class Parser(HTMLParser):
  10.     """Helper class for retrieving header."""
  11.  
  12.     def __init__(self):
  13.         HTMLParser.__init__(self)
  14.         self.in_head_tag = False
  15.         self.in_span_tag = False
  16.         self.head_text = ''
  17.  
  18.     def handle_starttag(self, tag, attrs):
  19.         if tag == 'h1':
  20.             attrs = dict(attrs)
  21.             if 'id' in attrs and attrs['id'] == 'firstHeading':
  22.                 self.in_head_tag = True
  23.         if self.in_head_tag and tag == 'span':
  24.             self.in_span_tag = True
  25.  
  26.     def handle_endtag(self, tag):
  27.         if self.in_head_tag and tag == 'h1':
  28.             self.in_head_tag = False
  29.         if self.in_span_tag and tag == 'span':
  30.             self.in_span_tag = False
  31.  
  32.     def handle_data(self, data):
  33.         if self.in_span_tag:
  34.             self.head_text = data
  35.  
  36.     def parse(self, page):
  37.         """Return text of article."""
  38.  
  39.         self.feed(page)
  40.         return self.head_text
  41.  
  42.  
  43. WIKI_ROOT = 'http://wiki.cs.hse.ru/'
  44.  
  45. def load(name):
  46.     """Return text of the article with specified url."""
  47.  
  48.     with urlopen(WIKI_ROOT + quote(name)) as file:
  49.         page = str(file.read(), encoding='utf-8')
  50.     return page
  51.  
  52.  
  53. def get_head(page):
  54.     """Return head of page."""
  55.  
  56.     return Parser().parse(page)
  57.  
  58.  
  59. def main():
  60.     page = load('Рекомендательная_система_(семинар)')
  61.     print('Заголовок: "{head}"'.format(head = get_head(page)))
  62.  
  63.  
  64. if __name__ == '__main__':
  65.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement