Advertisement
Guest User

Słownik

a guest
Dec 17th, 2018
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.14 KB | None | 0 0
  1.  from html.parser import HTMLParser
  2.  import urllib.request as urllib2
  3.  import re
  4.  import locale
  5.  
  6.  locale.setlocale(locale.LC_ALL, ('pl','utf8'));
  7.  
  8.  class MyHTMLParser(HTMLParser):
  9.  
  10.     #Initializing lists
  11.     lsStartTags = list()
  12.     lsEndTags = list()
  13.     lsStartEndTags = list()
  14.     lsComments = list()
  15.  
  16.     #HTML Parser Methods
  17.     def handle_starttag(self, startTag, attrs):
  18.         self.lsStartTags.append(startTag)
  19.  
  20.     def handle_endtag(self, endTag):
  21.         self.lsEndTags.append(endTag)
  22.  
  23.     def handle_startendtag(self,startendTag, attrs):
  24.         self.lsStartEndTags.append(startendTag)
  25.  
  26.     def handle_comment(self,data):
  27.         self.lsComments.append(data)
  28.  
  29.  #creating an object of the overridden class
  30.  parser = MyHTMLParser()
  31.  
  32.  #Opening NYTimes site using urllib2
  33.  html_page = html_page = urllib2.urlopen("http://www.sjp.pl/dialektyka")
  34.  
  35.  #Feeding the content
  36.  regex = re.findall('<p style="margin:.*?;?">(?:.|[żźćńółęąśŻŹĆĄŚĘŁÓŃ])*?(?<=</p>)',str(html_page.         read()));
  37.  #regex=[regex.replace("<br />",'\n') for w in regex];
  38.  print(regex);
  39.  #print("Comments", parser.lsComments)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement