Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import locale
import re
import urllib.request as urllib2
from html.parser import HTMLParser
# Switch to a Polish UTF-8 locale when the host provides one.
try:
    locale.setlocale(locale.LC_ALL, ('pl', 'utf8'))
except locale.Error:
    # The requested locale is not installed on this machine. Nothing visible
    # in this script depends on it, so keep the current locale rather than
    # aborting before any work is done.
    pass
class MyHTMLParser(HTMLParser):
    """HTML parser that records the tag names and comments it encounters.

    Each handler appends to one of four per-instance lists, in the order the
    items are reported by ``HTMLParser`` while markup is fed in:

    * ``lsStartTags`` -- names passed to ``handle_starttag``.
    * ``lsEndTags`` -- names passed to ``handle_endtag``.
    * ``lsStartEndTags`` -- names passed to ``handle_startendtag``
      (self-closing tags such as ``<br/>``).
    * ``lsComments`` -- text passed to ``handle_comment``.
    """

    def __init__(self, *args, **kwargs):
        """Create the parser with empty result lists.

        Positional and keyword arguments are forwarded unchanged to
        ``HTMLParser.__init__``.
        """
        super().__init__(*args, **kwargs)
        # The lists are created per instance: as class attributes they would
        # be shared, so every parser would append to the same four lists.
        self.lsStartTags = []
        self.lsEndTags = []
        self.lsStartEndTags = []
        self.lsComments = []

    def handle_starttag(self, startTag, attrs):
        """Record the name of an opening tag; *attrs* is ignored."""
        self.lsStartTags.append(startTag)

    def handle_endtag(self, endTag):
        """Record the name of a closing tag."""
        self.lsEndTags.append(endTag)

    def handle_startendtag(self, startendTag, attrs):
        """Record the name of a self-closing tag; *attrs* is ignored.

        Overriding this handler means such tags are stored only in
        ``lsStartEndTags`` and are not forwarded to the start/end handlers.
        """
        self.lsStartEndTags.append(startendTag)

    def handle_comment(self, data):
        """Record the text of an HTML comment."""
        self.lsComments.append(data)
# Address of the sjp.pl dictionary entry whose paragraphs are extracted.
PAGE_URL = "http://www.sjp.pl/dialektyka"

# Matches a <p> element whose style attribute starts with "margin:", up to and
# including the first closing </p> that follows.  DOTALL lets "." cross line
# breaks, so paragraphs spanning several lines of markup are matched too.
PARAGRAPH_PATTERN = re.compile(r'<p style="margin:.*?;?">.*?</p>', re.DOTALL)


def find_paragraphs(html):
    """Return every margin-styled ``<p>...</p>`` fragment found in *html*.

    Args:
        html: Page markup as already-decoded text.

    Returns:
        A list of the matched fragments in document order, each including its
        opening and closing tags. The list is empty when nothing matches.
    """
    return PARAGRAPH_PATTERN.findall(html)


def fetch_page(url=PAGE_URL):
    """Download *url* and return the response body as text.

    The body is decoded with the charset declared in the response headers,
    falling back to UTF-8 when none is declared; undecodable bytes are
    replaced instead of raising.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib2.urlopen(url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        # Decode the bytes: str() on them would yield the "b'...'" repr, in
        # which Polish letters appear as \xNN escapes instead of characters.
        return response.read().decode(charset, errors="replace")


def main():
    """Fetch the dictionary page and print its margin-styled paragraphs."""
    # NOTE: the parser is instantiated but never fed by this script; it is
    # kept so its tag/comment lists stay available when debugging.
    parser = MyHTMLParser()
    print(find_paragraphs(fetch_page()))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement