Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # coding: utf-8
- import urllib
- from converter import unaccentedMap
- from sgmllib import SGMLParser
# Download the Polish Wikipedia article and decode the raw bytes as UTF-8.
down = urllib.urlopen('http://pl.wikipedia.org/wiki/Polska')
try:
    strona = unicode(down.read(), 'utf-8')
finally:
    # Release the connection even if reading or decoding fails.
    down.close()
# Apply the translation table built by converter.unaccentedMap();
# presumably it maps accented characters to unaccented ones -- confirm
# against the converter module.
strona = strona.translate(unaccentedMap())
- #print strona
- class wikiParser(SGMLParser):
- def __init__(self):
- self.f = 0
- def reset(self):
- self.pieces = []
- SGMLParser.reset(self)
- def start_p(self):
- self.f += 1
- def end_p(self):
- self.f -= 1
- def unknown_starttag(self, tag, attr):
- self.f -= 1
- def unknown_endtag(self, tag):
- self.f += 1
- def handle_data(self, text):
- if self.f > 0:
- self.pieces.append(text)
- def output(self):
- return ''.join(self.pieces)
- parser = wikiParser()
- parser.feed(str(strona))
- parser.close()
- print parser.output()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement