Advertisement
lewapkon

ngram3.py

Feb 20th, 2014
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.82 KB | None | 0 0
  1. #!/usr/bin/python
  2. # coding: utf-8
  3.  
  4. import urllib
  5. from converter import unaccentedMap
  6. from sgmllib import SGMLParser
  7.  
  # Download the Polish Wikipedia article that serves as the input text.
  down = urllib.urlopen('http://pl.wikipedia.org/wiki/Polska')
  try:
      # Decode the response body as UTF-8 so that translate() below works on
      # unicode code points rather than raw bytes.
      strona = unicode(down.read(), 'utf-8')
  finally:
      # Python 2's urlopen() result cannot be used in a ``with`` statement,
      # so close it explicitly even when reading or decoding fails.
      down.close()

  # Apply the translation table returned by converter.unaccentedMap()
  # (presumably it replaces accented letters with plain ones -- confirm
  # against converter.py).
  strona = strona.translate(unaccentedMap())
  14.  
  15. class wikiParser(SGMLParser):
  16.     def __init__(self):
  17.     self.f = 0
  18.     def reset(self):
  19.     self.pieces = []
  20.     SGMLParser.reset(self)
  21.     def start_p(self):
  22.     self.f += 1
  23.     def end_p(self):
  24.     self.f -= 1
  25.     def unknown_starttag(self, tag, attr):
  26.     self.f -= 1
  27.     def unknown_endtag(self, tag):
  28.     self.f += 1
  29.     def handle_data(self, text):
  30.     if self.f > 0:
  31.         self.pieces.append(text)
  32.     def output(self):
  33.     return ''.join(self.pieces)
  34.  
  # Run the paragraph extractor over the downloaded page and print the result.
  parser = wikiParser()
  # Encode explicitly: str() on a unicode object uses the ASCII codec and
  # raises UnicodeEncodeError if any non-ASCII character is still present.
  # For pure-ASCII text the resulting bytes are identical.
  parser.feed(strona.encode('utf-8'))
  parser.close()  # flush any data still buffered inside the parser
  print(parser.output())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement