Advertisement
lewapkon

ngram.py

Feb 20th, 2014
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.83 KB | None | 0 0
  1. #!/usr/bin/python2
  2. # coding: utf-8
  3.  
  4. import urllib2
  5. import re
  6. from converter import unaccentedMap
  7.  
  8. class Text:
  9.     alphabet = '0123456789abcdefghijklmnopqrstuvwxyz '
  10.  
  11.     def __init__(self, source):
  12.         self.data = unicode(urllib2.urlopen(source).read(), 'utf-8')
  13.     def getData(self):
  14.         return self.data
  15.     def process(self):
  16.         s = self.data.translate(unaccentedMap())
  17.         s = s.replace('\n', '')
  18.         #self.data = re.sub(r'&.*;', r'', self.data)
  19.         p = re.compile('\{\{*?.*?\}\}', re.DOTALL)
  20.         self.data = p.sub('', s)
  21.         #self.data = re.sub(r'', r'', self.data)
  22.  
  23. if __name__ == '__main__':
  24.     article = Text('http://pl.wikipedia.org/w/api.php?action=query&prop=revisions&rvsection=0&format=xml&rvprop=content&titles=Polska')
  25.     article.process()
  26.     print article.getData()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement