# ngram.py

#!/usr/bin/python2
# coding: utf-8

import urllib2
import re
from converter import unaccentedMap

class Text:
    """Text downloaded from a URL, with a clean-up pass for brace markup.

    The resource is fetched and decoded as UTF-8 on construction;
    ``process()`` then rewrites ``self.data`` in place.
    """

    # Digits, lowercase ASCII letters and the space character.  Not referenced
    # by any method of this class -- presumably the symbol set used for the
    # n-gram statistics elsewhere; TODO confirm against callers.
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz '

    # Matches from an opening '{' up to the nearest following '}}'.
    # Compiled once at class creation instead of on every process() call.
    # NOTE(review): the lazy '\{*?' never changes what is matched, because the
    # following '.*?' can consume braces as well, so a single '{' is enough to
    # start a match and nested templates end at the first '}}'.  If only
    # '{{...}}' templates were intended, confirm before tightening the pattern.
    _TEMPLATE_RE = re.compile(r'\{\{*?.*?\}\}', re.DOTALL)

    def __init__(self, source):
        """Download ``source`` and store its body decoded as UTF-8.

        Args:
            source: URL (anything accepted by ``urllib2.urlopen``).

        Raises:
            urllib2.URLError: if the resource cannot be fetched.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        response = urllib2.urlopen(source)
        try:
            raw = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
        self.data = unicode(raw, 'utf-8')

    def getData(self):
        """Return the current text (raw until ``process()`` has been called)."""
        return self.data

    def process(self):
        """Normalize ``self.data`` in place; returns ``None``.

        Steps, in order:
          1. translate the text through the table returned by
             ``unaccentedMap()`` (presumably replaces accented characters
             with unaccented ones -- verify in ``converter``),
          2. remove every newline character,
          3. delete every span matched by ``_TEMPLATE_RE``.
        """
        text = self.data.translate(unaccentedMap())
        text = text.replace('\n', '')
        self.data = self._TEMPLATE_RE.sub('', text)

if __name__ == '__main__':
    # Demo run: request section 0 of the "Polska" article from the Polish
    # Wikipedia API (XML output), clean it up and print the result.
    url = ('http://pl.wikipedia.org/w/api.php'
           '?action=query'
           '&prop=revisions'
           '&rvsection=0'
           '&format=xml'
           '&rvprop=content'
           '&titles=Polska')
    article = Text(url)
    article.process()
    # Parenthesised single expression: same output as the bare print statement.
    print(article.getData())