# ngram2.py

#!/usr/bin/python2
# coding: utf-8

import urllib2
import re
from converter import unaccentedMap

class Text(object):
    """Text downloaded from a URL, with helpers that strip wiki markup.

    The decoded document is kept in ``self.data``; every cleaning method
    rewrites that attribute in place and returns ``None``.
    """

    # Not referenced inside this class; presumably the set of characters
    # the n-gram code accepts -- TODO confirm against the callers.
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz '

    def __init__(self, source):
        """Download ``source`` and store its body decoded as UTF-8.

        source -- URL string (or request object) passed to urllib2.urlopen.

        Errors raised by urllib2.urlopen and UnicodeDecodeError for a body
        that is not valid UTF-8 are propagated to the caller.
        """
        response = urllib2.urlopen(source)
        try:
            raw = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        self.data = raw.decode('utf-8')

    def getData(self):
        """Return the current text."""
        return self.data

    def process(self):
        """Translate the text with converter.unaccentedMap() and drop newlines.

        NOTE(review): the table presumably maps accented characters to
        their plain equivalents -- confirm in the converter module.
        """
        self.data = self.data.translate(unaccentedMap())
        self.data = self.data.replace('\n', '')

    def unescape(self):
        """Replace the entities ``&lt;``, ``&gt;`` and ``&amp;`` by their characters."""
        self.data = self.data.replace("&lt;", "<")
        self.data = self.data.replace("&gt;", ">")
        # "&amp;" has to be last: otherwise "&amp;lt;" would first become
        # "&lt;" and then be unescaped a second time into "<".
        self.data = self.data.replace("&amp;", "&")

    def rmTrash(self):
        """Strip ``{...}`` templates and ``[[...]]`` link markup from the text.

        Rules, applied in a single left-to-right pass:

        * Every ``{`` opens a template level and every ``}`` closes one.
          The braces and everything between them are dropped.  A ``}``
          with no open template is kept as ordinary text.
        * Outside templates ``[[`` opens a link and ``]]`` closes it.  Of a
          link only the part after its last ``|`` is kept (the whole link
          text when there is no ``|``).  Text of links nested inside
          another link is dropped.  A ``]]`` with no open link is dropped.
        * Text of a link that is never closed is discarded.
        * Everything else is kept unchanged.
        """
        data = self.data
        size = len(data)
        kept = []         # characters of the cleaned text
        label = []        # visible text of the link currently being read
        brace_depth = 0   # number of unclosed '{'
        link_depth = 0    # number of unclosed '[['
        pos = 0
        while pos < size:
            char = data[pos]
            if char == '{':
                brace_depth += 1
            elif brace_depth > 0:
                # Inside a template: skip everything, only watch for '}'.
                if char == '}':
                    brace_depth -= 1
            elif data.startswith('[[', pos):
                link_depth += 1
                if link_depth == 1:
                    # Fresh buffer so text of an earlier link is not repeated.
                    label = []
                pos += 2
                continue
            elif data.startswith(']]', pos):
                if link_depth > 0:
                    link_depth -= 1
                    if link_depth == 0:
                        kept.extend(label)
                        label = []
                pos += 2
                continue
            elif link_depth == 0:
                kept.append(char)
            elif link_depth == 1:
                if char == '|':
                    # Whatever preceded the pipe was the link target (or an
                    # option); only the text after the last pipe is shown.
                    label = []
                else:
                    label.append(char)
            pos += 1
        self.data = ''.join(kept)
if __name__ == '__main__':
    # Wikipedia API query taken from the URL below: revision content of
    # section 0 of the page titled "Polska", in XML format, from
    # pl.wikipedia.org.
    source_url = 'http://pl.wikipedia.org/w/api.php?action=query&prop=revisions&rvsection=0&format=xml&rvprop=content&titles=Polska'
    article = Text(source_url)
    # Cleaning steps, in the order they must run.
    for step in (article.process, article.unescape, article.rmTrash):
        step()
    # To keep the result, write article.getData() to a file (e.g. 'bla.txt')
    # instead of printing it.
    print(article.getData())