# ngram.py

#!/usr/bin/python2
# coding: utf-8

import re
import urllib
import contextlib
from HTMLParser import HTMLParser
from converter import unaccentedMap

class ngram(object):
    ''' Builds character trigrams from the paragraph text of a web page.

        Creating an instance downloads 'url', converts the HTML to ascii,
        reduces it to lower-case text made only of characters from
        'alphabet' (stored in 'self.text') and splits that text into
        overlapping trigrams (stored in 'self.ngrams'). '''

    # characters kept in the parsed text; every other character becomes ' '
    alphabet = 'abcdefghijklmnopqrstuvwxyz '

    # <p ...> ... </p>: '\b' keeps tags such as <pre> or <param> from being
    # taken for a paragraph, DOTALL lets a paragraph span several lines
    _paragraphRe = re.compile(r'<p\b[^>]*>(.*?)</p>', re.DOTALL)
    # any remaining tag, e.g. <a href..>
    _tagRe = re.compile(r'<[^>]*>')
    # note numbers like [5]
    _noteRe = re.compile(r'\[.*?\]')

    def __init__(self, url):
        ''' runs the whole pipeline for 'url': download, ascii conversion,
            text extraction and trigram creation '''
        self.__loadSource(url)
        self.__sourceToAscii()
        self.__parseSource()
        self.__createNgrams()

    def __loadSource(self, url):
        ''' downloads html from 'url' into self.source '''
        with contextlib.closing(urllib.urlopen(url)) as x:
            self.source = x.read()

    def __sourceToAscii(self):
        ''' converts the source to ascii coding with special
            characters changed to their simpler ascii equivalents

            raises UnicodeDecodeError if the source is not valid utf-8 '''
        decoded = unicode(self.source, 'utf-8')
        # entities are unescaped first, so that letters written as e.g.
        # '&oacute;' are transliterated as well instead of being dropped
        # by the final encode
        decoded = HTMLParser().unescape(decoded)
        # unaccentedMap comes from the converter module; presumably it
        # returns a unicode.translate table - TODO confirm
        decoded = decoded.translate(unaccentedMap())
        # whatever is still outside ascii is discarded
        self.source = decoded.encode('ascii', 'ignore')

    def __parseSource(self):
        ''' parses the source to get plain text; the result is stored in
            self.text and consists only of characters from the alphabet '''
        # searches for paragraphs in source; they are joined with a space
        # so that words from neighbouring paragraphs do not get glued
        text = ' '.join(self._paragraphRe.findall(self.source))
        # removes any tags like <a href..> in paragraphs
        text = self._tagRe.sub('', text)
        # removes notes numbers like [5] in text
        text = self._noteRe.sub('', text)

        text = text.lower()
        # replaces with a space any character which is not part of the
        # alphabet
        self.text = ''.join(
            [c if c in self.alphabet else ' ' for c in text])

    def __createNgrams(self):
        ''' creates trigrams from earlier parsed text; a text shorter
            than three characters gives an empty list '''
        text = self.text
        self.ngrams = [text[i:i + 3] for i in range(len(text) - 2)]

    def __ngramsCoding(self):
        ''' returns a dict mapping every character of the alphabet to its
            index in the alphabet

            NOTE: the original method was left unfinished (it ended in an
            incomplete loop), so only the character mapping is built; how
            the ngrams were meant to be coded with it is not known. '''
        return dict((char, index)
                    for index, char in enumerate(self.alphabet))

    def getNgrams(self):
        ''' returns the list of trigrams '''
        return self.ngrams

    def save_text2file(self, name, ext):
        ''' writes the parsed text to the file '<name>.<ext>',
            overwriting it if it exists '''
        with open('%s.%s' % (name, ext), 'w') as f:
            f.write(self.text)


# Module-level demo: builds the trigrams of the pl.wikipedia.org 'Polska'
# page. NOTE: this statement runs whenever the module is executed or
# imported, and the constructor downloads the page, so it needs network access.
polska = ngram('http://pl.wikipedia.org/wiki/Polska')
# Optional follow-up steps, left disabled: print the trigrams / dump the
# parsed text to 'tekstZwiki.html'.
#print polska.getNgrams()
#polska.save_text2file('tekstZwiki', 'html')