# ngram.py

#!/usr/bin/python2
# coding: utf-8
#
# authors:
# Paweł Koniarski
# Filip Konieczny

'''
changelog:
- generating wiki URL
- deleting multiple spaces in __parseSource
- storing ngrams in a Counter (what matters is how often each ngram occurs, not their order)
'''

import re
from urllib import urlopen
from contextlib import closing
from HTMLParser import HTMLParser
from converter import unaccentedMap
from collections import Counter

class Ngram(object):
    ''' Character-trigram statistics of an article from pl.wikipedia.org.

    Creating an instance downloads the article, reduces it to plain
    lower-case ASCII text and counts every three-character window.

    Attributes set while constructing:
        source -- page HTML (converted to ASCII by __sourceToAscii)
        text   -- plain text built only from characters of `alphabet`
        ngrams -- Counter mapping each trigram to its number of occurrences
    '''
    # characters kept in the parsed text; every other character becomes ' '
    alphabet = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, name):
        ''' downloads and processes the article called 'name'

            name -- article title; spaces are allowed
        '''
        self.__loadSource(self.__generateURL(name))
        self.__sourceToAscii()
        self.__parseSource()
        self.__createNgrams()

    def __generateURL(self, name):
        ''' returns wiki URL from given article name '''
        return 'http://pl.wikipedia.org/wiki/' + name.lower().replace(' ', '_')

    def __loadSource(self, url):
        ''' downloads html from 'url' and stores it in self.source '''
        with closing(urlopen(url)) as response:
            self.source = response.read()

    def __sourceToAscii(self):
        ''' converts the source to ascii coding with special
            characters changed to their simpler ascii equivalents '''
        # unaccentedMap comes from the project's converter module; presumably
        # it maps accented characters to plain ASCII ones -- verify there
        decoded = unicode(self.source, 'utf-8').translate(unaccentedMap())
        # turn HTML entities (&amp;, &nbsp;, ...) into real characters
        decoded = HTMLParser().unescape(decoded)
        # whatever is still outside ASCII is dropped
        self.source = decoded.encode('ascii', 'ignore')

    def __parseSource(self):
        ''' parses the source to get plain text and stores it in self.text '''
        # collects the bodies of all <p> elements:
        # - \b keeps <pre>, <path>, <param>... from being taken for <p>
        # - DOTALL lets a paragraph contain line breaks
        paragraphs = re.findall(r'<p\b[^>]*>(.*?)</p>', self.source, re.DOTALL)
        # joined with a space so that the last word of one paragraph does
        # not merge with the first word of the next one
        text = ' '.join(paragraphs)
        # removes any tags like <a href..> in paragraphs
        text = re.sub('<[^>]*>', '', text)
        # removes notes numbers like [5] in text
        text = re.sub(r'\[.*?\]', '', text)

        # replaces every character which is not part of the alphabet
        # with a space
        allowed = frozenset(self.alphabet)
        text = ''.join(ch if ch in allowed else ' ' for ch in text.lower())

        # deletes all multiple spaces
        self.text = re.sub('  +', ' ', text)

    def __createNgrams(self):
        ''' creates trigrams from earlier parsed text and stores their
            counts in self.ngrams (empty when the text has under 3 chars) '''
        text = self.text
        # one trigram per starting position; range() is empty for short text
        self.ngrams = Counter(text[i:i + 3] for i in range(len(text) - 2))

    def getNgrams(self):
        ''' returns the Counter of trigrams '''
        return self.ngrams

    def save_text2file(self, name, ext):
        ''' writes the parsed text to file '<name>.<ext>',
            overwriting an existing one '''
        with open('%s.%s' % (name, ext), 'w') as f:
            f.write(self.text)

if __name__ == '__main__':
    # Manual check: building the object downloads the article from
    # pl.wikipedia.org and counts its trigrams (network access needed).
    article = Ngram('młot pneumatyczny')
    # Uncomment to inspect the result or to dump the parsed text:
    #print article.getNgrams()
    #article.save_text2file('tekstZwiki', 'html')