#!/usr/bin/python2 # coding: utf-8 # # authors: # Paweł Koniarski # Filip Konieczny ''' changelog: - generating wiki URL - deleting multiple spaces in __parseSource - saving ngrams as a Counter (because important are the numbers of specific ngrams not the order) ''' import re from urllib import urlopen from contextlib import closing from HTMLParser import HTMLParser from converter import unaccentedMap from collections import Counter class Ngram(object): alphabet = 'abcdefghijklmnopqrstuvwxyz ' def __init__(self, name): self.__loadSource(self.__generateURL(name)) self.__sourceToAscii() self.__parseSource() self.__createNgrams() def __generateURL(self, name): ''' returns wiki URL from given article name ''' return 'http://pl.wikipedia.org/wiki/' + name.lower().replace(' ', '_') def __loadSource(self, url): ''' downloads html from 'url' ''' with closing(urlopen(url)) as x: self.source = x.read() def __sourceToAscii(self): ''' converts the source to ascii coding with special characters changed to their simpler ascii equivalents ''' self.source = unicode(self.source, 'utf-8').translate(unaccentedMap()) self.source = HTMLParser().unescape(self.source) self.source = self.source.encode('ascii', 'ignore') def __parseSource(self): ''' parses the source to get plain text ''' # searches for paragraphs in source and joins them into string text = ''.join(re.compile(r'