#!/usr/bin/python2
# coding: utf-8
#
# authors:
#   Paweł Koniarski
#   Filip Konieczny

'''
changelog:
- generating wiki URL
- deleting multiple spaces in __parseSource
- saving ngrams as a Counter (what matters is how many times each
  ngram occurs, not the order in which the ngrams appear)
'''

import re
from collections import Counter
from contextlib import closing
from HTMLParser import HTMLParser
from urllib import urlopen

from converter import unaccentedMap


class Ngram(object):
    '''
    Character trigram statistics of a Polish Wikipedia article.

    Constructing an instance downloads the article, reduces it to plain
    lowercase ASCII text and counts every three-character window.

    Attributes set during construction:
        source -- downloaded page, converted to an ASCII byte string
        text   -- plain text built only from characters of `alphabet`
        ngrams -- Counter mapping each trigram to its number of occurrences
    '''

    # Characters allowed in the parsed text; anything else becomes a space.
    alphabet = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, name):
        '''
        Downloads and processes the article called `name`.

        name -- article title, e.g. 'młot pneumatyczny'
        '''
        self.__loadSource(self.__generateURL(name))
        self.__sourceToAscii()
        self.__parseSource()
        self.__createNgrams()

    def __generateURL(self, name):
        '''
        returns wiki URL from given article name

        The name is lowercased and its spaces are replaced by underscores.
        '''
        return 'http://pl.wikipedia.org/wiki/' + name.lower().replace(' ', '_')

    def __loadSource(self, url):
        '''
        downloads html from 'url' and stores it in self.source
        '''
        # closing() guarantees the connection is released even if read() fails
        with closing(urlopen(url)) as response:
            self.source = response.read()

    def __sourceToAscii(self):
        '''
        converts the source to ascii coding with special characters changed
        to their simpler ascii equivalents
        '''
        # decode first so that translate() works on whole characters;
        # the translation table comes from the project's converter module
        decoded = unicode(self.source, 'utf-8').translate(unaccentedMap())
        # replace HTML entities with the characters they stand for
        decoded = HTMLParser().unescape(decoded)
        # whatever is still outside ascii is silently dropped
        self.source = decoded.encode('ascii', 'ignore')

    def __parseSource(self):
        '''
        parses the source to get plain text and stores it in self.text
        '''
        # searches for paragraphs in source; DOTALL lets a paragraph whose
        # markup contains line breaks match as well
        paragraphs = re.findall(r'<p>(.*?)</p>', self.source, re.DOTALL)
        # joined with a space so that the last word of one paragraph does not
        # fuse with the first word of the next (extra spaces collapse below)
        text = ' '.join(paragraphs)
        # removes any tags (like <a>) inside paragraphs
        text = re.sub(r'<[^>]*>', '', text)
        # removes notes numbers like [5] in text
        text = re.sub(r'\[.*?\]', '', text)
        text = text.lower()
        # replaces every character which is not part of the alphabet
        # with a space
        text = ''.join(c if c in self.alphabet else ' ' for c in text)
        # deletes all multiple spaces
        self.text = re.sub(' +', ' ', text)

    def __createNgrams(self):
        '''
        creates trigrams from earlier parsed text and stores their counts
        in self.ngrams
        '''
        text = self.text
        # one trigram per starting position; a text shorter than three
        # characters gives an empty range and therefore an empty Counter
        self.ngrams = Counter(text[i:i + 3] for i in range(len(text) - 2))

    def getNgrams(self):
        '''
        returns the Counter of trigrams
        '''
        return self.ngrams

    def save_text2file(self, name, ext):
        '''
        writes the parsed plain text to the file '<name>.<ext>'
        '''
        with open('%s.%s' % (name, ext), 'w') as f:
            f.write(self.text)


if __name__ == '__main__':
    article = Ngram('młot pneumatyczny')
    # print article.getNgrams()
    # article.save_text2file('tekstZwiki', 'html')