Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python2
- # coding: utf-8
- #
- # authors:
- # Paweł Koniarski
- # Filip Konieczny
- '''
- changelog:
- - generating wiki URL
- - deleting multiple spaces in __parseSource
- - saving ngrams as a Counter (what matters is how many times each ngram occurs, not the order)
- '''
- import re
- from urllib import urlopen
- from contextlib import closing
- from HTMLParser import HTMLParser
- from converter import unaccentedMap
- from collections import Counter
class Ngram(object):
    ''' Character-trigram statistics of a Polish Wikipedia article.

    Constructing an instance downloads the article from pl.wikipedia.org,
    reduces it to lower-case text made only of the characters in
    `alphabet`, and counts every three-character window of that text.

    Attributes set during construction:
        source -- the downloaded page after conversion to ASCII
        text   -- plain text extracted from the page's <p> elements
        ngrams -- Counter mapping each trigram to its number of occurrences
    '''

    # Characters allowed in the parsed text; anything else becomes a space.
    alphabet = 'abcdefghijklmnopqrstuvwxyz '

    # `<p\b` keeps <pre>, <param>, ... from being taken for a paragraph and
    # DOTALL lets a paragraph body span several lines.
    _paragraph_re = re.compile(r'<p\b[^>]*>(.*?)</p>', re.DOTALL)
    # Any remaining tag inside a paragraph, e.g. <a href=..>.
    _tag_re = re.compile(r'<[^>]*>')
    # Bracketed note markers such as [5].
    _note_re = re.compile(r'\[.*?\]')
    _spaces_re = re.compile(' +')

    def __init__(self, name):
        ''' downloads the article called 'name' and builds its trigrams '''
        self.__loadSource(self.__generateURL(name))
        self.__sourceToAscii()
        self.__parseSource()
        self.__createNgrams()

    def __generateURL(self, name):
        ''' returns wiki URL from given article name

        The name is lower-cased and its spaces are replaced with
        underscores before it is appended to the wiki base address.
        '''
        return 'http://pl.wikipedia.org/wiki/' + name.lower().replace(' ', '_')

    def __loadSource(self, url):
        ''' downloads html from 'url' and stores it in self.source '''
        # closing() guarantees the connection is released even if read() fails.
        with closing(urlopen(url)) as response:
            self.source = response.read()

    def __sourceToAscii(self):
        ''' converts the source to ascii coding with special
        characters changed to their simpler ascii equivalents '''
        decoded = unicode(self.source, 'utf-8')
        # Entities are decoded *before* the translation step so that
        # characters written as entities also go through the
        # unaccentedMap() table instead of being silently dropped by the
        # final encode(..., 'ignore').
        decoded = HTMLParser().unescape(decoded)
        decoded = decoded.translate(unaccentedMap())
        self.source = decoded.encode('ascii', 'ignore')

    def __parseSource(self):
        ''' parses the source to get plain text

        Stores the result in self.text: the contents of all <p> elements,
        without tags and bracketed notes, lower-cased, with every
        character outside `alphabet` replaced by a space and runs of
        spaces collapsed to a single one.
        '''
        paragraphs = self._paragraph_re.findall(self.source)
        # Joining with a space keeps the last word of one paragraph from
        # fusing with the first word of the next one.
        text = ' '.join(paragraphs)
        # removes any tags like <a href..> in paragraphs
        text = self._tag_re.sub('', text)
        # removes notes numbers like [5] in text
        text = self._note_re.sub('', text)
        text = text.lower()
        # replaces every character which is not part of the alphabet
        allowed = set(self.alphabet)
        text = ''.join(ch if ch in allowed else ' ' for ch in text)
        # deletes all multiple spaces
        self.text = self._spaces_re.sub(' ', text)

    def __createNgrams(self):
        ''' creates trigrams from earlier parsed text

        Stores a Counter of all overlapping three-character slices of
        self.text in self.ngrams; a text shorter than three characters
        yields an empty Counter.
        '''
        text = self.text
        self.ngrams = Counter(text[i:i + 3] for i in range(len(text) - 2))

    def getNgrams(self):
        ''' returns the Counter of trigrams built by the constructor '''
        return self.ngrams

    def save_text2file(self, name, ext):
        ''' writes the parsed plain text to the file '<name>.<ext>' '''
        with open('%s.%s' % (name, ext), 'w') as f:
            f.write(self.text)
if __name__ == '__main__':
    # Demo run: downloads the article and builds its trigram counts.
    # Nothing is printed or saved unless one of the lines below is enabled.
    article = Ngram('młot pneumatyczny')
    # print article.getNgrams()
    # article.save_text2file('tekstZwiki', 'html')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement