Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python2
- # coding: utf-8
- import re
- import urllib
- import contextlib
- from HTMLParser import HTMLParser
- from converter import unaccentedMap
class ngram(object):
    '''Character trigrams extracted from the paragraph text of a web page.

    Constructing an instance downloads ``url``, reduces the page to
    lowercase text over ``alphabet`` and splits that text into
    overlapping three-character windows.

    Attributes set during construction:
        source -- the downloaded page, converted to an ASCII byte string
        text   -- paragraph text holding only characters of ``alphabet``
        ngrams -- list of the trigrams of ``text``, in order
    '''

    # Characters kept in the parsed text; anything else becomes a space.
    alphabet = 'abcdefghijklmnopqrstuvwxyz '

    def __init__(self, url):
        '''Download ``url`` and build ``source``, ``text`` and ``ngrams``.'''
        self.__loadSource(url)
        self.__sourceToAscii()
        self.__parseSource()
        self.__createNgrams()

    def __loadSource(self, url):
        '''Download the document at ``url`` into ``self.source``.'''
        # closing() guarantees the connection is released even if read() fails.
        with contextlib.closing(urllib.urlopen(url)) as response:
            self.source = response.read()

    def __sourceToAscii(self):
        '''Convert ``self.source`` to a plain ASCII string.

        The source is decoded as UTF-8 and translated through
        ``unaccentedMap()`` (which, per the original docstring, replaces
        special characters with simpler ASCII equivalents). HTML entities
        are then unescaped and whatever is still non-ASCII is dropped.
        '''
        decoded = unicode(self.source, 'utf-8').translate(unaccentedMap())
        unescaped = HTMLParser().unescape(decoded)
        self.source = unescaped.encode('ascii', 'ignore')

    def __parseSource(self):
        '''Extract plain paragraph text from ``self.source`` into ``self.text``.'''
        # Collect the contents of every <p ...>...</p> pair and glue them
        # together. NOTE: the pattern is not compiled with re.DOTALL, so a
        # paragraph that spans several lines is not matched.
        paragraphs = re.findall(r'<p.*?>(.*?)</p>', self.source)
        text = ''.join(paragraphs)
        # Strip tags nested inside the paragraphs, e.g. <a href=...>.
        text = re.sub('<[^>]*>', '', text)
        # Strip bracketed note markers such as [5].
        text = re.sub(r'\[.*?\]', '', text)
        text = text.lower()
        # Replace every character outside the alphabet with a space.
        self.text = ''.join(
            ch if ch in self.alphabet else ' ' for ch in text
        )

    def __createNgrams(self):
        '''Split ``self.text`` into overlapping trigrams in ``self.ngrams``.

        A text shorter than three characters yields an empty list.
        '''
        text = self.text
        self.ngrams = [text[i:i + 3] for i in range(len(text) - 2)]

    def __ngramsCoding(self):
        '''Return each n-gram as a tuple of alphabet indices.

        NOTE(review): the original body was an unfinished stub that did not
        parse. It only showed a character -> alphabet-position mapping
        being built; representing every n-gram as a tuple of those
        positions is an assumption about the intended encoding - confirm
        before relying on it. Nothing on the instance is modified.
        '''
        mapping = dict((ch, idx) for idx, ch in enumerate(self.alphabet))
        return [tuple(mapping[ch] for ch in gram) for gram in self.ngrams]

    def getNgrams(self):
        '''Return the list of trigrams built during construction.'''
        return self.ngrams

    def save_text2file(self, name, ext):
        '''Write the parsed text to the file ``<name>.<ext>``.

        An existing file with that name is overwritten.
        '''
        with open('%s.%s' % (name, ext), 'w') as f:
            f.write(self.text)
- polska = ngram('http://pl.wikipedia.org/wiki/Polska')  # NOTE: runs on import; the constructor downloads and parses this page
- # Example usage (disabled): print polska.getNgrams()
- # Example usage (disabled): polska.save_text2file('tekstZwiki', 'html')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement