Advertisement
lewapkon

ngram.py

Feb 21st, 2014
83
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.32 KB | None | 0 0
  1. #!/usr/bin/python2
  2. # coding: utf-8
  3.  
  4. import re
  5. import urllib
  6. import contextlib
  7. from HTMLParser import HTMLParser
  8. from converter import unaccentedMap
  9.  
  10. class ngram(object):
  11.     alphabet = 'abcdefghijklmnopqrstuvwxyz '
  12.  
  13.     def __init__(self, url):
  14.         self.__loadSource(url)
  15.         self.__sourceToAscii()
  16.         self.__parseSource()
  17.         self.__createNgrams()
  18.  
  19.     def __loadSource(self, url):
  20.         ''' downloads html from 'url' '''
  21.         with contextlib.closing(urllib.urlopen(url)) as x:
  22.             self.source = x.read()
  23.  
  24.     def __sourceToAscii(self):
  25.         ''' converts the source to ascii coding with special
  26.            characters changed to their simpler ascii equivalents '''
  27.         self.source = unicode(self.source, 'utf-8').translate(unaccentedMap())
  28.         self.source = HTMLParser().unescape(self.source)
  29.         self.source = self.source.encode('ascii', 'ignore')
  30.  
  31.     def __parseSource(self):
  32.         ''' parses the source to get plain text '''
  33.         # searches for paragraphs in source and joins them into string
  34.         text = ''.join(re.compile(r'<p.*?>(.*?)</p>').findall(self.source))
  35.         # removes any tags like <a href..> in paragraphs
  36.         text = re.sub('<[^>]*>', '', text)
  37.         # removes notes numbers like [5] in text
  38.         text = re.sub(r'\[.*?\]', '', text)
  39.  
  40.         text = text.lower()
  41.         # removes any characters from text which are not part of the alphabet
  42.         l = []
  43.         for i in text:
  44.             l.append(i) if i in self.alphabet else l.append(' ')
  45.         self.text = ''.join(l)
  46.  
  47.     def __createNgrams(self):
  48.         ''' creates trigrams from earlier parsed text '''
  49.         i = 0
  50.         self.ngrams = []
  51.         length = len(self.text) - 2
  52.         while i < length:
  53.             self.ngrams.append(''.join(self.text[i:i+3]))
  54.             i += 1
  55.  
  56.     def __ngramsCoding(self):
  57.         for i in xrange(len(self.alphabet)):
  58.             mapping[self.alphabet[i]] = i
  59.         for i in xrange(len())
  60.         ''' ... '''
  61.  
  62.     def getNgrams(self):
  63.         return self.ngrams
  64.  
  65.     def save_text2file(self, name, ext):
  66.         with open('%(name)s.%(ext)s' %locals(), 'w') as f:
  67.             f.write(self.text)
  68.  
  69.  
# Builds the trigrams of the Polish Wikipedia article "Polska".
# NOTE: this statement runs whenever the module is executed or imported,
# and constructing ngram downloads the page over the network.
polska = ngram('http://pl.wikipedia.org/wiki/Polska')
# Manual inspection helpers, left disabled:
#print polska.getNgrams()
#polska.save_text2file('tekstZwiki', 'html')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement