Advertisement
lewapkon

ngram.py

Feb 21st, 2014
211
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.76 KB | None | 0 0
  1. #!/usr/bin/python2
  2. # coding: utf-8
  3. #
  4. # authors:
  5. # Paweł Koniarski
  6. # Filip Konieczny
  7.  
  8. '''
  9. changelog:
  10. - generating wiki URL
  11. - deleting multiple spaces in __parseSource
  12. - saving ngrams as a Counter (what matters is how often each ngram occurs, not the order in which they appear)
  13. '''
  14.  
  15. import re
  16. from urllib import urlopen
  17. from contextlib import closing
  18. from HTMLParser import HTMLParser
  19. from converter import unaccentedMap
  20. from collections import Counter
  21.  
  22. class Ngram(object):
  23.     alphabet = 'abcdefghijklmnopqrstuvwxyz '
  24.  
  25.     def __init__(self, name):
  26.         self.__loadSource(self.__generateURL(name))
  27.         self.__sourceToAscii()
  28.         self.__parseSource()
  29.         self.__createNgrams()
  30.  
  31.     def __generateURL(self, name):
  32.         ''' returns wiki URL from given article name '''
  33.         return 'http://pl.wikipedia.org/wiki/' + name.lower().replace(' ', '_')
  34.  
  35.     def __loadSource(self, url):
  36.         ''' downloads html from 'url' '''
  37.         with closing(urlopen(url)) as x:
  38.             self.source = x.read()
  39.  
  40.     def __sourceToAscii(self):
  41.         ''' converts the source to ascii coding with special
  42.            characters changed to their simpler ascii equivalents '''
  43.         self.source = unicode(self.source, 'utf-8').translate(unaccentedMap())
  44.         self.source = HTMLParser().unescape(self.source)
  45.         self.source = self.source.encode('ascii', 'ignore')
  46.  
  47.     def __parseSource(self):
  48.         ''' parses the source to get plain text '''
  49.         # searches for paragraphs in source and joins them into string
  50.         text = ''.join(re.compile(r'<p.*?>(.*?)</p>').findall(self.source))
  51.         # removes any tags like <a href..> in paragraphs
  52.         text = re.sub('<[^>]*>', '', text)
  53.         # removes notes numbers like [5] in text
  54.         text = re.sub(r'\[.*?\]', '', text)
  55.  
  56.         text = text.lower()
  57.         # removes any characters from text which are not part of the alphabet
  58.         l = []
  59.         for i in text:
  60.             l.append(i) if i in self.alphabet else l.append(' ')
  61.         text = ''.join(l)
  62.  
  63.         # deletes all multiple spaces
  64.         self.text = re.sub('  +', ' ', text)
  65.  
  66.     def __createNgrams(self):
  67.         ''' creates trigrams from earlier parsed text '''
  68.         i = 0
  69.         ngrams = []
  70.         length = len(self.text) - 2
  71.         while i < length:
  72.             ngrams.append(''.join(self.text[i:i+3]))
  73.             i += 1
  74.         self.ngrams = Counter(ngrams)
  75.  
  76.     def getNgrams(self):
  77.         return self.ngrams
  78.  
  79.     def save_text2file(self, name, ext):
  80.         with open('%(name)s.%(ext)s' %locals(), 'w') as f:
  81.             f.write(self.text)
  82.  
  83. if __name__ == '__main__':
  84.     article = Ngram('młot pneumatyczny')
  85.     #print polska.getNgrams()
  86.     #polska.save_text2file('tekstZwiki', 'html')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement