Advertisement
lewapkon

ngram2.py

Feb 20th, 2014
67
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.55 KB | None | 0 0
  1. #!/usr/bin/python2
  2. # coding: utf-8
  3.  
  4. import urllib2
  5. import re
  6. from converter import unaccentedMap
  7.  
  8. class Text:
  9.     alphabet = '0123456789abcdefghijklmnopqrstuvwxyz '
  10.  
  11.     def __init__(self, source):
  12.         self.data = unicode(urllib2.urlopen(source).read(), 'utf-8')
  13.     def getData(self):
  14.         return self.data
  15.     def process(self):
  16.         self.data = self.data.translate(unaccentedMap())
  17.         self.data = self.data.replace('\n', '')
  18.     def unescape(self):
  19.         self.data = self.data.replace("&lt;", "<")
  20.         self.data = self.data.replace("&gt;", ">")
  21.         # this has to be last:
  22.         self.data = self.data.replace("&amp;", "&")
  23.     def rmTrash(self):
  24.     k = n = i = 0
  25.     text = []
  26.     text2 = []
  27.     while i < len(self.data):
  28.         if self.data[i] == '{':
  29.         k += 1
  30.         if k == 0:
  31.         if self.data[i] == '[' and self.data[i+1] == '[':
  32.             n += 1
  33.             i += 2
  34.             if self.data[i] == ']' and self.data[i+1] == ']':  
  35.             n -= 1
  36.             i += 2
  37.             text.extend(text2)
  38.         if n == 1:
  39.         text2.append(self.data[i])
  40.         if self.data[i] == '|':
  41.             text2 = []
  42.        
  43.         if k == 0 and n == 0:
  44.         text.append(self.data[i])
  45.         if self.data[i] == '}':
  46.         k -= 1
  47.         i += 1
  48.     self.data =  ''.join(text)
  49. if __name__ == '__main__':
  50.     article = Text('http://pl.wikipedia.org/w/api.php?action=query&prop=revisions&rvsection=0&format=xml&rvprop=content&titles=Polska')
  51.     article.process()
  52.     article.unescape()
  53.     article.rmTrash()
  54.     #with open('bla.txt', 'w') as f:
  55. #   f.write(article.getData())
  56.     print article.getData()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement