Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python2
- # coding: utf-8
- import urllib2
- import re
- from converter import unaccentedMap
class Text:
    """Download a text resource and strip wiki markup from it.

    The text is held in ``self.data`` and is rewritten in place by
    ``process``, ``unescape`` and ``rmTrash``.
    """

    # Not referenced by any method of this class.
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz '

    def __init__(self, source):
        """Fetch ``source`` and store its body decoded as UTF-8.

        ``source`` is passed straight to ``urllib2.urlopen``.
        """
        response = urllib2.urlopen(source)
        try:
            raw = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        self.data = raw.decode('utf-8')

    def getData(self):
        """Return the current text."""
        return self.data

    def process(self):
        """Apply the ``unaccentedMap()`` translation table and drop newlines.

        NOTE(review): ``unaccentedMap`` comes from the ``converter`` module;
        presumably it maps accented characters to plain ones - confirm.
        """
        self.data = self.data.translate(unaccentedMap())
        self.data = self.data.replace('\n', '')

    def unescape(self):
        """Decode the ``&lt;``, ``&gt;`` and ``&amp;`` entities in the text."""
        self.data = self.data.replace("&lt;", "<")
        self.data = self.data.replace("&gt;", ">")
        # This has to be last: decoding "&amp;" first would turn a
        # doubly-escaped "&amp;lt;" into "&lt;" and then wrongly into "<".
        self.data = self.data.replace("&amp;", "&")

    def rmTrash(self):
        """Remove brace groups and reduce ``[[...]]`` links to their label.

        * Everything between ``{`` and its matching ``}`` is dropped,
          including nested groups.
        * ``[[target|label]]`` becomes ``label`` and ``[[target]]`` becomes
          ``target``; when a link holds several ``|`` only the text after
          the last one is kept.
        * A link nested inside another link contributes its label to the
          enclosing link's text.
        * A ``}`` with no open group is kept as ordinary text, a ``]]`` with
          no open link is skipped, and the text of a link that is never
          closed is discarded.
        """
        data = self.data
        length = len(data)
        output = []      # characters that belong to the final text
        link_stack = []  # one character buffer per currently open link
        brace_depth = 0
        i = 0
        while i < length:
            char = data[i]
            # Slicing never raises, even on the last character of the text.
            pair = data[i:i + 2]
            if char == '{':
                brace_depth += 1
            elif char == '}' and brace_depth > 0:
                brace_depth -= 1
            elif brace_depth > 0:
                # Inside a brace group: ignore everything, brackets included.
                pass
            elif pair == '[[':
                link_stack.append([])
                i += 1  # the marker is two characters long
            elif pair == ']]':
                if link_stack:
                    label = link_stack.pop()
                    target = link_stack[-1] if link_stack else output
                    target.extend(label)
                i += 1
            elif char == '|' and link_stack:
                # Text before the pipe is the link target, not the label.
                del link_stack[-1][:]
            else:
                target = link_stack[-1] if link_stack else output
                target.append(char)
            i += 1
        self.data = ''.join(output)
if __name__ == '__main__':
    # Fetch the lead section of the Polish Wikipedia article "Polska",
    # run every clean-up step in order, and print the resulting text.
    source_url = 'http://pl.wikipedia.org/w/api.php?action=query&prop=revisions&rvsection=0&format=xml&rvprop=content&titles=Polska'
    article = Text(source_url)
    for cleanup_step in (article.process, article.unescape, article.rmTrash):
        cleanup_step()
    # The result could be written to a file instead of standard output.
    print(article.getData())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement