Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python2
- # coding: utf-8
- import urllib2
- import re
- from converter import unaccentedMap
class Text(object):
    """Text downloaded from a URL, with a clean-up pass for wiki templates.

    The constructor fetches ``source`` and stores the decoded body in
    ``self.data``; ``process()`` then normalises that text in place.
    """

    # Not referenced inside this class; presumably the character set the
    # cleaned text is meant to be reduced to -- confirm against callers.
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz '

    # Matches one *innermost* ``{{...}}`` span: the body may contain anything
    # except another ``{{`` or ``}}``.  Removing innermost spans repeatedly
    # peels nested templates from the inside out.
    _INNERMOST_TEMPLATE = re.compile(r'\{\{(?:(?!\{\{|\}\}).)*\}\}', re.DOTALL)

    def __init__(self, source):
        """Download ``source`` and keep its body decoded as UTF-8.

        ``source`` is passed straight to ``urllib2.urlopen``.  Network errors
        and ``UnicodeDecodeError`` propagate to the caller.
        """
        response = urllib2.urlopen(source)
        try:
            raw = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        self.data = unicode(raw, 'utf-8')

    def getData(self):
        """Return the current text (raw until ``process()`` has been called)."""
        return self.data

    def process(self):
        """Normalise ``self.data`` in place.

        Steps, in order: apply the translation table returned by
        ``unaccentedMap()``, drop every newline, then remove all ``{{...}}``
        template spans, including nested ones.
        """
        text = self.data.translate(unaccentedMap())
        text = text.replace('\n', '')
        self.data = self._strip_templates(text)

    @classmethod
    def _strip_templates(cls, text):
        """Return ``text`` with every balanced ``{{...}}`` span removed."""
        removed = 1
        # Each pass deletes the innermost templates; stop once a pass finds
        # nothing.  Every successful pass shortens the text, so this ends.
        while removed:
            text, removed = cls._INNERMOST_TEMPLATE.subn('', text)
        return text
if __name__ == '__main__':
    # Script entry point: request section 0 of the latest revision of the
    # Polish Wikipedia article "Polska" through the MediaWiki API (XML
    # output), clean it up and print the result.
    wiki_url = 'http://pl.wikipedia.org/w/api.php?action=query&prop=revisions&rvsection=0&format=xml&rvprop=content&titles=Polska'
    page = Text(wiki_url)
    page.process()
    print(page.getData())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement