Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import re
- import yaml
- import urllib
- import urllib2
class WikipediaError(Exception):
    """Error raised by the Wikipedia client.

    Raised with the HTTP status code or the URL error reason when a request
    fails, and with a message when a redirect page cannot be resolved.
    """
class Wikipedia(object):
    """Small client for one Wikipedia language edition.

    Provides raw article text, image bytes (optionally a thumbnail) and
    search results. Targets Python 2: it relies on the module-level imports
    ``re``, ``urllib``, ``urllib2`` and ``yaml``.
    """

    # URL templates; the first ``%s`` is always the language sub-domain.
    url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
    url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
    url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'

    # Upper bound on chained redirects followed by article(); guards against
    # redirect loops that would otherwise never terminate.
    _MAX_REDIRECTS = 5

    # "#REDIRECT [[Target]]" with optional whitespace and an optional colon
    # between the keyword and the link.
    _REDIRECT_RE = re.compile(r'(?i)#REDIRECT\s*:?\s*\[\[([^\[\]]+)\]\]')

    def __init__(self, lang):
        """Create a client for the ``<lang>.wikipedia.org`` edition."""
        self.lang = lang

    def __fetch(self, url):
        """Open ``url`` and return the response object.

        Raises:
            WikipediaError: with the HTTP status code on an HTTP error, or
                with the failure reason on any other URL error.
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        try:
            return urllib2.urlopen(request)
        # HTTPError is a subclass of URLError, so it must be handled first.
        except urllib2.HTTPError as e:
            raise WikipediaError(e.code)
        except urllib2.URLError as e:
            raise WikipediaError(e.reason)

    def __read(self, url):
        """Fetch ``url``, return the whole body and close the response."""
        response = self.__fetch(url)
        try:
            return response.read()
        finally:
            response.close()

    @staticmethod
    def _redirect_target(content):
        """Return the title a redirect page points at, or None if unparsable.

        Link display text (``|label``) and section anchors (``#section``)
        are dropped so the result can be requested as an article title.
        """
        match = Wikipedia._REDIRECT_RE.match(content)
        if match is None:
            return None
        target = match.group(1).split('|', 1)[0].split('#', 1)[0].strip()
        return target or None

    @staticmethod
    def _thumb_url(resolved, lang, thumb):
        """Build the thumbnail URL for an already-resolved image URL.

        ``resolved`` is the final URL of the full-size file, ``thumb`` the
        requested width in pixels (string or int). The ``thumb/`` segment is
        inserted exactly once, after ``/commons/`` when present, otherwise
        after ``/<lang>/``.
        """
        # Reuse the file name from the resolved URL: it is already encoded
        # the way the server spells it.
        filename = resolved.rsplit('/', 1)[-1]
        url = '%s/%spx-%s' % (resolved, thumb, filename)
        if '/commons/' in url:
            return url.replace('/commons/', '/commons/thumb/', 1)
        marker = '/' + lang + '/'
        # count=1: a hash directory further down the path may equal the
        # language code (e.g. ".../d/de/...") and must not be rewritten.
        return url.replace(marker, marker + 'thumb/', 1)

    @staticmethod
    def _clean_snippet(snippet):
        """Strip markup tags and normalise whitespace in a search snippet."""
        snippet = re.sub(r'(?m)<.*?>', '', snippet)
        snippet = re.sub(r'\s+', ' ', snippet)
        snippet = snippet.replace(' . ', '. ')
        snippet = snippet.replace(' , ', ', ')
        return snippet.strip()

    def article(self, article):
        """Return the raw wikitext of ``article``, following redirects.

        Raises:
            WikipediaError: if a request fails, a redirect page has no
                parsable target, or more than ``_MAX_REDIRECTS`` redirects
                are chained.
        """
        title = article
        for _ in range(self._MAX_REDIRECTS + 1):
            url = self.url_article % (self.lang, urllib.quote_plus(title))
            content = self.__read(url)
            if not content.upper().startswith('#REDIRECT'):
                return content
            title = self._redirect_target(content)
            if title is None:
                raise WikipediaError('Can\'t found redirect article.')
        raise WikipediaError('Too many redirects.')

    def image(self, image, thumb=None):
        """Return the bytes of the file ``image``.

        Args:
            image: File name as used on the wiki (not URL-encoded).
            thumb: Optional thumbnail width in pixels (string or int). When
                given, the scaled thumbnail is returned instead of the
                full-size file.

        Raises:
            WikipediaError: if a request fails.
        """
        # Quote the name like article() does so spaces and other reserved
        # characters do not produce an invalid request.
        url = self.url_image % (self.lang, urllib.quote_plus(image))
        result = self.__fetch(url)
        try:
            if not thumb:
                return result.read()
            # Only the final (post-redirect) location is needed for the
            # thumbnail, so the full-size body is not downloaded.
            resolved = result.geturl()
        finally:
            result.close()
        return self.__read(self._thumb_url(resolved, self.lang, thumb))

    def search(self, query, page=1, limit=10):
        """Search articles and return one page of results.

        Args:
            query: Search text.
            page: 1-based page number.
            limit: Number of results per page.

        Returns:
            A list of dicts with the keys ``'title'``, ``'snippet'`` (markup
            removed) and ``'wordcount'``; empty when nothing matched.

        Raises:
            WikipediaError: if the request fails.
        """
        offset = (page - 1) * limit
        url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
        content = self.__read(url)
        # SECURITY: the response is untrusted network data, so it is parsed
        # with safe_load instead of yaml.load (which can construct arbitrary
        # Python objects).
        # NOTE(review): this depends on the API serving format=yaml; confirm
        # the target MediaWiki version still supports it.
        parsed = yaml.safe_load(content)
        hits = parsed['query']['search'] or []
        return [
            {
                'title': hit['title'].strip(),
                'snippet': self._clean_snippet(hit['snippet']),
                'wordcount': hit['wordcount'],
            }
            for hit in hits
        ]
if __name__ == '__main__':
    # Smoke test against the live "simple" edition: each call raises
    # WikipediaError on failure, so reaching the final line means all three
    # request types succeeded.
    wiki = Wikipedia('simple')
    wiki.article('Uruguay')
    wiki.image('Bono_at_the_2009_Tribeca_Film_Festival.jpg', '640')
    wiki.search('Wikipedia')
    # Parenthesised form prints the same text on Python 2 and is also valid
    # Python 3 syntax.
    print('OK')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement