SHARE
TWEET

wikipedia.py

a guest Dec 16th, 2010 7,204 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2.  
  3. import re
  4. import yaml
  5. import urllib
  6. import urllib2
  7.  
  8. class WikipediaError(Exception):
  9.     pass
  10.  
  11. class Wikipedia:
  12.     url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
  13.     url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
  14.     url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'
  15.    
  16.     def __init__(self, lang):
  17.         self.lang = lang
  18.    
  19.     def __fetch(self, url):
  20.         request = urllib2.Request(url)
  21.         request.add_header('User-Agent', 'Mozilla/5.0')
  22.        
  23.         try:
  24.             result = urllib2.urlopen(request)
  25.         except urllib2.HTTPError, e:
  26.             raise WikipediaError(e.code)
  27.         except urllib2.URLError, e:
  28.             raise WikipediaError(e.reason)
  29.        
  30.         return result
  31.    
  32.     def article(self, article):
  33.         url = self.url_article % (self.lang, urllib.quote_plus(article))
  34.         content = self.__fetch(url).read()
  35.        
  36.         if content.upper().startswith('#REDIRECT'):
  37.             match = re.match('(?i)#REDIRECT \[\[([^\[\]]+)\]\]', content)
  38.            
  39.             if not match == None:
  40.                 return self.article(match.group(1))
  41.            
  42.             raise WikipediaError('Can\'t found redirect article.')
  43.        
  44.         return content
  45.    
  46.     def image(self, image, thumb=None):
  47.         url = self.url_image % (self.lang, image)
  48.         result = self.__fetch(url)
  49.         content = result.read()
  50.        
  51.         if thumb:
  52.             url = result.geturl() + '/' + thumb + 'px-' + image
  53.             url = url.replace('/commons/', '/commons/thumb/')
  54.             url = url.replace('/' + self.lang + '/', '/' + self.lang + '/thumb/')
  55.            
  56.             return self.__fetch(url).read()
  57.        
  58.         return content
  59.    
  60.     def search(self, query, page=1, limit=10):
  61.         offset = (page - 1) * limit
  62.         url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
  63.         content = self.__fetch(url).read()
  64.        
  65.         parsed = yaml.load(content)
  66.         search = parsed['query']['search']
  67.        
  68.         results = []
  69.        
  70.         if search:
  71.             for article in search:
  72.                 title = article['title'].strip()
  73.                
  74.                 snippet = article['snippet']
  75.                 snippet = re.sub(r'(?m)<.*?>', '', snippet)
  76.                 snippet = re.sub(r'\s+', ' ', snippet)
  77.                 snippet = snippet.replace(' . ', '. ')
  78.                 snippet = snippet.replace(' , ', ', ')
  79.                 snippet = snippet.strip()
  80.                
  81.                 wordcount = article['wordcount']
  82.                
  83.                 results.append({
  84.                     'title' : title,
  85.                     'snippet' : snippet,
  86.                     'wordcount' : wordcount
  87.                 })
  88.        
  89.         # yaml.dump(results, default_style='', default_flow_style=False,
  90.         #     allow_unicode=True)
  91.         return results
  92.  
  93. if __name__ == '__main__':
  94.     wiki = Wikipedia('simple')
  95.     wiki.article('Uruguay')
  96.     wiki.image('Bono_at_the_2009_Tribeca_Film_Festival.jpg', '640')
  97.     wiki.search('Wikipedia')
  98.    
  99.     print 'OK'
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top