Guest User

wikipedia.py

a guest
Dec 16th, 2010
7,945
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2.  
  3. import re
  4. import yaml
  5. import urllib
  6. import urllib2
  7.  
  8. class WikipediaError(Exception):
  9.     pass
  10.  
  11. class Wikipedia:
  12.     url_article = 'http://%s.wikipedia.org/w/index.php?action=raw&title=%s'
  13.     url_image = 'http://%s.wikipedia.org/w/index.php?title=Special:FilePath&file=%s'
  14.     url_search = 'http://%s.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&sroffset=%d&srlimit=%d&format=yaml'
  15.    
  16.     def __init__(self, lang):
  17.         self.lang = lang
  18.    
  19.     def __fetch(self, url):
  20.         request = urllib2.Request(url)
  21.         request.add_header('User-Agent', 'Mozilla/5.0')
  22.        
  23.         try:
  24.             result = urllib2.urlopen(request)
  25.         except urllib2.HTTPError, e:
  26.             raise WikipediaError(e.code)
  27.         except urllib2.URLError, e:
  28.             raise WikipediaError(e.reason)
  29.        
  30.         return result
  31.    
  32.     def article(self, article):
  33.         url = self.url_article % (self.lang, urllib.quote_plus(article))
  34.         content = self.__fetch(url).read()
  35.        
  36.         if content.upper().startswith('#REDIRECT'):
  37.             match = re.match('(?i)#REDIRECT \[\[([^\[\]]+)\]\]', content)
  38.            
  39.             if not match == None:
  40.                 return self.article(match.group(1))
  41.            
  42.             raise WikipediaError('Can\'t found redirect article.')
  43.        
  44.         return content
  45.    
  46.     def image(self, image, thumb=None):
  47.         url = self.url_image % (self.lang, image)
  48.         result = self.__fetch(url)
  49.         content = result.read()
  50.        
  51.         if thumb:
  52.             url = result.geturl() + '/' + thumb + 'px-' + image
  53.             url = url.replace('/commons/', '/commons/thumb/')
  54.             url = url.replace('/' + self.lang + '/', '/' + self.lang + '/thumb/')
  55.            
  56.             return self.__fetch(url).read()
  57.        
  58.         return content
  59.    
  60.     def search(self, query, page=1, limit=10):
  61.         offset = (page - 1) * limit
  62.         url = self.url_search % (self.lang, urllib.quote_plus(query), offset, limit)
  63.         content = self.__fetch(url).read()
  64.        
  65.         parsed = yaml.load(content)
  66.         search = parsed['query']['search']
  67.        
  68.         results = []
  69.        
  70.         if search:
  71.             for article in search:
  72.                 title = article['title'].strip()
  73.                
  74.                 snippet = article['snippet']
  75.                 snippet = re.sub(r'(?m)<.*?>', '', snippet)
  76.                 snippet = re.sub(r'\s+', ' ', snippet)
  77.                 snippet = snippet.replace(' . ', '. ')
  78.                 snippet = snippet.replace(' , ', ', ')
  79.                 snippet = snippet.strip()
  80.                
  81.                 wordcount = article['wordcount']
  82.                
  83.                 results.append({
  84.                     'title' : title,
  85.                     'snippet' : snippet,
  86.                     'wordcount' : wordcount
  87.                 })
  88.        
  89.         # yaml.dump(results, default_style='', default_flow_style=False,
  90.         #     allow_unicode=True)
  91.         return results
  92.  
  93. if __name__ == '__main__':
  94.     wiki = Wikipedia('simple')
  95.     wiki.article('Uruguay')
  96.     wiki.image('Bono_at_the_2009_Tribeca_Film_Festival.jpg', '640')
  97.     wiki.search('Wikipedia')
  98.    
  99.     print 'OK'
RAW Paste Data