pygoogle.py
Sep 24th, 2013
#!/usr/bin/python
"""
Google AJAX Search Module
http://code.google.com/apis/ajaxsearch/documentation/reference.html
"""
try:
    import simplejson as json
except ImportError:
    # fall back to the standard library json module (Python 2.6+)
    import json
import urllib

__author__ = "Kiran Bandla"
__version__ = "0.1"
URL = 'http://ajax.googleapis.com/ajax/services/search/web?'

#Web Search Specific Arguments
#http://code.google.com/apis/ajaxsearch/documentation/reference.html#_fonje_web
#SAFE, FILTER
"""
SAFE
This optional argument supplies the search safety level, which may be one of:
   * safe=active - enables the highest level of safe search filtering
   * safe=moderate - enables moderate safe search filtering (default)
   * safe=off - disables safe search filtering
"""
SAFE_ACTIVE     = "active"
SAFE_MODERATE   = "moderate"
SAFE_OFF        = "off"

"""
FILTER
This optional argument controls turning the duplicate content filter on or off:

   * filter=0 - turns off the duplicate content filter
   * filter=1 - turns on the duplicate content filter (default)
"""
FILTER_OFF  = 0
FILTER_ON   = 1

#Standard URL Arguments
#http://code.google.com/apis/ajaxsearch/documentation/reference.html#_fonje_args
"""
RSZ
This optional argument supplies the number of results that the application would like to receive.
A value of small indicates a small result set size, or 4 results.
A value of large indicates a large result set, or 8 results. If this argument is not supplied, a value of small is assumed.
"""
RSZ_SMALL = "small"
RSZ_LARGE = "large"

"""
HL
This optional argument supplies the host language of the application making the request.
If this argument is not present, the system will choose a value based on the Accept-Language HTTP header.
If this header is not present, a value of en is assumed.
"""

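# For reference, a request assembled from the arguments documented above looks
# roughly like the line below (illustrative only: urlencode may order the
# parameters differently, and the query value is a made-up example):
#
#   http://ajax.googleapis.com/ajax/services/search/web?v=1.0&q=python&start=0&rsz=large&safe=off&filter=1&hl=en
#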
class pygoogle:

    def __init__(self,query,pages=10,hl='en'):
        self.pages = pages          #Number of pages. Default 10
        self.query = query
        self.filter = FILTER_ON     #Controls turning the duplicate content filter on or off. On = 1.
        self.rsz = RSZ_LARGE        #Results per page. small = 4 / large = 8
        self.safe = SAFE_OFF        #SafeSearch level - active/moderate/off
        self.hl = hl                #Host language. Defaults to English (en)

    def __search__(self,print_results = False):
        """Queries Google page by page and returns the raw JSON responses."""
        results = []
        for page in range(0,self.pages):
            rsz = 8
            if self.rsz == RSZ_SMALL:
                rsz = 4
            args = {'q' : self.query,
                    'v' : '1.0',
                    'start' : page*rsz,
                    'rsz': self.rsz,
                    'safe' : self.safe,
                    'filter' : self.filter,
                    'hl'    : self.hl
                    }
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL+q)
            data = json.loads(search_results.read())
            if print_results:
                if data['responseStatus'] == 200:
                    for result in data['responseData']['results']:
                        if result:
                            print '[%s]'%(urllib.unquote(result['titleNoFormatting']))
                            print result['content'].strip("<b>...</b>").replace("<b>",'').replace("</b>",'').replace("&#39;","'").strip()
                            print urllib.unquote(result['unescapedUrl'])+'\n'
            results.append(data)
        return results

    def search(self):
        """Returns a dict of Title/URLs"""
        results = {}
        for data in self.__search__():
            for result in data['responseData']['results']:
                if result:
                    title = urllib.unquote(result['titleNoFormatting'])
                    results[title] = urllib.unquote(result['unescapedUrl'])
        return results

    def search_page_wise(self):
        """Returns a dict of page-wise urls"""
        results = {}
        for page in range(0,self.pages):
            args = {'q' : self.query,
                    'v' : '1.0',
                    'start' : page*8,       #8 results per page with RSZ_LARGE
                    'rsz': RSZ_LARGE,
                    'safe' : SAFE_OFF,
                    'filter' : FILTER_ON,
                    }
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL+q)
            data = json.loads(search_results.read())
            urls = []
            for result in data['responseData']['results']:
                if result:
                    url = urllib.unquote(result['unescapedUrl'])
                    urls.append(url)
            results[page] = urls
        return results

    def get_urls(self):
        """Returns list of result URLs"""
        results = []
        for data in self.__search__():
            for result in data['responseData']['results']:
                if result:
                    results.append(urllib.unquote(result['unescapedUrl']))
        return results

    def get_result_count(self):
        """Returns the number of results"""
        temp = self.pages
        self.pages = 1
        result_count = 0
        try:
            result_count = self.__search__()[0]['responseData']['cursor']['estimatedResultCount']
        except Exception,e:
            print e
        finally:
            self.pages = temp
        return result_count

    def display_results(self):
        """Prints results (for command line)"""
        self.__search__(True)


if __name__ == "__main__":
    import sys
    query = ' '.join(sys.argv[1:])
    #print pygoogle(' '.join(sys.argv[1:])).display_results()
    g = pygoogle(query)
    print '*Found %s results*'%(g.get_result_count())
    g.pages = 1
    g.display_results()
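
For reference, a minimal usage sketch of the class above, assuming the file is saved as pygoogle.py somewhere on the Python path; the query string "quick brown fox" and the script name example_usage.py are placeholders, not part of the original paste. Note that the Google AJAX Search API this module targets has been deprecated by Google, so live requests may no longer succeed.

#!/usr/bin/python
# example_usage.py - hypothetical caller of the pygoogle module
from pygoogle import pygoogle

# build a search object; pages, rsz, safe and hl can be tweaked after construction
g = pygoogle('quick brown fox', pages=2, hl='en')

# estimated total hits reported by the API
print 'Estimated results: %s' % g.get_result_count()

# dict of {title: url} across the requested pages
for title, url in g.search().items():
    print '%s -> %s' % (title, url)

# flat list of result URLs
print g.get_urls()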