#!/usr/bin/python
"""
Google AJAX Search Module
http://code.google.com/apis/ajaxsearch/documentation/reference.html
"""
try:
    import simplejson as json
except ImportError:
    import json
import urllib

__author__ = "Kiran Bandla"
__version__ = "0.1"
URL = 'http://ajax.googleapis.com/ajax/services/search/web?'

# Web Search Specific Arguments
# http://code.google.com/apis/ajaxsearch/documentation/reference.html#_fonje_web
# SAFE, FILTER
"""
SAFE
This optional argument supplies the search safety level which may be one of:
   * safe=active - enables the highest level of safe search filtering
   * safe=moderate - enables moderate safe search filtering (default)
   * safe=off - disables safe search filtering
"""
SAFE_ACTIVE     = "active"
SAFE_MODERATE   = "moderate"
SAFE_OFF        = "off"

"""
FILTER
This optional argument controls turning on or off the duplicate content filter:

   * filter=0 - Turns off the duplicate content filter
   * filter=1 - Turns on the duplicate content filter (default)

"""
FILTER_OFF  = 0
FILTER_ON   = 1
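# Illustrative only: both settings end up as ordinary query-string pairs once
# the request is built below, e.g.
#   urllib.urlencode({'safe': SAFE_ACTIVE})   # -> 'safe=active'
#   urllib.urlencode({'filter': FILTER_OFF})  # -> 'filter=0'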

# Standard URL Arguments
# http://code.google.com/apis/ajaxsearch/documentation/reference.html#_fonje_args
"""
RSZ
This optional argument supplies the number of results that the application would like to receive.
A value of small indicates a small result set size, or 4 results.
A value of large indicates a large result set, or 8 results. If this argument is not supplied, a value of small is assumed.
"""
RSZ_SMALL = "small"
RSZ_LARGE = "large"
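# Illustrative only: the result-set size determines how far 'start' must jump
# between pages (the pattern used in __search__ below), e.g. with RSZ_LARGE the
# third page of results begins at start = 2 * 8 = 16.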


class pygoogle:

    def __init__(self, query, pages=10):
        self.pages = pages          # Number of result pages to fetch. Default 10
        self.query = query
        self.filter = FILTER_ON     # Controls turning on or off the duplicate content filter. On = 1
        self.rsz = RSZ_LARGE        # Results per page. small = 4 / large = 8
        self.safe = SAFE_OFF        # SafeSearch level - active/moderate/off
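    # Illustrative usage (hypothetical query string; attribute names as defined
    # above):
    #   g = pygoogle('site:example.com widgets', pages=2)
    #   g.rsz = RSZ_SMALL        # 4 results per page instead of 8
    #   g.safe = SAFE_MODERATE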

    def __search__(self, print_results=False):
        """Fetches self.pages pages of raw API responses and returns them as a list."""
        results = []
        for page in range(0, self.pages):
            # 'start' is an absolute result offset, so step by the page size
            rsz = 8
            if self.rsz == RSZ_SMALL:
                rsz = 4
            args = {'q': self.query,
                    'v': '1.0',
                    'start': page * rsz,
                    'rsz': self.rsz,
                    'safe': self.safe,
                    'filter': self.filter,
                    }
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL + q)
            data = json.loads(search_results.read())
            if print_results:
                if data['responseStatus'] == 200:
                    for result in data['responseData']['results']:
                        if result:
                            # Strip the <b> highlighting and HTML entities Google adds to snippets
                            print '[%s]' % (urllib.unquote(result['titleNoFormatting']))
                            print result['content'].strip("<b>...</b>").replace("<b>", '').replace("</b>", '').replace("&#39;", "'").strip()
                            print urllib.unquote(result['unescapedUrl']) + '\n'
            results.append(data)
        return results
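    # Sketch of the response shape the parsing above and below relies on
    # (abridged; only the keys this module actually reads are shown):
    #   {
    #     "responseStatus": 200,
    #     "responseData": {
    #       "results": [
    #         {"titleNoFormatting": ..., "content": ..., "unescapedUrl": ...},
    #         ...
    #       ],
    #       "cursor": {"estimatedResultCount": ...}
    #     }
    #   }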

    def search(self):
        """Returns a dict of Title/URLs"""
        results = {}
        for data in self.__search__():
            for result in data['responseData']['results']:
                if result:
                    title = urllib.unquote(result['titleNoFormatting'])
                    results[title] = urllib.unquote(result['unescapedUrl'])
        return results
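    # Illustrative usage (hypothetical output; the dict maps result titles to
    # their unescaped URLs):
    #   g = pygoogle('pygoogle')
    #   for title, url in g.search().items():
    #       print '%s -> %s' % (title, url)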

    def search_page_wise(self):
        """Returns a dict of page-wise urls"""
        results = {}
        for page in range(0, self.pages):
            args = {'q': self.query,
                    'v': '1.0',
                    'start': page * 8,   # RSZ_LARGE returns 8 results per page
                    'rsz': RSZ_LARGE,
                    'safe': SAFE_OFF,
                    'filter': FILTER_ON,
                    }
            q = urllib.urlencode(args)
            search_results = urllib.urlopen(URL + q)
            data = json.loads(search_results.read())
            urls = []
            for result in data['responseData']['results']:
                if result:
                    url = urllib.unquote(result['unescapedUrl'])
                    urls.append(url)
            results[page] = urls
        return results
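    # Illustrative usage (hypothetical output; keys are zero-based page numbers,
    # values are the URLs found on that page):
    #   g = pygoogle('pygoogle', pages=2)
    #   pages = g.search_page_wise()   # {0: [url, url, ...], 1: [...]}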

    def get_urls(self):
        """Returns list of result URLs"""
        results = []
        for data in self.__search__():
            for result in data['responseData']['results']:
                if result:
                    results.append(urllib.unquote(result['unescapedUrl']))
        return results

    def get_result_count(self):
        """Returns the estimated number of results"""
        # Only one page is needed to read the estimate, so temporarily
        # override self.pages and restore it afterwards.
        temp = self.pages
        self.pages = 1
        result_count = 0
        try:
            # The API reports the estimate as a string, e.g. '12345'
            result_count = self.__search__()[0]['responseData']['cursor']['estimatedResultCount']
        except Exception, e:
            print e
        finally:
            self.pages = temp
        return result_count
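    # Illustrative usage (hypothetical value):
    #   g = pygoogle('pygoogle')
    #   print 'about %s results' % g.get_result_count()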

    def display_results(self):
        """Prints results (for command line)"""
        self.__search__(True)


if __name__ == "__main__":
    import sys
    query = ' '.join(sys.argv[1:])
    #print pygoogle(' '.join(sys.argv[1:])).display_results()
    g = pygoogle(query)
    print '*Found %s results*' % (g.get_result_count())
    g.pages = 1
    g.display_results()
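
# Illustrative command-line usage, assuming the module is saved as gldk.py
# (the query is taken from the arguments; output format follows __search__):
#   $ python gldk.py python tutorial
#   *Found <estimated count> results*
#   [Result title]
#   snippet text...
#   http://example.com/result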