#!/usr/bin/env python

# Python bindings to the Google search engine
# Copyright (c) 2009-2013, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the copyright holder nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

__all__ = ['search']

import os
import sys
import time

if sys.version_info[0] > 2:
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
else:
    from cookielib import LWPCookieJar
    from urllib import quote_plus
    from urllib2 import Request, urlopen
    from urlparse import urlparse, parse_qs

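# (Illustrative note: either branch above provides the same names. For
# example, quote_plus('hello world') returns 'hello+world' on both
# Python 2 and Python 3.)
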
# Lazy import of BeautifulSoup.
BeautifulSoup = None

# URL templates to make Google searches.
url_home          = "http://www.google.%(tld)s/"
url_search        = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search"
url_next_page     = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"
url_search_num    = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
url_next_page_num = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"

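# For example, with tld='com', lang='en' and query='hello+world', the
# url_search template above expands to:
#   http://www.google.com/search?hl=en&q=hello+world&btnG=Google+Search
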
# Cookie jar. Stored at the user's home folder.
home_folder = os.getenv('HOME')
if not home_folder:
    home_folder = os.getenv('USERPROFILE')  # Windows home folder.
    if not home_folder:
        home_folder = '.'   # Use the current folder on error.
cookie_jar = LWPCookieJar(os.path.join(home_folder, '.google-cookie'))
try:
    cookie_jar.load()
except Exception:
    pass

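# (The persisted jar lets Google's preference cookies survive between runs,
# so repeated searches reuse the same "session" instead of starting fresh.)
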
# Request the given URL and return the response page, using the cookie jar.
def get_page(url):
    """
    Request the given URL and return the response page, using the cookie jar.

    @type  url: str
    @param url: URL to retrieve.

    @rtype:  str
    @return: Web page retrieved for the given URL. Note that under Python 3
        this is a bytes object, not a decoded string.

    @raise IOError: An exception is raised on error.
    @raise urllib2.URLError: An exception is raised on error.
    @raise urllib2.HTTPError: An exception is raised on error.
    """
    request = Request(url)
    request.add_header('User-Agent',
                       'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)')
    cookie_jar.add_cookie_header(request)
    response = urlopen(request)
    cookie_jar.extract_cookies(response, request)
    html = response.read()
    response.close()
    cookie_jar.save()
    return html

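# A minimal usage sketch for get_page() (assumes network access; the URL is
# only an example):
#
#     html = get_page('http://www.google.com/')
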
# Filter links found in the Google result pages HTML code.
# Returns None if the link doesn't yield a valid result.
def filter_result(link):
    try:

        # Valid results are absolute URLs not pointing to a Google domain
        # like images.google.com or googleusercontent.com
        o = urlparse(link, 'http')
        if o.netloc and 'google' not in o.netloc:
            return link

        # Decode hidden URLs.
        if link.startswith('/url?'):
            link = parse_qs(o.query)['q'][0]

            # Valid results are absolute URLs not pointing to a Google domain
            # like images.google.com or googleusercontent.com
            o = urlparse(link, 'http')
            if o.netloc and 'google' not in o.netloc:
                return link

    # Otherwise, or on error, return None.
    except Exception:
        pass
    return None

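# For example, filter_result() behaves like this:
#
#     filter_result('http://example.com/page')           ->  the same URL
#     filter_result('/url?q=http://example.com/&sa=U')   ->  'http://example.com/'
#     filter_result('http://images.google.com/whatever') ->  None
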
# Returns a generator that yields URLs.
def search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0):
    """
    Search the given query string using Google.

    @type  query: str
    @param query: Query string. Must NOT be url-encoded.

    @type  tld: str
    @param tld: Top level domain.

    @type  lang: str
    @param lang: Language.

    @type  num: int
    @param num: Number of results per page.

    @type  start: int
    @param start: First result to retrieve.

    @type  stop: int
    @param stop: Last result to retrieve.
        Use C{None} to keep searching forever.

    @type  pause: float
    @param pause: Lapse to wait between HTTP requests.
        A lapse too long will make the search slow, but a lapse too short may
        cause Google to block your IP. Your mileage may vary!

    @rtype:  generator
    @return: Generator (iterator) that yields found URLs. If the C{stop}
        parameter is C{None} the iterator will loop forever.
    """

    # Lazy import of BeautifulSoup.
    # Try to use BeautifulSoup 4 if available, fall back to 3 otherwise.
    global BeautifulSoup
    if BeautifulSoup is None:
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            from BeautifulSoup import BeautifulSoup

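    # (bs4 keeps the BeautifulSoup 3 method names such as findAll as
    # aliases, so the parsing code below works with either version.)
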
    # Set of hashes for the results found.
    # This is used to avoid repeated results.
    hashes = set()

    # Prepare the search string.
    query = quote_plus(query)

    # Grab the cookie from the home page.
    get_page(url_home % vars())

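    # (The "% vars()" trick fills each URL template from the local variables
    # in scope here: tld, lang, query, num and start.)
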
    # Prepare the URL of the first request.
    if start:
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()
    else:
        if num == 10:
            url = url_search % vars()
        else:
            url = url_search_num % vars()

    # Loop until we reach the maximum result, if any (otherwise, loop forever).
    while not stop or start < stop:

        # Sleep between requests.
        time.sleep(pause)

        # Request the Google Search results page.
        html = get_page(url)

        # Parse the response and process every anchored URL.
        # Guard against pages without a results container (e.g. a captcha
        # or error page), which would otherwise raise an AttributeError.
        soup = BeautifulSoup(html)
        results = soup.find(id='search')
        if results is None:
            break
        anchors = results.findAll('a')
        for a in anchors:

            # Get the URL from the anchor tag.
            try:
                link = a['href']
            except KeyError:
                continue

            # Filter invalid links and links pointing to Google itself.
            link = filter_result(link)
            if not link:
                continue

            # Discard repeated results.
            h = hash(link)
            if h in hashes:
                continue
            hashes.add(h)

            # Yield the result.
            yield link

        # End if there are no more results.
        if not soup.find(id='nav'):
            break

        # Prepare the URL for the next request.
        start += num
        if num == 10:
            url = url_next_page % vars()
        else:
            url = url_next_page_num % vars()

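# A minimal usage sketch for the search() generator (assumes network access
# and a working BeautifulSoup install; the query is only an example):
#
#     for url in search('python web scraping', stop=20):
#         print(url)
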
# When run as a script...
if __name__ == "__main__":

    from optparse import OptionParser, IndentedHelpFormatter

    class BannerHelpFormatter(IndentedHelpFormatter):
        "Just a small tweak to optparse to be able to print a banner."
        def __init__(self, banner, *argv, **argd):
            self.banner = banner
            IndentedHelpFormatter.__init__(self, *argv, **argd)
        def format_usage(self, usage):
            msg = IndentedHelpFormatter.format_usage(self, usage)
            return '%s\n%s' % (self.banner, msg)

    # Parse the command line arguments.
    formatter = BannerHelpFormatter(
        "Python script to use the Google search engine\n"
        "By Mario Vilas (mvilas at gmail dot com)\n"
        "https://github.com/MarioVilas/google\n"
    )
    parser = OptionParser(formatter=formatter)
    parser.set_usage("%prog [options] query")
    parser.add_option("--tld", metavar="TLD", type="string", default="com",
                      help="top level domain to use [default: com]")
    parser.add_option("--lang", metavar="LANGUAGE", type="string", default="en",
                      help="produce results in the given language [default: en]")
    parser.add_option("--num", metavar="NUMBER", type="int", default=10,
                      help="number of results per page [default: 10]")
    parser.add_option("--start", metavar="NUMBER", type="int", default=0,
                      help="first result to retrieve [default: 0]")
    parser.add_option("--stop", metavar="NUMBER", type="int", default=0,
                      help="last result to retrieve [default: unlimited]")
    parser.add_option("--pause", metavar="SECONDS", type="float", default=2.0,
                      help="pause between HTTP requests [default: 2.0]")
    (options, args) = parser.parse_args()
    query = ' '.join(args)
    if not query:
        parser.print_help()
        sys.exit(2)
    params = [(k, v) for (k, v) in options.__dict__.items() if not k.startswith('_')]
    params = dict(params)

    # Run the query.
    for url in search(query, **params):
        print(url)
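
# Example command line invocation (the file name google.py is assumed; use
# whatever name this script was saved under):
#
#     $ python google.py --stop 20 "python web scraping"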