MickeyLater

Google Search Results Page Downloader

Jan 24th, 2018
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.54 KB | None | 0 0
  1. from requests import get
  2. from requests.exceptions import RequestException
  3. from contextlib import closing
  4. from bs4 import BeautifulSoup
  5. from datetime import datetime
  6. from string import ascii_letters, digits
  7.  
  8.  
  9. """
  10. Obtains the first page of Google search results for the specified search term
  11. in html format and writes it to an html file in the working directory.
  12. """
  13.  
  14. def user_input():
  15.  
  16.     """Obtains the user input as a string representing the search term.
  17.    Verifies that all characters in the search term string are legal.
  18.  
  19.        :returns:       The search term string.
  20.    """
  21.     screen = digits + ascii_letters + " "
  22. #    print(screen)
  23.     while True:
  24.         search_term = input("Input the search term: ")
  25.         scan = [char in screen for char search_term]
  26. #       print(scan)
  27.         if False not in scan:
  28.             return search_term
  29.         else:
  30.             print("Invalid character--try again.")
  31.  
  32.  
  33. def parse_url(search_term):
  34.     """Generates a Google search url from the search term. If the search
  35.    term is more than one word, adds '+' between words. This function is
  36.    called by get_and_write_page() below.
  37.  
  38.        :search_term:       The search term input by the user.
  39.  
  40.        :returns:               The url for the search term.
  41.    """
  42.     url = "https://www.google.com/search?q="
  43.     search_term_list = search_term.split()
  44.     if len(search_term_list) > 1:
  45.         for index in range(len(search_term_list)-1):
  46.             url += search_term_list[index] + "+"
  47.         url += search_term_list[-1]
  48.     else:
  49.         url += search_term
  50.     print("{} parsed to {}.".format(search_term, url))
  51.     return url
  52.  
  53.  
  54. def get_and_write_page():
  55.     """
  56.    Calls parse_url() above to obtain url from search term input by the
  57.    user from user_input() above.
  58.    Calls the check_and_get() function below to download the Google search
  59.    page in html format.
  60.    Writes downloaded data to disk in binary format as an html file in the
  61.    working directory.
  62.  
  63.        :returns:           None
  64.    """
  65.     search_term = user_input()
  66.     url = parse_url(search_term)
  67.     raw_html = check_and_get(url)
  68.     print("{} obtained from {}.".format(search_term, url))
  69.     if raw_html is not None:
  70.         filename = "Google search results for " + search_term +".html"
  71.         with open(filename, "bw") as f:
  72.             f.write(raw_html)
  73.         print("File {} written.".format(filename))
  74.     else:
  75.         print("No data obtained.")
  76.  
  77. """
  78. The following was obtained, with minor modifications, from the article
  79. 'Practical Introduction to Web Scraping in Python', by Colin O'Keefe,
  80. at https://realpython.com/blog/python/python-web-scraping-practical-introduction/.
  81. It requests a webpage at a given URL and verifies that the
  82. page contains HTML/XML data. If it doesn't, it displays an error message and
  83. writes an error log to the working directory. If there is HTML/XML in the
  84. webpage, check_and_get() extracts the data and returns it.
  85. """
  86.  
  87. def is_good_response(resp):
  88.     """
  89.    Returns True if the response seems to be HTML/XML, False otherwise.
  90.    Called by check_and_get() below.
  91.  
  92.        :resp:      Response from with closing() method of check_and_get()
  93.                    below.
  94.  
  95.        :returns:   boolean value representing whether there is HTML/XML
  96.                    in the webpage at the URL
  97.    """
  98.     content_type = resp.headers["Content-Type"].lower()
  99.     return (resp.status_code == 200 and
  100.             content_type is not None and
  101.             content_type.find('html') > -1)
  102.  
  103.  
  104. def log_error(e):
  105.     """In case of error, writes log file to working directory and prints
  106.       error to console.
  107.  
  108.        :e:         Error type as returned from is_good_response() above.
  109.  
  110.        :returns:   None
  111.  
  112.    """
  113.     logstamp = datetime.today() + "-error.log"
  114.     with open(logstamp, 'w') as f:
  115.         f.write(e)
  116.  
  117.     print(e)
  118.  
  119.  
  120. def check_and_get(url):
  121.     """
  122.    Attempts to get the contents of a webpage by making an HTTP GET request.
  123.    if the content-type response is HTML/XML, returns the text content,
  124.    else returns None.
  125.  
  126.        :url:           Full website URL
  127.  
  128.        :returns:       HTML/XML content of webpage or None
  129.    """
  130.     try:
  131.         with closing(get(url, stream=True)) as resp:
  132.             if is_good_response(resp):
  133.                 return resp.content
  134.             else:
  135.                 return None
  136.  
  137.     except RequestException as e:
  138.         log_error("Error during request to {0} : {1}".format(url, str(e)))
  139.  
  140.  
  141. if __name__=='__main__':
  142.     get_and_write_page()
Add Comment
Please, Sign In to add comment