Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- from datetime import datetime
- from string import ascii_letters, digits
- """
- Obtains the first page of Google search results for the specified search term
- in html format and writes it to an html file in the working directory.
- """
def user_input():
    """Prompt for a search term until every character is legal.

    Legal characters are ASCII letters, digits, and spaces; this keeps
    the term safe to splice into a URL without percent-encoding.

    :returns: The validated search term string.
    """
    allowed = digits + ascii_letters + " "
    while True:
        search_term = input("Input the search term: ")
        # The original scan ("for char search_term") was a syntax error
        # (missing "in"); all() expresses the membership test directly.
        if all(char in allowed for char in search_term):
            return search_term
        print("Invalid character--try again.")
def parse_url(search_term):
    """Build a Google search URL from a search term.

    Words in the term are joined with '+' (str.split collapses runs of
    whitespace), matching Google's query-string format. This function is
    called by get_and_write_page() below.

    :search_term: The search term input by the user.
    :returns: The URL for the search term.
    """
    # "+".join handles the one-word and many-word cases uniformly,
    # replacing the manual index loop; it also strips the stray
    # leading/trailing spaces the original leaked into one-word URLs.
    url = "https://www.google.com/search?q=" + "+".join(search_term.split())
    print("{} parsed to {}.".format(search_term, url))
    return url
def get_and_write_page():
    """Download the Google results page for a user-supplied search term.

    Prompts for a search term via user_input(), builds the URL with
    parse_url(), fetches the page with check_and_get(), and writes the
    raw bytes to an .html file in the working directory.

    :returns: None
    """
    search_term = user_input()
    url = parse_url(search_term)
    raw_html = check_and_get(url)
    if raw_html is not None:
        # Only report success once data is actually in hand; the
        # original printed "obtained" even when the fetch returned None.
        print("{} obtained from {}.".format(search_term, url))
        filename = "Google search results for " + search_term + ".html"
        # Binary mode: raw_html is the raw bytes of the response body.
        with open(filename, "bw") as f:
            f.write(raw_html)
        print("File {} written.".format(filename))
    else:
        print("No data obtained.")
- """
- The following was obtained, with minor modifications, from the article
- 'Practical Introduction to Web Scraping in Python', by Colin O'Keefe,
- at https://realpython.com/blog/python/python-web-scraping-practical-introduction/.
- It makes an HTTP GET request to a URL and verifies that the
- page contains HTML/XML data. If it doesn't, it displays an error message and
- writes an error log to the working directory. If there is HTML/XML in the
- webpage, check_and_get() extracts the data and returns it.
- """
def is_good_response(resp):
    """Return True if the response seems to be HTML/XML, False otherwise.

    Called by check_and_get() below.

    :resp: Response object from the with-closing block of
        check_and_get() below.
    :returns: True when the status is 200 and the Content-Type header
        mentions 'html'; False otherwise (including a missing header).
    """
    # .get() avoids the KeyError the original raised when the server
    # sent no Content-Type header; the None check is now reachable.
    content_type = resp.headers.get("Content-Type")
    return (resp.status_code == 200 and
            content_type is not None and
            'html' in content_type.lower())
def log_error(e):
    """Write an error message to a timestamped log file and print it.

    :e: Error message string (as built by check_and_get() below).
    :returns: None
    """
    # strftime yields a filename-safe timestamp on every OS; the
    # original concatenated a datetime object to a str, which raises
    # TypeError before anything could be logged.
    logstamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S") + "-error.log"
    with open(logstamp, 'w') as f:
        f.write(e)
    print(e)
def check_and_get(url):
    """Fetch the contents of a webpage with an HTTP GET request.

    Returns the raw page content when the response looks like HTML/XML;
    otherwise returns None. Request-level failures are reported through
    log_error() and also yield None.

    :url: Full website URL
    :returns: HTML/XML content of webpage or None
    """
    try:
        with closing(get(url, stream=True)) as response:
            # Guard clause: bail out early on a non-HTML/XML response.
            if not is_good_response(response):
                return None
            return response.content
    except RequestException as err:
        log_error("Error during request to {0} : {1}".format(url, str(err)))
        return None
# Script entry point: run the full prompt -> fetch -> write pipeline.
if __name__ == "__main__":
    get_and_write_page()
Add Comment
Please, Sign In to add comment