Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from requests import get
- from requests.exceptions import RequestException
- from contextlib import closing
- from bs4 import BeautifulSoup
- from datetime import datetime
- from string import ascii_letters, digits
- """
- Obtains the first page of Google search results for the specified search term
- in html format and writes it to an html file in the working directory.
- """
def user_input():
    """Prompt for a search term until every character is legal.

    Legal characters are ASCII letters, digits, and spaces; this keeps
    the term safe to splice into a URL without percent-encoding.

    :returns: The validated search term string.
    """
    allowed = digits + ascii_letters + " "
    while True:
        search_term = input("Input the search term: ")
        # The original scan ("for char search_term") was a syntax error
        # (missing "in"); all() expresses the membership test directly.
        if all(char in allowed for char in search_term):
            return search_term
        print("Invalid character--try again.")
def parse_url(search_term):
    """Build a Google search URL from a search term.

    Words in the term are joined with '+' (str.split collapses runs of
    whitespace), matching Google's query-string format. This function is
    called by get_and_write_page() below.

    :search_term: The search term input by the user.
    :returns: The URL for the search term.
    """
    # "+".join handles the one-word and many-word cases uniformly,
    # replacing the manual index loop; it also strips the stray
    # leading/trailing spaces the original leaked into one-word URLs.
    url = "https://www.google.com/search?q=" + "+".join(search_term.split())
    print("{} parsed to {}.".format(search_term, url))
    return url
def get_and_write_page():
    """Download the Google results page for a user-supplied search term.

    Prompts for a search term via user_input(), builds the URL with
    parse_url(), fetches the page with check_and_get(), and writes the
    raw bytes to an .html file in the working directory.

    :returns: None
    """
    search_term = user_input()
    url = parse_url(search_term)
    raw_html = check_and_get(url)
    if raw_html is not None:
        # Only report success once data is actually in hand; the
        # original printed "obtained" even when the fetch returned None.
        print("{} obtained from {}.".format(search_term, url))
        filename = "Google search results for " + search_term + ".html"
        # Binary mode: raw_html is the raw bytes of the response body.
        with open(filename, "bw") as f:
            f.write(raw_html)
        print("File {} written.".format(filename))
    else:
        print("No data obtained.")
- """
- The following was obtained, with minor modifications, from the article
- 'Practical Introduction to Web Scraping in Python', by Colin O'Keefe,
- at https://realpython.com/blog/python/python-web-scraping-practical-introduction/.
- It makes an HTTP GET request to a URL and verifies that the
- page contains HTML/XML data. If it doesn't, it displays an error message and
- writes an error log to the working directory. If there is HTML/XML in the
- webpage, check_and_get() extracts the data and returns it.
- """
def is_good_response(resp):
    """Return True if the response seems to be HTML/XML, False otherwise.

    Called by check_and_get() below.

    :resp: Response object from the with-closing block of
        check_and_get() below.
    :returns: True when the status is 200 and the Content-Type header
        mentions 'html'; False otherwise (including a missing header).
    """
    # .get() avoids the KeyError the original raised when the server
    # sent no Content-Type header; the None check is now reachable.
    content_type = resp.headers.get("Content-Type")
    return (resp.status_code == 200 and
            content_type is not None and
            'html' in content_type.lower())
def log_error(e):
    """Write an error message to a timestamped log file and print it.

    :e: Error message string (as built by check_and_get() below).
    :returns: None
    """
    # strftime yields a filename-safe timestamp on every OS; the
    # original concatenated a datetime object to a str, which raises
    # TypeError before anything could be logged.
    logstamp = datetime.today().strftime("%Y-%m-%d_%H-%M-%S") + "-error.log"
    with open(logstamp, 'w') as f:
        f.write(e)
    print(e)
def check_and_get(url):
    """Fetch the contents of a webpage with an HTTP GET request.

    Returns the raw page content when the response looks like HTML/XML;
    otherwise returns None. Request-level failures are reported through
    log_error() and also yield None.

    :url: Full website URL
    :returns: HTML/XML content of webpage or None
    """
    try:
        with closing(get(url, stream=True)) as response:
            # Guard clause: bail out early on a non-HTML/XML response.
            if not is_good_response(response):
                return None
            return response.content
    except RequestException as err:
        log_error("Error during request to {0} : {1}".format(url, str(err)))
        return None
# Script entry point: run the full prompt -> fetch -> write pipeline.
if __name__ == "__main__":
    get_and_write_page()
Add Comment
Please, Sign In to add comment