import requests  # generates HTTP requests for web pages
from bs4 import BeautifulSoup  # parses HTML pages
import urllib.parse

class Spider:  # spider that collects the links of a website
    # called to create an instance of the class
    def __init__(self, base_url, max_pages=0, login_url=None, user=None, password=None):
        self.redirect_url = ""
        self.base_url = base_url  # main URL of the website
        self.__links_to_open = []  # links not yet opened
        self.opened_links = []  # links already opened
        self.total = 0  # total number of links found in the website
        self.max_pages = max_pages  # maximum pages to crawl (0 = unlimited)
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # broken links found
        self.login_url = login_url  # login page URL, if any
        self.credentials = [user, password]

    # total number of links opened so far
    def total_links(self):
        self.total = len(self.opened_links)
        return self.total

    # True while the crawler may keep going (page limit not yet reached; 0 means no limit)
    def __crawled_max__(self):
        return (self.max_pages == 0) or (self.total_links() < self.max_pages)

    # is the link already listed (opened or queued)?
    def __is_link_listed__(self, link):
        url = urllib.parse.urljoin(self.base_url, link)
        return url in self.opened_links or link in self.__links_to_open

    # report the broken links found
    def invalid_links(self):
        if self.invalid_links_count > 0:
            print("Found %s broken links:" % self.invalid_links_count)
            for link in self.invalid_links_list:
                print(link)
        else:
            print("No broken links found.")

    # check whether the given URL belongs to the website,
    # i.e. is in the website's domain
    def __in_domain__(self, url):
        return str(url).startswith(self.base_url)

    # check the URL's protocol
    def __check_protocol__(self, url):
        parsed = urllib.parse.urlparse(url)  # parse the URL into its components
        if parsed.scheme.lower() in ("http", "https"):  # accept only HTTP(S)
            return True
        self.invalid_links_count += 1
        self.invalid_links_list.append(url)
        return False

    # check that the page opens and exists
    def __is_response_ok__(self, url):
        try:
            response = requests.get(url)  # fetch the given URL
        except requests.RequestException:  # connection error, timeout, etc.
            response = None
        # status code 200 means OK; no problems with the page
        if response is not None and response.status_code == 200:
            return True
        self.invalid_links_count += 1
        self.invalid_links_list.append(url)
        return False

    # check that the URL serves an HTML page
    def __is_html_page__(self, url):
        response = requests.head(url)
        # headers.get avoids a KeyError when the header is missing
        if "text/html" in response.headers.get("content-type", ""):
            return True
        self.invalid_links_count += 1
        self.invalid_links_list.append(url)
        return False

    # a URL is good if it is in-domain, uses HTTP(S), responds with 200, and serves HTML
    def __is_url_good__(self, url):
        return self.__in_domain__(url) and self.__check_protocol__(url) and self.__is_response_ok__(url) and self.__is_html_page__(url)

    # collect all links on a given page
    def __get_page_links__(self, page):
        # list of all <a> tags on the page
        links_tags = page.find_all("a")
        # go through each link
        for link in links_tags:
            link_href = link.get("href")  # the tag's href, e.g. <a href="page.html"> -> page.html
            # skip missing hrefs and bare fragment links, and anything already listed
            if link_href and link_href != "#" and not self.__is_link_listed__(link_href):
                # add the link to the list of links to open
                self.__links_to_open.append(link_href)

    # open a page and parse its content
    def __open_url__(self, url):
        response = requests.get(url)  # fetch the given URL
        content = BeautifulSoup(response.content, "html.parser")
        self.__get_page_links__(content)  # collect the links found in the content

    # main spider function; builds the spider's web
    def create_web(self):
        self.__open_url__(self.base_url)  # open and scan the main URL first
        # while there are still links to open and the page limit is not reached
        while len(self.__links_to_open) != 0 and self.__crawled_max__():
            # start from the last link in the list
            link = self.__links_to_open[-1]
            url = urllib.parse.urljoin(self.base_url, link)  # join the main URL with the page link
            if self.__is_url_good__(url):  # URL is valid and working
                self.__open_url__(url)  # open the page
            self.opened_links.append(url)  # record the link as opened
            self.__links_to_open.remove(link)  # drop it from the list of links to open

    # print all links of the website
    def print_all_links(self):
        for link in self.opened_links:
            print(link)

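# The constructor stores login_url and credentials, but the class never uses them.
# Below is a minimal sketch of how a login step could look, assuming the target
# site accepts a plain form POST; the field names "username" and "password" are
# hypothetical and must match the site's real login form.
def make_logged_in_session(login_url, user, password):
    session = requests.Session()  # a Session keeps cookies across requests
    session.post(login_url, data={"username": user, "password": password})
    return session
# A Spider variant could then call session.get(url) instead of requests.get(url)
# so that pages are fetched with the logged-in cookies.
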
url = "http://google.com/"
crawl = Spider(url)
crawl.create_web()
print(crawl.total_links())
crawl.invalid_links()  # prints its own report; wrapping it in print() would print None
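
# A bounded run is often safer: max_pages caps how many pages the spider opens.
# A sketch; "http://example.com/" is a placeholder for a site you may crawl.
bounded = Spider("http://example.com/", max_pages=10)
bounded.create_web()
bounded.print_all_links()  # every page the spider opened
bounded.invalid_links()  # report of any broken links it hit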