import requests  # generates HTTP requests for web pages
from bs4 import BeautifulSoup  # parses HTML pages
import urllib.parse

class Spider:  # spider that collects the links of a website
    # called to create an instance of the class
    def __init__(self, base_url, max_pages=0, login_url=None, user=None, password=None):
        self.redirect_url = ""
        self.base_url = base_url  # main URL of the website
        self.__links_to_open = []  # links not yet opened
        self.opened_links = []  # links already opened
        self.total = 0  # total number of links found in the website
        self.max_pages = max_pages  # maximum pages to crawl (0 = unlimited)
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # broken links found
        self.login_url = login_url  # login page URL, if any
        self.credentials = [user, password]

    # total number of links opened so far
    def total_links(self):
        self.total = len(self.opened_links)
        return self.total

    # True while the crawler may keep going (page limit not yet reached; 0 means no limit)
    def __crawled_max__(self):
        return (self.max_pages == 0) or (self.total_links() < self.max_pages)

    # is the link already listed (opened or queued)?
    def __is_link_listed__(self, link):
        url = urllib.parse.urljoin(self.base_url, link)
        return url in self.opened_links or link in self.__links_to_open

    # report the broken links found
    def invalid_links(self):
        if self.invalid_links_count > 0:
            print("Found %s broken links:" % self.invalid_links_count)
            for link in self.invalid_links_list:
                print(link)
        else:
            print("No broken links found.")

    # check whether the given URL belongs to the website,
    # i.e. is in the website's domain
    def __in_domain__(self, url):
        return str(url).startswith(self.base_url)

    # check the URL's protocol
    def __check_protocol__(self, url):
        parsed = urllib.parse.urlparse(url)  # parse the URL into its components
        if parsed.scheme.lower() in ("http", "https"):  # accept only HTTP(S)
            return True
        self.invalid_links_count += 1
        self.invalid_links_list.append(url)
        return False

    # check that the page opens and exists
    def __is_response_ok__(self, url):
        try:
            response = requests.get(url)  # fetch the given URL
        except requests.RequestException:  # connection error, timeout, etc.
            response = None
        # status code 200 means OK; no problems with the page
        if response is not None and response.status_code == 200:
            return True
        self.invalid_links_count += 1
        self.invalid_links_list.append(url)
        return False

    # check that the URL serves an HTML page
    def __is_html_page__(self, url):
        response = requests.head(url)
        # headers.get avoids a KeyError when the header is missing
        if "text/html" in response.headers.get("content-type", ""):
            return True
        self.invalid_links_count += 1
        self.invalid_links_list.append(url)
        return False

    # a URL is good if it is in-domain, uses HTTP(S), responds with 200, and serves HTML
    def __is_url_good__(self, url):
        return self.__in_domain__(url) and self.__check_protocol__(url) and self.__is_response_ok__(url) and self.__is_html_page__(url)

    # collect all links on a given page
    def __get_page_links__(self, page):
        # list of all <a> tags on the page
        links_tags = page.find_all("a")
        # go through each link
        for link in links_tags:
            link_href = link.get("href")  # the tag's href, e.g. <a href="page.html"> -> page.html
            # skip missing hrefs and bare fragment links, and anything already listed
            if link_href and link_href != "#" and not self.__is_link_listed__(link_href):
                # add the link to the list of links to open
                self.__links_to_open.append(link_href)

    # open a page and parse its content
    def __open_url__(self, url):
        response = requests.get(url)  # fetch the given URL
        content = BeautifulSoup(response.content, "html.parser")
        self.__get_page_links__(content)  # collect the links found in the content

    # main spider function; builds the spider's web
    def create_web(self):
        self.__open_url__(self.base_url)  # open and scan the main URL first
        # while there are still links to open and the page limit is not reached
        while len(self.__links_to_open) != 0 and self.__crawled_max__():
            # start from the last link in the list
            link = self.__links_to_open[-1]
            url = urllib.parse.urljoin(self.base_url, link)  # join the main URL with the page link
            if self.__is_url_good__(url):  # URL is valid and working
                self.__open_url__(url)  # open the page
            self.opened_links.append(url)  # record the link as opened
            self.__links_to_open.remove(link)  # drop it from the list of links to open

    # print all links of the website
    def print_all_links(self):
        for link in self.opened_links:
            print(link)

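# The constructor stores login_url and credentials, but the class never uses them.
# Below is a minimal sketch of how a login step could look, assuming the target
# site accepts a plain form POST; the field names "username" and "password" are
# hypothetical and must match the site's real login form.
def make_logged_in_session(login_url, user, password):
    session = requests.Session()  # a Session keeps cookies across requests
    session.post(login_url, data={"username": user, "password": password})
    return session
# A Spider variant could then call session.get(url) instead of requests.get(url)
# so that pages are fetched with the logged-in cookies.
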
url = "http://google.com/"
crawl = Spider(url)
crawl.create_web()
print(crawl.total_links())
crawl.invalid_links()  # prints its own report; wrapping it in print() would print None
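
# A bounded run is often safer: max_pages caps how many pages the spider opens.
# A sketch; "http://example.com/" is a placeholder for a site you may crawl.
bounded = Spider("http://example.com/", max_pages=10)
bounded.create_web()
bounded.print_all_links()  # every page the spider opened
bounded.invalid_links()  # report of any broken links it hit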