import requests  # helps generate HTTP requests for web pages
from bs4 import BeautifulSoup  # helps parse HTML pages
import urllib.parse


class Spider:  # spider that collects the links of a website
    # called to create an instance of the class
    def __init__(self, base_url, max_pages=0, login_url=None, user=None, password=None):
        self.redirect_url = ""  # placeholder for a redirect target (not used below)
        self.base_url = base_url  # main url of the website
        self.__links_to_open = []  # links not yet opened
        self.opened_links = []  # links already processed
        self.total = 0  # total number of links processed so far
        self.max_pages = max_pages  # max pages to crawl (0 = no limit)
        self.invalid_links_count = 0  # number of broken links found
        self.invalid_links_list = []  # list of broken links found
        self.login_url = login_url  # login page url, if any (stored but not used below)
        self.credentials = [user, password]  # login credentials (stored but not used below)

    # get total number of links opened so far
    def total_links(self):
        self.total = len(self.opened_links)
        return self.total

    # True while crawling may continue (max pages not yet reached)
    def __crawled_max__(self):
        return (self.max_pages == 0) or (self.total_links() < self.max_pages)

    # is the link already listed (opened or queued)?
    def __is_link_listed__(self, link):
        url = urllib.parse.urljoin(self.base_url, link)
        return url in self.opened_links or link in self.__links_to_open

    # report broken links found while crawling
    def invalid_links(self):
        if self.invalid_links_count > 0:
            print("Found %s broken links:" % self.invalid_links_count)
            for link in self.invalid_links_list:
                print(link)
        else:
            print("No broken links found.")

    # check whether a given url belongs to the website,
    # i.e. is in the website's domain
    def __in_domain__(self, url):
        # the url belongs to the website if it starts with the base url
        return str(url).startswith(self.base_url)

    # check that the url uses a supported protocol
    def __check_protocol__(self, url):
        parsed = urllib.parse.urlparse(url)  # split the url into its components
        protocol = parsed.scheme.lower()  # the url scheme, e.g. "http"
        if protocol in ("http", "https"):
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check that the page opens and exists
    def __is_response_ok__(self, url):
        request = requests.get(url)  # fetch the given url
        # status code 200 means OK; no problems with the page
        if request.status_code == 200:
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # check that the url serves an HTML page
    def __is_html_page__(self, url):
        request = requests.head(url)
        if "text/html" in request.headers.get("content-type", ""):
            return True
        else:
            self.invalid_links_count += 1
            self.invalid_links_list.append(url)
            return False

    # run all url checks in order
    def __is_url_good__(self, url):
        return (self.__in_domain__(url) and self.__check_protocol__(url)
                and self.__is_response_ok__(url) and self.__is_html_page__(url))

    # collect all links in a given page
    def __get_page_links__(self, page):
        # get a list of all <a> tags in the page
        links_tags = page.find_all("a")
        # go through each link
        for link in links_tags:
            link_href = link.get("href")  # the <a> tag's href value, e.g. "page.html"
            # queue the link only if it exists, isn't blank and isn't already listed
            if link_href and link_href != "#" and not self.__is_link_listed__(link_href):
                self.__links_to_open.append(link_href)

    # open a page and parse its content
    def __open_url__(self, url):
        request = requests.get(url)  # fetch the page
        content = BeautifulSoup(request.content, "html.parser")
        self.__get_page_links__(content)  # extract the page's links

    # main spider function; builds the spider's web
    def create_web(self):
        self.__open_url__(self.base_url)  # open and parse the main url first
        # while there are still links to open and the page limit isn't reached
        while len(self.__links_to_open) != 0 and self.__crawled_max__():
            # start from the last link in the list
            link = self.__links_to_open[-1]
            url = urllib.parse.urljoin(self.base_url, link)  # join the main url with the page link
            if self.__is_url_good__(url):  # is the url valid and working?
                self.__open_url__(url)  # open the page
            self.opened_links.append(url)  # record the link as processed
            self.__links_to_open.remove(link)  # remove it from the list of links to open

    # print all links of the website
    def print_all_links(self):
        for link in self.opened_links:
            print(link)


url = "http://google.com/"
crawl = Spider(url)
crawl.create_web()

print(crawl.total_links())
crawl.invalid_links()
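
A minimal usage sketch, assuming the Spider class above; the site, the max_pages value and the call order are illustrative only, not part of the original paste:

# crawl at most 10 pages of a site, then report what was found
spider = Spider("https://example.com/", max_pages=10)
spider.create_web()
spider.print_all_links()  # every link the spider opened
spider.invalid_links()    # broken-link report (prints, returns nothing)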