skip420

linkscraper

Feb 21st, 2022 (edited)
# python3 linkscraper.py

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue

        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls

# number of URLs visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

if __name__ == "__main__":
    max_urls = 30
    crawl("https://myip.ms/browse/comp_ip/1-294/cityID/170#1-294", max_urls=max_urls)
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", total_urls_visited)
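A minimal usage sketch, assuming the paste above is saved as linkscraper.py in the same directory. The target URL, the 50-page limit, and the output filenames are illustrative placeholders, not part of the original paste; it only relies on the `crawl()` function and the `internal_urls` / `external_urls` sets described in the docstrings.

# usage_example.py -- hypothetical driver script (run alongside linkscraper.py)
# URL, limit, and filenames below are assumptions for illustration only.
from linkscraper import crawl, internal_urls, external_urls

if __name__ == "__main__":
    # crawl a site of your choice, visiting at most 50 pages
    crawl("https://example.com", max_urls=50)

    # the crawler accumulates results in the module-level sets,
    # so they can be dumped to files after the crawl finishes
    with open("internal_links.txt", "w") as f:
        f.write("\n".join(sorted(internal_urls)))
    with open("external_links.txt", "w") as f:
        f.write("\n".join(sorted(external_urls)))

    print("[+] Saved", len(internal_urls), "internal and",
          len(external_urls), "external links")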