skip420

link_Scraper

Sep 18th, 2021
# python3 linkscrape.py -m 100  "https://insaneseeds.com"

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
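# For illustration: is_valid("https://example.com/page") is True, while is_valid("page.html")
# is False, since a bare relative path has neither a scheme nor a netloc.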


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
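        # e.g. "https://example.com/page?id=1#top" becomes "https://example.com/page"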
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
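        # the check below is a plain substring test: a URL counts as internal only if the
        # domain string appears anywhere in it, otherwise it is recorded as external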
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
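# For illustration, a single-page call such as get_all_website_links("https://example.com")
# prints every link found on that page and returns only the internal URLs it discovered.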


def crawl(url, max_urls=100):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 100.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
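# For illustration, a direct call such as crawl("https://example.com", max_urls=50) fills the
# `internal_urls` and `external_urls` sets and stops visiting new pages once roughly 50 have
# been crawled.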


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 100.", default=100, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", total_urls_visited)

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
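# Requirements (third-party packages): pip install requests beautifulsoup4 colorama
# The script writes its results to two files named after the crawled domain, e.g.
# insaneseeds.com_internal_links.txt and insaneseeds.com_external_links.txt, one URL per line.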