skip420

linkgrab

Aug 27th, 2021
# Crawl a site and save all discovered links into .csv files
# Usage: python3 linkgrab.py https://www.gab.com


import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # empty or missing href attribute
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Maximum number of URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    print("[+] Total crawled URLs:", total_urls_visited)

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.csv", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.csv", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
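The script writes one URL per line into <domain>_internal_links.csv and <domain>_external_links.csv in the current directory. Below is a minimal sketch for loading those results back in afterwards; the domain name used for the filenames is hypothetical and should match whatever site you actually crawled.

# read_links.py - load the link lists produced by linkgrab.py
# Assumes a crawl already ran against https://example.com, so the
# example.com_*.csv filenames below are illustrative only.
from pathlib import Path

domain = "example.com"  # hypothetical; replace with the domain you crawled

internal = Path(f"{domain}_internal_links.csv").read_text().splitlines()
external = Path(f"{domain}_external_links.csv").read_text().splitlines()

print(f"Loaded {len(internal)} internal and {len(external)} external links")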