ade_talon

Caveman Web Crawler

Apr 15th, 2022 (edited)
import os

import requests
from bs4 import BeautifulSoup


os.system("cls||clear")  # clear the terminal on Windows or Linux

# Scratch files: folders.txt is the crawl frontier, files.txt collects the
# discovered file URLs, temp.tmp holds the raw links scraped from each page.
folders = "/home/lltestuser/scripts/get_images/folders.txt"
files = "/home/lltestuser/scripts/get_images/files.txt"
temp = "/home/lltestuser/scripts/get_images/temp.tmp"
error_report = "/home/lltestuser/scripts/get_images/errors.log"
base_url = "http://ral-rdgbuild-03.itronhdc.com/OWI_Builds/"


def delete_files():
    """Remove any leftover working files from a previous run."""
    src_files = (folders, files, temp, error_report)
    for src_file in src_files:
        if os.path.exists(src_file):
            os.remove(src_file)


def crawl_pages():
    """Fetch every URL listed in the folders file and dump all the links
    found on those pages into the temp file."""
    with open(folders, "r") as fol:
        fol_lines = fol.readlines()

    for url in fol_lines:
        url = url.strip()
        print("---> Currently Scraping : {} <---".format(url))

        reqs = requests.get(url)
        soup = BeautifulSoup(reqs.text, "html.parser")

        # Append every link on the page to the temp file. os.path.join is
        # kept from the original: it drops the base when an href is an
        # absolute path, and move_tmp() filters those lines out later.
        with open(temp, "a") as tmp:
            for link in soup.find_all("a"):
                href = link.get("href")
                if href is None:  # anchor tags without an href attribute
                    continue
                tmp.write("{}\n".format(os.path.join(url, href)))

    os.remove(folders)


def move_tmp():
    """Sort the scraped links: folder URLs go back into the folders file to
    be crawled on the next pass, file URLs are appended to the files file."""
    fol = open(folders, "w")
    fil = open(files, "a")

    with open(temp, "r") as tmp:
        tmp_lines = tmp.readlines()

    for line in tmp_lines:
        # Skip query-string links, short junk lines and anything off-host.
        if ("?" in line) or (len(line) < 7) or ("http://ral-rdgbuild-03.itronhdc.com" not in line):
            print("Omitting line : {}".format(line.strip()))
        else:
            t_str = line.replace("\n", "")
            if t_str[-1] == "/":
                print("[FOLDER] {}".format(t_str))
                fol.write(line)
            else:
                print("[FILE] {}".format(t_str))
                fil.write(line)

    fol.close()
    fil.close()

    os.remove(temp)

    # Stop once no new folders were found; without this check the main loop
    # would spin forever on an empty frontier.
    fol_len = []
    if os.path.exists(folders):
        with open(folders, "r") as fol:
            fol_len = fol.readlines()

    if len(fol_len) == 0:
        # delete_files()
        print("REACHED ZERO FOLDER LENGTH")
        exit()


delete_files()

# Seed the crawl frontier with the base URL.
with open(folders, "w") as f:
    f.write(base_url)

# Alternate crawl/sort passes until move_tmp() finds no new folders and
# exits the script.
while True:
    crawl_pages()
    move_tmp()

print("REACHED THE END OF THE SCRIPT")
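
For reference, here is a minimal in-memory sketch of the same breadth-first directory crawl that the script drives with folders.txt, temp.tmp and files.txt. It is illustrative only: the seen set, the urljoin() call and the startswith() scope check are assumptions added to keep the sketch from revisiting folders, not something the original script does, and the function and variable names are made up for the example.

from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://ral-rdgbuild-03.itronhdc.com/OWI_Builds/"
HOST = "http://ral-rdgbuild-03.itronhdc.com"


def crawl(start_url):
    frontier = deque([start_url])  # folders still to visit (folders.txt)
    seen = set()                   # guard against re-crawling a folder
    file_urls = []                 # discovered file links (files.txt)

    while frontier:
        url = frontier.popleft()
        if url in seen:
            continue
        seen.add(url)

        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.find_all("a"):
            href = link.get("href")
            if not href or "?" in href:  # skip sort/query links
                continue
            target = urljoin(url, href)
            if HOST not in target or not target.startswith(start_url):
                continue                 # stay under the start folder
            if target.endswith("/"):
                frontier.append(target)  # another folder to walk
            else:
                file_urls.append(target)  # a downloadable file

    return file_urls


if __name__ == "__main__":
    for found in crawl(BASE_URL):
        print(found)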