Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import requests
- from bs4 import BeautifulSoup
# Root of the HTTP directory listing where the crawl starts.
base_url = "http://ral-rdgbuild-03.itronhdc.com/OWI_Builds/"

# Work files shared by the functions below.
# folders: queue of directory URLs still to be crawled (one per line).
folders = "/home/lltestuser/scripts/get_images/folders.txt"
# files: accumulated list of file URLs discovered so far.
files = "/home/lltestuser/scripts/get_images/files.txt"
# temp: raw links collected during the current crawl pass.
temp = "/home/lltestuser/scripts/get_images/temp.tmp"
# error_report: only ever deleted by delete_files(); nothing visible here
# writes to it.
error_report = "/home/lltestuser/scripts/get_images/errors.log"

# Clear the terminal: the shell tries `cls` and falls back to `clear` if
# that command fails.
os.system("cls||clear")
def delete_files(paths=None):
    """Remove the crawler's work files, ignoring any that do not exist.

    Args:
        paths: Optional iterable of file paths to remove. When omitted, the
            module-level ``folders``, ``files``, ``temp`` and
            ``error_report`` paths are used.

    Raises:
        OSError: If a path exists but cannot be removed (for example it is
            a directory or the process lacks permission).
    """
    if paths is None:
        paths = (folders, files, temp, error_report)

    for src_file in paths:
        # Attempt the removal directly rather than testing os.path.exists()
        # first: a check-then-remove sequence can still fail if the file
        # disappears between the two calls.
        try:
            os.remove(src_file)
        except FileNotFoundError:
            pass
def crawl_pages():
    """Fetch every queued folder URL and dump the links found into ``temp``.

    Reads the URLs listed (one per line) in the ``folders`` file, downloads
    each page, and appends one line per ``<a href=...>`` to the ``temp``
    file, formed by joining the page URL with the href. The ``folders``
    file is deleted afterwards so the next pass can rebuild it.

    Raises:
        FileNotFoundError: If the ``folders`` file does not exist.
        requests.RequestException: If a page cannot be fetched.
    """
    with open(folders, "r") as fol:
        fol_lines = fol.readlines()

    for url in fol_lines:
        page_url = url.strip()
        if not page_url:
            # A blank line would make requests.get() fail on an empty URL.
            continue

        print("---> Currently Scraping : {} <---".format(page_url))
        reqs = requests.get(page_url)
        soup = BeautifulSoup(reqs.text, 'html.parser')

        # Put all results into the "temp" file; the context manager makes
        # sure the handle is closed even if a write fails.
        with open(temp, "a") as tmp:
            for link in soup.find_all('a'):
                href = link.get('href')
                if href is None:
                    # Anchors without an href (e.g. named anchors) have
                    # nothing to follow and would break os.path.join().
                    continue
                # NOTE(review): os.path.join() is not URL-aware; an href
                # starting with "/" replaces the whole base and so loses the
                # host. Kept as in the original; switching to
                # urllib.parse.urljoin would change which links survive the
                # later host filter.
                tmp.write("{}\n".format(os.path.join(page_url, href)))

    os.remove(folders)
def move_tmp():
    """Sort the links collected in ``temp`` into folder and file lists.

    Each line of the ``temp`` file is either discarded (it contains ``?``,
    is shorter than 7 characters including its newline, or does not contain
    the build-server host), written to a freshly created ``folders`` file
    when it ends with ``/``, or appended to the ``files`` file otherwise.
    The ``temp`` file is removed afterwards.

    Raises:
        FileNotFoundError: If the ``temp`` file does not exist.
        SystemExit: If no folder link was found, meaning there is nothing
            left to crawl. The ``files`` list is left on disk.
    """
    found_folder = False

    # Context managers guarantee both output handles are closed even if
    # reading or classifying the temp lines raises.
    with open(folders, "w") as fol, open(files, "a") as fil:
        with open(temp, "r") as tmp:
            tmp_lines = tmp.readlines()

        for line in tmp_lines:
            if ("?" in line) or (len(line) < 7) or ("http://ral-rdgbuild-03.itronhdc.com" not in line):
                print("Omitting line : {}".format(line.strip()))
                continue

            t_str = line.replace("\n", "")
            if t_str.endswith("/"):
                print("[FOLDER] {}".format(t_str))
                fol.write(line)
                found_folder = True
            else:
                print("[FILE] {}".format(t_str))
                fil.write(line)

    os.remove(temp)

    # An empty folder queue means the crawl is complete. Tracking this while
    # writing avoids re-opening the file that was just written.
    if not found_folder:
        print("REACHED ZERO FOLDER LENGTH")
        # Raise SystemExit directly: the exit() helper is injected by the
        # `site` module and is not guaranteed to exist in every interpreter
        # setup.
        raise SystemExit
# Start from a clean slate, then seed the folder queue with the root URL.
delete_files()
with open(folders, "w") as f:
    f.write(base_url)

# Alternate between crawling the queued folders and sorting the links that
# were found. The loop has no break; it ends when move_tmp() terminates the
# interpreter once no further folders are discovered.
while True:
    crawl_pages()
    move_tmp()

# NOTE(review): not reached as written, because the loop above never breaks.
print("REACHED THE END OF THE SCRIPT")
Add Comment
Please, Sign In to add comment