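# Scrapes every link in the first HTML table on the page at `url` and
# downloads each target into a per-title subdirectory under
# `downloadDirectory`, skipping anything that already exists on disk.
# {some_url} and {some_localpath} are placeholders; fill in the real base
# URL and download directory before running.

import os
import re
import time
import urllib.parse
from datetime import datetime
from urllib.error import HTTPError
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup
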
def scraper(url, downloadDirectory):
    start = time.time()
    baseUrl = r"{some_url}"
    html = urlopen(url)
    bsObj = BeautifulSoup(html.read(), "html.parser")
    table = bsObj.findAll("table")[0]
    links = table.findAll("a")
    count = 0
    broken_links = []
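    # Each anchor tag is stringified and split on double quotes, so the
    # href value (when present) lands at index 1 of the resulting list.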
    for link in links:
        try:
            count += 1
            link = str(link).split('"')
            if len(link) > 1:
                print(link)
                link = link[1]
                linkBreak = link.split("_")
            else:
                if link[0] == "<a></a>":
                    print("Skipping")
                    continue
                else:
                    print(link)
                    link = link[0]
                    linkBreak = link.split("_")
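            # The title is picked out of the stringified split by hard-coded
            # token position (index 9, or 10 when the first match is "nyc"),
            # which is specific to this site's href layout.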
            title = re.findall(r"[\w']+", str(linkBreak))[9].strip("'")
            if title == "nyc":
                title = re.findall(r"[\w']+", str(linkBreak))[10].strip("'")
            print("# " + str(count), "Title: " + str(title))
            dir_path = os.path.join(downloadDirectory, title)
            if not os.path.isdir(dir_path):
                print("Creating directory: " + str(dir_path))
                os.mkdir(dir_path)
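            # Resolve the (possibly relative) href against the base URL.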
            file_path = urllib.parse.urljoin(baseUrl, link)
            print("File Path: " + str(file_path), "\nDirectory Path: " + str(dir_path))
            print("Split array and length: ", linkBreak, len(linkBreak))
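            # The local filename depends on how many "_"-separated pieces the
            # href splits into; every arm below skips the download when the
            # target file is already on disk.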
            if len(linkBreak) == 1:
                fname = str(linkBreak[0]).split("/")[7]
                if os.path.isfile(os.path.join(dir_path, fname)):
                    print("Skipping")
                    continue
                else:
                    print("Result: " + str(os.path.join(dir_path, fname)))
                    urlretrieve(file_path, os.path.join(dir_path, fname))
            elif len(linkBreak) == 2:
                if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[1])):
                    print("Skipping")
                    continue
                elif not str(os.path.join(dir_path, title + "_" + linkBreak[1])).endswith(".zip"):
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip")):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip")))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip"))
                else:
                    print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[1])))
                    urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[1]))
            elif len(linkBreak) == 3:
                if "?" in linkBreak[2]:
                    # Drop any query string before building the filename.
                    linkBreak[2] = linkBreak[2].split("?", 1)[0]
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
                if title == "sidewalkcafe":
                    linkBreak[2] = str(linkBreak[1]) + str(linkBreak[2])
                    if os.path.isfile(os.path.join(dir_path, title + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + linkBreak[2]))
                else:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
            elif len(linkBreak) == 4:
                if "?" in linkBreak[3]:
                    linkBreak[3] = linkBreak[3].split("?", 1)[0]
                    linkBreak[2] = str(linkBreak[2]) + "_" + str(linkBreak[3])
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
                else:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
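        # A 404 is logged and recorded; any other HTTP error is re-raised.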
        except HTTPError as e:
            if e.code == 404:
                print(e)
                print(count, "__________")
                broken_links.append([count, title, link])
                continue
            else:
                raise
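    # Write the broken-link list (plus a timestamp) alongside the downloads.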
    end = time.time()
    fp = os.path.join(downloadDirectory, "BrokenLinks.txt")
    with open(fp, "w") as report:
        for broken in broken_links:
            report.write(str(broken) + "\n")
        report.write(str(datetime.now()))
    return "Script completed in: " + str(end - start) + " seconds."
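
# Fill in the placeholders below, then run the script; scraper() returns a
# timing summary string.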
archURL = "{some_url}"
archDownloadDirectory = "{some_localpath}"
print(scraper(archURL, archDownloadDirectory))