import requests
import tarfile
from lxml.html import parse
from sys import exit
import codecs
import os
def process_downloads():
    """
    Download every magazine linked from the net-security.org archive page
    and bundle the download directory into a .tar.gz archive.

    Files are stored in a ``netsec-mags`` directory below the current
    working directory; files that already exist there are skipped. As a
    side effect the process's working directory is changed to that
    directory. Progress and failures are reported on stdout.

    :return: None
    """
    url = "http://www.net-security.org/insecure-archive.php"
    html_tree = parse(url)
    # href of every anchor sitting inside an <h2 class="style12"> heading.
    pdf_links = html_tree.xpath("//h2[@class='style12']/a/@href")
    print("\nNumber of Mags to be downloaded : " + str(len(pdf_links)))

    dir_save_name = "netsec-mags"
    dir_save_path = os.path.join(os.getcwd(), dir_save_name)
    print("\nMags Will be saved at : " + dir_save_path)
    if not os.path.exists(dir_save_path):
        os.mkdir(dir_save_path)
    # Work from inside the download directory so relative paths resolve there.
    os.chdir(dir_save_path)

    failed_links = []
    for link in pdf_links:
        filename = link.rpartition('/')[2]
        target_path = os.path.join(dir_save_path, filename)
        if os.path.exists(target_path):
            print(filename + " already exists and therefore skipping.")
            continue

        print("\nCurrently downloading " + filename)
        try:
            # The timeout keeps a stalled connection from hanging the run.
            response = requests.get(link, timeout=60)
            # Without this check an HTTP error page would be saved under the
            # magazine's file name and then skipped as "already exists" on
            # every later run.
            response.raise_for_status()
        except requests.RequestException as err:
            print("Failed to download " + link + " : " + str(err))
            failed_links.append(link)
            continue

        with open(target_path, "wb") as f:
            f.write(response.content)

    if failed_links:
        print("\n" + str(len(failed_links)) + " mag(s) could not be downloaded.")
    else:
        print("\nAll mags have been downloaded.")

    print("\nNow adding all the files to an Archive")
    archive_loc = create_targz(dir_save_path, dir_save_name + ".tar.gz")
    if archive_loc is not None:
        print("\nArchive created successfully. Location :- " + archive_loc)
    else:
        print("Unable to create .tar.gz file. Report to Developer.")
def create_targz(dirpath, archivename):
    """
    Create a .tar.gz archive of a directory, stored inside that directory.

    :param dirpath: Directory to archive; the archive file is saved here too.
    :param archivename: File name of the archive (for example "mags.tar.gz").
    :return: Path of the created archive (``dirpath`` joined with
        ``archivename``), or None if the archive could not be created.
    """
    # Build the archive at the path that is returned, instead of relying on
    # the current working directory happening to be dirpath.
    archive_path = os.path.join(dirpath, archivename)
    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" and leave the archive without a top-level folder.
    top_level = os.path.basename(os.path.normpath(dirpath))
    try:
        with tarfile.open(archive_path, "w:gz") as tar:
            # CPython's tarfile skips the archive's own file while recursing,
            # so the archive does not end up containing itself.
            tar.add(dirpath, arcname=top_level)
    except (tarfile.TarError, OSError):
        # Missing directory, permission problems, etc. are reported the same
        # way as tar-level failures: the caller checks for None.
        return None
    return archive_path
if __name__ == "__main__":
    # Script entry point: run the download/archive job and pass whatever it
    # returns on to sys.exit as the process status.
    status = process_downloads()
    exit(status)