Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Download every PDF listed in a local robots.txt into the pdfs/ directory.

Reads ``robots.txt`` line by line; each line that references a ``.pdf`` path
is fetched from ``http://boe.es`` and saved as ``pdfs/<name>.pdf``.  SUM.pdf
files would otherwise all share one name, so the date path segments from the
URL are appended to keep them unique.  A summary of downloaded and missing
files is printed at the end.
"""
from urllib.error import HTTPError, URLError  # URLError was used but never imported
from urllib.request import urlopen

print("START!")

files_missing = []  # names of the PDFs that could not be downloaded
files_total = 0     # count of successfully downloaded PDFs

# Context manager guarantees robots.txt is closed even on an unexpected error.
with open("robots.txt", "r") as robots_file:
    for line in robots_file:
        # Only lines that reference a PDF path are of interest.
        if ".pdf" not in line:
            continue
        # A robots.txt rule looks like "Disallow: /path/to/file.pdf";
        # the second whitespace-separated token is the URL path.
        parts = line.split()
        url = "http://boe.es" + parts[1]
        name_tree = parts[1].split(".pdf")[0].split("/")
        name = name_tree[-1]
        # SUM.pdf files get the date components (path segments 3-5,
        # presumably year/month/day) appended so filenames stay unique.
        if name == "SUM":
            name = name + "_" + name_tree[3] + "_" + name_tree[4] + "_" + name_tree[5]
        print("downloading " + name + ".pdf ... ")
        try:
            # Close the HTTP response deterministically instead of leaking it.
            with urlopen(url) as response:
                data = response.read()
            with open("pdfs/" + name + ".pdf", "wb") as code:
                code.write(data)
        except HTTPError:
            # HTTPError first: it is a subclass of URLError.
            print("Can't download" + name + ".pdf \n")
            files_missing.append(name + ".pdf")
        except URLError:
            print("Can't download" + name + ".pdf \n")
            print("Bad url " + url + " ¿? \n")
            files_missing.append(name + ".pdf")
        else:
            print("done! \n")
            files_total = files_total + 1

# str() conversions: concatenating an int to a str raises TypeError.
print("Total Download = " + str(files_total))
# Python lists have no .size() method; len() is the correct call.
print("Files not found = " + str(len(files_missing)))
print("Files missing are...")
for element in files_missing:
    print(element)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement