SHOW:
|
|
- or go back to the newest paste.
1 | - | #Que os follen a todos corruptos hijos de la gran puta |
1 | + | #Que os follen a todos hijos de la gran puta |
2 | ||
from urllib.error import URLError
from urllib.request import HTTPError
from urllib.request import urlopen
5 | ||
# Site every path found in robots.txt is relative to.
BASE_URL = "http://boe.es"


def _pdf_target(line):
    """Return ``(url, name)`` for a robots.txt line naming a PDF, else ``None``.

    The second whitespace-separated token of the line is taken as the
    site-relative path. ``name`` is the last path component with everything
    from the first ``.pdf`` onwards removed. Files named ``SUM`` get path
    components 3, 4 and 5 appended (presumably the year/month/day of the
    issue -- confirm against the real robots.txt) so that they do not
    overwrite each other on disk.
    """
    if ".pdf" not in line:
        return None

    parts = line.split()
    if len(parts) < 2:
        # ".pdf" appeared but there is no path token to download.
        return None

    path = parts[1]
    name_tree = path.split(".pdf")[0].split("/")
    name = name_tree[-1]

    # Only add the extra components when the path is deep enough to have them.
    if name == "SUM" and len(name_tree) > 5:
        name = "_".join([name] + name_tree[3:6])

    return BASE_URL + path, name


def main(robots_path="robots.txt", out_dir="pdfs", fetch=urlopen):
    """Download every PDF referenced in a robots.txt file and print a summary.

    Args:
        robots_path: Path of the robots.txt file to scan.
        out_dir: Existing directory the PDFs are written into.
        fetch: Callable taking a URL and returning a context manager with a
            ``read()`` method; defaults to ``urllib.request.urlopen``.

    Returns:
        A ``(files_total, files_missing)`` tuple: the number of files that
        were downloaded and the list of file names that could not be fetched.

    Raises:
        OSError: If ``robots_path`` cannot be read or a PDF cannot be written.
    """
    print("START!")

    files_missing = []
    files_total = 0

    # The context manager closes robots.txt even if a download blows up.
    with open(robots_path, "r") as robots_file:
        for line in robots_file:
            target = _pdf_target(line)
            if target is None:
                continue
            url, name = target
            filename = name + ".pdf"

            print("downloading " + filename + " ... ")

            try:
                # Close the HTTP response as soon as the body has been read.
                with fetch(url) as response:
                    data = response.read()
            except HTTPError:
                # HTTPError subclasses URLError, so it must be handled first.
                print("Can't download " + filename + " \n")
                files_missing.append(filename)
            except URLError:
                print("Can't download " + filename + " \n")
                print("Bad url " + url + " ¿? \n")
                files_missing.append(filename)
            else:
                with open(out_dir + "/" + filename, "wb") as pdf_file:
                    pdf_file.write(data)
                print("done! \n")
                files_total += 1

    print("Total Download = " + str(files_total))
    print("Files not found = " + str(len(files_missing)))
    print("Files missing are...")
    for element in files_missing:
        print(element)

    return files_total, files_missing


if __name__ == "__main__":
    main()