Advertisement
Guest User

Robots BOE extractor

a guest
Jul 1st, 2014
264
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.38 KB | None | 0 0
  1. #Que os follen a todos corruptos hijos de la gran puta
  2.  
  3. from urllib.request import urlopen
  4. from urllib.request import HTTPError
  5.  
  6. print("START!")
  7.  
  8. robots_file = open("robots.txt",'r')
  9.  
  10. files_missing = []
  11. files_total = 0
  12.  
  13. #For every line in robot
  14. for line in robots_file:
  15.  
  16.     #If it is a PDF file
  17.     if ".pdf" in line:
  18.  
  19.         #Get the URL and name
  20.         parts = line.split()
  21.         url = 'http://boe.es'+parts[1]
  22.         name_tree = parts[1].split('.pdf')[0].split('/')
  23.         name = name_tree[-1]
  24.  
  25.         #If is a SUM.pdf file, add also the date
  26.         if name=="SUM":
  27.             name = name + "_" + name_tree[3] + "_" + name_tree[4]+ "_" + name_tree[5]
  28.        
  29.         #Show what are you downloading
  30.         print("downloading "+name+".pdf ... ")
  31.  
  32.         #Download it
  33.         try:
  34.             f = urlopen(url)    
  35.             data = f.read()
  36.             with open("pdfs/"+name+".pdf", "wb") as code:
  37.                 code.write(data)
  38.         except HTTPError:
  39.             print("Can't download"+name+".pdf \n")
  40.             files_missing.append(name+".pdf")
  41.         else:
  42.             print("done! \n")
  43.             files_total = files_total + 1
  44.        
  45.  
  46. print("Total Download = "+files_total)
  47. print("Files not found = "+files_missing.size())
  48. print("Files missing are...")
  49.  
  50. for element in files_missing:
  51.  
  52.     print(element)
  53.  
  54. robots_file.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement