Guest User

How to Download All PDFs on a Webpage with a Python Script

a guest
Jul 14th, 2015
1,576
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1.  
  2. import urlparse
  3. import urllib2
  4. import os
  5. import sys
  6.  
  7. try:
  8.     from bs4 import BeautifulSoup
  9. except ImportError:
  10.     print "[*] Please download and install Beautiful Soup first!"
  11.     sys.exit(0)
  12.  
  13. url = raw_input("[+] Enter the url: ")
  14. download_path = raw_input("[+] Enter the download path in full: ")
  15.  
  16. try:
  17.     #to make it look legit for the url
  18.     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
  19.  
  20.     i = 0
  21.  
  22.     request = urllib2.Request(url, None, headers)
  23.     html = urllib2.urlopen(request)
  24.     soup = BeautifulSoup(html.read()) #to parse the website
  25.  
  26.     for tag in soup.findAll('a', href=True): #find <a> tags with href in it so you know it is for urls
  27.         #so that if it doesn't contain the full url it can the url itself to it for the download
  28.         tag['href'] = urlparse.urljoin(url, tag['href'])
  29.  
  30.         #this is pretty easy we are getting the extension (splitext) from the last name of the full url(basename)
  31.         #the spiltext splits it into the filename and the extension so the [1] is for the second part(the extension)
  32.         if os.path.splitext(os.path.basename(tag['href']))[1] == '.pdf':
  33.             current = urllib2.urlopen(tag['href'])
  34.             print "\n[*] Downloading: %s" %(os.path.basename(tag['href']))
  35.  
  36.             f = open(download_path + "\\" +os.path.basename(tag['href'], "wb"))
  37.             f.write(current.read())
  38.             f.close()
  39.             i+=1
  40.  
  41.     print "\n[*] Downloaded %d files" %(i+1)
  42.     raw_input("[+] Press any key to exit...")
  43.  
  44. except KeyboardInterrupt:
  45.     print "[*] Exiting..."
  46.     sys.exit(1)
  47.  
  48. except URLError as e:
  49.     print "[*] Could not get information from server!!"
  50.     sys.exit(2)
  51.  
  52. except:
  53.     print "I don't know the problem but sorry!!"
  54.     sys.exit(3)
RAW Paste Data