Want more features on Pastebin? Sign Up, it's FREE!
Guest

Untitled

By: a guest on Aug 2nd, 2012  |  syntax: Python  |  size: 2.11 KB  |  views: 382  |  expires: Never
download  |  raw  |  embed  |  report abuse  |  print
Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
  1. import os
  2. import sys
  3. import urllib
  4. import urllib2
  5. import re
  6. import time
  7.  
  8. if not len(sys.argv) >= 3:
  9.     print "Missing parameters."
  10.     print "Usage:    python 4chan.py <url> <folder>"
  11.     sys.exit()
  12.  
  13. threadurl = sys.argv[1]
  14. subfolder = sys.argv[2]
  15.  
  16. exp_imgurl = re.compile('4chan\.org/\w+/src/\d+\.(?:jpg|gif|png|jpeg)')
  17. exp_picname = re.compile('\d+\.(?:jpg|gif|png|jpeg)')
  18.  
  19. ua = "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US; rv:1.9.1.4) Gecko/20091007 Firefox/3.5.4"
  20. head = {'User-agent': ua}
  21.  
  22. print "Thread %s going to folder %s" % (threadurl, subfolder)
  23.  
  24. print "Fetching html..."
  25.  
  26. req = urllib2.Request(threadurl, None, head)
  27. try:
  28.     response = urllib2.urlopen(req)
  29. except urllib2.HTTPError, e:
  30.     if errorcount < 1:
  31.         errorcount = 1
  32.         print "Request failed"
  33.         response = urllib2.urlopen(req)
  34. except urllib2.URLError, e:
  35.     if errorcount < 1:
  36.         errorcount = 1
  37.         print "Request failed"
  38.         response = urllib2.urlopen(req)
  39.  
  40. msg = response.read()
  41. errorcount = 0
  42.  
  43. print "Received %d bytes" % len(msg)
  44.  
  45. imgurls = exp_imgurl.findall(msg)
  46.  
  47. print "Found %d images" % len(imgurls)
  48.  
  49. if not os.path.exists(subfolder):
  50.     print "Folder %s does not exist. Creating..." % subfolder
  51.     os.makedirs(subfolder)
  52. else:
  53.     print "Folder %s exists. I will just put all files in there." % subfolder
  54.  
  55. totalnumber = len(list(set(imgurls)))
  56.  
  57. for i, img in enumerate(list(set(imgurls))):
  58.     source = "http://images."+str(img)
  59.     filename = exp_picname.findall(source)[0]
  60.     destination = os.path.join(subfolder, filename)
  61.     if not os.path.isfile(destination):
  62.         try:
  63.             print "Downloading %d/%d: %s" % (i+1, totalnumber, source)
  64.             urllib.urlretrieve(source, destination)
  65.             time.sleep(0.25) # why?
  66.         except urllib.ContentTooShortError:
  67.             print "Image download failed, retrying..."
  68.             time.sleep(1)
  69.             urllib.urlretrieve(source, destination)
  70.             time.sleep(0.5) # why?
  71.     else:
  72.         print "File %s exists. Skipping..." % str(filename)
  73.  
  74. print "Aaaaaaand we are done. See you next time."
clone this paste RAW Paste Data