from bs4 import BeautifulSoup
import sys, os
import httplib2, urllib, time

http = httplib2.Http()

#Where to put the images
images_path = "/var/www/4chanbanners/images/"

#Only run if exactly IMAGE_COUNT and MAX_RETRIES were given,
#otherwise the int() conversions below would crash before the usage message prints
if len(sys.argv) == 3:
    #How many new images we still want
    count = int(sys.argv[1])

    #The script would run forever if it never found the specified amount of images.
    #That happens if it already has them all, or if the random banner rotation keeps
    #serving ones we already have, so we cap the number of retries before aborting.
    retries = 0
    max_retries = int(sys.argv[2])

    while count > 0 and retries <= max_retries:
        #Sleep a bit, we don't want to scrape too hard
        time.sleep(3)

        #Get the HTML of the /b/ board
        resp, content = http.request('http://boards.4chan.org/b/')

        #Get the src attribute of the first img tag, which is the banner
        src = BeautifulSoup(content).img["src"]

        #And get the filename for duplicate checking
        filename = src.split('/')[-1].split('#')[0].split('?')[0]

        #If the file exists, try again and add 1 to our retries
        if os.path.exists(images_path + filename):
            retries += 1
            print "File " + filename + " already exists!"
        #If the file does not exist, download it and reset the retries
        else:
            urllib.urlretrieve("http:" + src, images_path + filename)
            print "Got " + filename
            retries = 0
            count -= 1
else:
    print "Usage: parse.py IMAGE_COUNT MAX_RETRIES"
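For reference, the duplicate check keys on the last path segment of the banner's src URL, with any #fragment or ?query stripped. A minimal sketch of that step in isolation, using a made-up protocol-relative URL (real banner paths will differ):

#Hypothetical banner src as it might appear in the first img tag
src = "//static.4chan.org/image/title/banner123.jpg?v=2"

#Same extraction the script uses: last path segment, minus fragment and query
filename = src.split('/')[-1].split('#')[0].split('?')[0]
print filename   #banner123.jpg

An example invocation: python parse.py 10 20 tries to save 10 new banners and gives up once consecutive duplicate hits exceed 20.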