Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Collect 4chan banner images by repeatedly scraping the /b/ board.

Usage: parse.py IMAGE_COUNT MAX_RETRIES

IMAGE_COUNT is the number of new banners to download. MAX_RETRIES is the
number of consecutive unsuccessful attempts (banner already on disk, or no
banner found) tolerated before the script gives up.
"""
import os
import sys
import time
import urllib

try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

import httplib2
from bs4 import BeautifulSoup

# Where to put the images
images_path = "/var/www/4chanbanners/images/"

BOARD_URL = 'http://boards.4chan.org/b/'
USAGE = "Usage: parse.py IMAGE_COUNT MAX_RETRIES"


def _filename_from_url(url):
    """Return the last path component of *url* without fragment or query."""
    return url.split('/')[-1].split('#')[0].split('?')[0]


def main(argv):
    """Download banners until IMAGE_COUNT new ones are saved.

    argv is the full argument vector (program name, IMAGE_COUNT,
    MAX_RETRIES). The usage message is printed when the argument count is
    wrong or either value is not an integer.
    """
    # Validate before indexing argv, so a bad call prints the usage text
    # instead of dying with an IndexError/ValueError.
    if len(argv) != 3:
        print(USAGE)
        return
    try:
        count = int(argv[1])
        max_retries = int(argv[2])
    except ValueError:
        print(USAGE)
        return

    http = httplib2.Http()

    # The script would run forever if it could not find the requested number
    # of images. That happens if it already has all images, or if the random
    # banner rotation keeps serving known ones, so consecutive failures are
    # capped by max_retries.
    retries = 0
    while count > 0 and retries <= max_retries:
        # Sleep a bit, we don't want to scrape too hard
        time.sleep(3)
        # Get the html of the /b/ board
        status, response = http.request(BOARD_URL)
        # The first img tag on the page is the banner
        banner = BeautifulSoup(response).img
        banner_url = banner.get("src") if banner is not None else None
        if not banner_url:
            # No usable banner (e.g. an error page): count it as a failed
            # attempt rather than crashing on a missing tag.
            retries += 1
            print("No banner image found on the page!")
            continue
        # Get the filename for duplicate checking
        filename = _filename_from_url(banner_url)
        target = images_path + filename
        if os.path.exists(target):
            # Already downloaded: try again and add 1 to our retries
            retries += 1
            print("File " + filename + " already exists!")
        else:
            # New banner: download it and reset the retries. The src is
            # protocol-relative, hence the "http:" prefix.
            urlretrieve("http:" + banner_url, target)
            print("Got " + filename)
            retries = 0
            count -= 1


if __name__ == "__main__":
    main(sys.argv)
Advertisement
Add Comment
Please, Sign In to add comment