View difference between Paste ID: <a href="/tpmKJxxz">tpmKJxxz</a> and <a href="/qjXBWFzZ">qjXBWFzZ</a>

from bs4 import BeautifulSoup
1		from bs4 import BeautifulSoup
2		import sys,os
3		import httplib2, urllib,time
4
# Where to put the images
images_path = "/var/www/4chanbanners/images/"

# Page whose first <img> tag is the banner we want to collect
BOARD_URL = 'http://boards.4chan.org/b/'

# Pause between two requests, in seconds, so we don't scrape too hard
REQUEST_DELAY_SECONDS = 3

USAGE = "Usage: parse.py IMAGE_COUNT MAX_RETRIES"


def _banner_filename(src):
    """Return the last path component of *src* without fragment or query."""
    return src.split('/')[-1].split('#')[0].split('?')[0]


def _parse_args(argv):
    """Return ``(count, max_retries)`` from *argv*, or None if unusable.

    *argv* must hold exactly the script name plus two integer arguments.
    """
    if len(argv) != 3:
        return None
    try:
        return int(argv[1]), int(argv[2])
    except ValueError:
        return None


def main(argv):
    """Download up to IMAGE_COUNT banner images that are not stored yet.

    argv -- command line in ``sys.argv`` form:
            ``[script, IMAGE_COUNT, MAX_RETRIES]``.

    Each round fetches the board page, takes the ``src`` of its first
    ``<img>`` tag and saves that image under ``images_path`` unless a file
    with the same name already exists there.  The loop would run forever if
    it never found the requested number of new images (all of them already
    downloaded, or the page keeps serving known ones), so it stops once more
    than MAX_RETRIES consecutive rounds produced only known files.

    Prints the usage line and returns without touching the network when the
    arguments are missing, surplus or not integers.

    NOTE: targets Python 2 (``urllib.urlretrieve``); the print calls use the
    single-argument parenthesised form, which behaves the same there.
    """
    parsed = _parse_args(argv)
    if parsed is None:
        # Validate before use: indexing argv first would raise IndexError
        # and the usage text would never be shown.
        print(USAGE)
        return

    count, max_retries = parsed
    http = httplib2.Http()
    retries = 0

    # Honour the MAX_RETRIES argument instead of a hard-coded limit.
    while count > 0 and retries <= max_retries:
        time.sleep(REQUEST_DELAY_SECONDS)

        # Get the html of the /b/ board
        _headers, body = http.request(BOARD_URL)

        # The src attribute of the first img tag, which is the banner
        banner_src = BeautifulSoup(body).img["src"]

        # File name used for duplicate checking and as the download target
        filename = _banner_filename(banner_src)
        target = images_path + filename

        if os.path.exists(target):
            # Already have this one: count the miss and try again
            retries += 1
            print("File " + filename + " already exists!")
        else:
            # New image: the src has no scheme, so "http:" is prepended
            urllib.urlretrieve("http:" + banner_src, target)
            print("Got " + filename)
            retries = 0
            count -= 1


if __name__ == "__main__":
    main(sys.argv)