from bs4 import BeautifulSoup
import sys, os
import httplib2, urllib, time

http = httplib2.Http()

if len(sys.argv) == 3:
	#How many new banner images to fetch
	count = int(sys.argv[1])

	#The script would run forever if it couldn't find the specified amount of images.
	#That happens if it already has all images, or if the random generator does not give a new one,
	#so we define the maximum number of retries the script makes before aborting.
	retries = 0
	max_retries = int(sys.argv[2])

	#Where to put the images
	images_path = "/var/www/4chanbanners/images/"

	while count > 0 and retries <= max_retries:
		#Sleep a bit, we don't want to scrape too hard
		time.sleep(3)

		#Get the html of the /b/ board
		resp, content = http.request('http://boards.4chan.org/b/')

		#Get the src attribute of the first img tag, which is the banner
		banner_src = BeautifulSoup(content, "html.parser").img["src"]

		#And get the filename for duplicate checking
		filename = banner_src.split('/')[-1].split('#')[0].split('?')[0]

		#If the file already exists, try again and add 1 to our retries
		if os.path.exists(images_path + filename):
			retries += 1
			print "File " + filename + " already exists!"
		#If the file does not exist, download it and reset the retries
		else:
			urllib.urlretrieve("http:" + banner_src, images_path + filename)
			print "Got " + filename
			retries = 0
			count -= 1
else:
	print "Usage: parse.py IMAGE_COUNT MAX_RETRIES"