SHOW:
|
|
- or go back to the newest paste.
"""Collect 4chan banner images by repeatedly scraping the /b/ board page.

Usage: parse.py IMAGE_COUNT MAX_RETRIES

The board serves a randomly chosen banner on each request.  The script
fetches the page, takes the first <img> tag (the banner), and downloads
the image unless a file with the same name already exists in
``images_path``.
"""
import os
import sys
import time
import urllib

import httplib2
from bs4 import BeautifulSoup

try:  # Python 2 module layout
    from urllib import urlretrieve
    from urlparse import urljoin
except ImportError:  # Python 3 module layout
    from urllib.request import urlretrieve
    from urllib.parse import urljoin

# Page whose first <img> tag is the banner.
BOARD_URL = 'http://boards.4chan.org/b/'

# Where to put the images
images_path = "/var/www/4chanbanners/images/"

# Pause before every request so we don't scrape too hard.
REQUEST_DELAY_SECONDS = 3

USAGE = "Usage: parse.py IMAGE_COUNT MAX_RETRIES"


def banner_filename(src):
    """Return the file name part of *src*, without query string or fragment."""
    return src.split('/')[-1].split('#')[0].split('?')[0]


def find_banner_src(html):
    """Return the ``src`` of the first <img> tag in *html*, or None if absent."""
    img = BeautifulSoup(html, "html.parser").img
    if img is None:
        return None
    return img.get("src")


def main(argv):
    """Download up to IMAGE_COUNT new banners; return a process exit status.

    argv -- the full argument vector: [program, IMAGE_COUNT, MAX_RETRIES].

    The loop would run forever if it never found the requested number of
    new images (all banners already downloaded, or the random banner keeps
    repeating).  MAX_RETRIES bounds the number of consecutive unsuccessful
    attempts tolerated before giving up; every successful download resets
    the counter.

    Returns 2 on a usage error, 0 otherwise.
    """
    # Validate the arguments *before* touching them, so a wrong invocation
    # prints the usage line instead of raising IndexError/ValueError.
    if len(argv) != 3:
        print(USAGE)
        return 2
    try:
        count = int(argv[1])
        max_retries = int(argv[2])
    except ValueError:
        print(USAGE)
        return 2

    http = httplib2.Http()
    retries = 0

    while count > 0 and retries <= max_retries:
        time.sleep(REQUEST_DELAY_SECONDS)

        # httplib2 returns (response headers, body content).
        headers, body = http.request(BOARD_URL)

        src = find_banner_src(body) if headers.status == 200 else None
        filename = banner_filename(src) if src else ""

        # A failed request or a page without a usable banner counts as a
        # retry; otherwise a persistent failure would loop forever.
        if not filename:
            retries += 1
            print("Could not find a banner image on " + BOARD_URL)
            continue

        target = os.path.join(images_path, filename)

        if os.path.exists(target):
            # Duplicate: try again and count it against the retry budget.
            retries += 1
            print("File " + filename + " already exists!")
        else:
            # urljoin resolves protocol-relative ("//host/...") sources as
            # well as absolute and path-relative ones.
            urlretrieve(urljoin(BOARD_URL, src), target)
            print("Got " + filename)
            retries = 0
            count -= 1

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))