from bs4 import BeautifulSoup
import sys, os
import httplib2, urllib, time

http = httplib2.Http()

#Where to put the images
images_path = "/var/www/4chanbanners/images/"

#Only run if exactly IMAGE_COUNT and MAX_RETRIES were given,
#otherwise the int() conversions below would crash before the usage message prints
if len(sys.argv) == 3:
    #How many new images we still want
    count = int(sys.argv[1])

    #The script would run forever if it never found the specified amount of images.
    #That happens if it already has them all, or if the random banner rotation keeps
    #serving ones we already have, so we cap the number of retries before aborting.
    retries = 0
    max_retries = int(sys.argv[2])

    while count > 0 and retries <= max_retries:
        #Sleep a bit, we don't want to scrape too hard
        time.sleep(3)

        #Get the HTML of the /b/ board
        resp, content = http.request('http://boards.4chan.org/b/')

        #Get the src attribute of the first img tag, which is the banner
        src = BeautifulSoup(content).img["src"]

        #And get the filename for duplicate checking
        filename = src.split('/')[-1].split('#')[0].split('?')[0]

        #If the file exists, try again and add 1 to our retries
        if os.path.exists(images_path + filename):
            retries += 1
            print "File " + filename + " already exists!"
        #If the file does not exist, download it and reset the retries
        else:
            urllib.urlretrieve("http:" + src, images_path + filename)
            print "Got " + filename
            retries = 0
            count -= 1
else:
    print "Usage: parse.py IMAGE_COUNT MAX_RETRIES"
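For reference, the duplicate check keys on the last path segment of the banner's src URL, with any #fragment or ?query stripped. A minimal sketch of that step in isolation, using a made-up protocol-relative URL (real banner paths will differ):

#Hypothetical banner src as it might appear in the first img tag
src = "//static.4chan.org/image/title/banner123.jpg?v=2"

#Same extraction the script uses: last path segment, minus fragment and query
filename = src.split('/')[-1].split('#')[0].split('?')[0]
print filename   #banner123.jpg

An example invocation: python parse.py 10 20 tries to save 10 new banners and gives up once consecutive duplicate hits exceed 20.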