Guest
Public paste!

Parker

By: a guest | Nov 28th, 2009 | Syntax: None | Size: 2.52 KB | Hits: 493 | Expires: Never
Copy text to clipboard
  1. # gallery2 scraper
  2. # by D. Parker Phinney
  3. # http://madebyparker.com
  4. # i@madebyparker.com
  5. # no rights reserved.  sharing is caring.
  6.  
  7. import os
  8. import urllib2
  9. import urllib
  10. import BeautifulSoup
  11. import re
  12.  
  13. BASE_URL='http://madebyparker.com/gallery/'
  14. #where we'll dump our scrape
  15. #should be relative to the working directory
  16. #no trailing slash
  17. SCRAPE_DEST='images'
  18.  
  19. def mkdir(dirname):
  20.         if not os.path.isdir("./" + dirname + "/"):
  21.                 os.mkdir("./" + dirname + "/")
  22.  
  23.  
  24. def gallery_importantHTMLsoup(url):
  25.         fd = urllib2.urlopen(url)
  26.         response = fd.read()
  27.         soup = BeautifulSoup.BeautifulSoup(response)
  28.         #grab everything in the sort of main part of the page
  29.         #gallery_contents = soup('div', "gsContentAlbum")[0]
  30.         gallery_contents = soup('div', id="gsContent")[0]
  31.         return gallery_contents
  32.  
  33. def image_importantHTMLsoup(url):
  34.         fd = urllib2.urlopen(url)
  35.         response = fd.read()
  36.         soup = BeautifulSoup.BeautifulSoup(response)
  37.         #grab everything in the sort of main part of the page
  38.         gallery_contents = soup('div', "gsContentPhoto")[0]
  39.         return gallery_contents
  40.  
  41.  
  42.  
  43. def scrape_gallery():
  44.         mkdir(SCRAPE_DEST)
  45.         #splash page with list of galleries
  46.         gallery_contents = gallery_importantHTMLsoup(BASE_URL + 'main.php')
  47.         gallery_thumbs = gallery_contents('img', "ImageFrame_none giThumbnail")
  48.         print gallery_thumbs
  49.         i = 1;
  50.         #descending into each gallery...
  51.         for outerThumbImg in gallery_thumbs:
  52.                 innerGalleryURL = outerThumbImg.parent["href"]
  53.                 innerGalleryName = outerThumbImg.parent.parent.parent('p', "giTitle")[0].string
  54.                 mkdir(SCRAPE_DEST + '/' + innerGalleryName)
  55.                 there_is_another_page = 1
  56.                 #iterating through pages...
  57.                 while there_is_another_page:
  58.                         innerGallerySoup = gallery_importantHTMLsoup(BASE_URL + innerGalleryURL)
  59.                         innerGallery_thumbs = innerGallerySoup('img', "ImageFrame_none giThumbnail")
  60.                        
  61.                         #going into each image's page...
  62.                         for innerThumbImg in innerGallery_thumbs:
  63.                                 ImgPageSoup = image_importantHTMLsoup(BASE_URL + innerThumbImg.parent["href"])
  64.                                 #grab the url for the full size image
  65.                                 theImgURL = ImgPageSoup('a', title="Full Size")[0]['href']
  66.                                 theImgDesc = ImgPageSoup('img', "ImageFrame_none")[0]['alt']
  67.  
  68.                                 print theImgDesc
  69.                                 urllib.urlretrieve(BASE_URL + theImgURL, SCRAPE_DEST + '/' + innerGalleryName + '/' + theImgDesc)
  70.                                 print "done"
  71.                                 i+=1
  72.                         nextLinks = innerGallerySoup('a', title="Next")
  73.                         if (len(nextLinks) >= 1):
  74.                                 innerGalleryURL = nextLinks[0]['href']
  75.                         else:
  76.                                 there_is_another_page = 0
  77.         return
  78.  
  79. if __name__ == '__main__':
  80.     scrape_gallery()