Recent Posts
None | 8 sec ago
None | 25 sec ago
None | 25 sec ago
None | 42 sec ago
None | 52 sec ago
None | 53 sec ago
Perl | 55 sec ago
HTML | 58 sec ago
C++ | 1 min ago
None | 1 min ago
Sitereport
Find cool info about any domain on the internet?
visit sitereport
Free Subdomains
Want a pastebin.com sub-domain for your community?
learn more...
What is pastebin?
Pastebin is a website that hosts all your text & code on dedicated servers for easy sharing.
learn more...
Learn a little bit about the new Pastebin.com on our help page. hide message
By Parker on the 28th of Nov 2009 09:50:38 PM Download | Raw | Embed | Report
  1. # gallery2 scraper
  2. # by D. Parker Phinney
  3. # http://madebyparker.com
  4. # i@madebyparker.com
  5. # no rights reserved.  sharing is caring.
  6.  
  7. import os
  8. import urllib2
  9. import urllib
  10. import BeautifulSoup
  11. import re
  12.  
  13. BASE_URL='http://madebyparker.com/gallery/'
  14. #where we'll dump our scrape
  15. #should be relative to the working directory
  16. #no trailing slash
  17. SCRAPE_DEST='images'
  18.  
  19. def mkdir(dirname):
  20.         if not os.path.isdir("./" + dirname + "/"):
  21.                 os.mkdir("./" + dirname + "/")
  22.  
  23.  
  24. def gallery_importantHTMLsoup(url):
  25.         fd = urllib2.urlopen(url)
  26.         response = fd.read()
  27.         soup = BeautifulSoup.BeautifulSoup(response)
  28.         #grab everything in the sort of main part of the page
  29.         #gallery_contents = soup('div', "gsContentAlbum")[0]
  30.         gallery_contents = soup('div', id="gsContent")[0]
  31.         return gallery_contents
  32.  
  33. def image_importantHTMLsoup(url):
  34.         fd = urllib2.urlopen(url)
  35.         response = fd.read()
  36.         soup = BeautifulSoup.BeautifulSoup(response)
  37.         #grab everything in the sort of main part of the page
  38.         gallery_contents = soup('div', "gsContentPhoto")[0]
  39.         return gallery_contents
  40.  
  41.  
  42.  
  43. def scrape_gallery():
  44.         mkdir(SCRAPE_DEST)
  45.         #splash page with list of galleries
  46.         gallery_contents = gallery_importantHTMLsoup(BASE_URL + 'main.php')
  47.         gallery_thumbs = gallery_contents('img', "ImageFrame_none giThumbnail")
  48.         print gallery_thumbs
  49.         i = 1;
  50.         #descending into each gallery...
  51.         for outerThumbImg in gallery_thumbs:
  52.                 innerGalleryURL = outerThumbImg.parent["href"]
  53.                 innerGalleryName = outerThumbImg.parent.parent.parent('p', "giTitle")[0].string
  54.                 mkdir(SCRAPE_DEST + '/' + innerGalleryName)
  55.                 there_is_another_page = 1
  56.                 #iterating through pages...
  57.                 while there_is_another_page:
  58.                         innerGallerySoup = gallery_importantHTMLsoup(BASE_URL + innerGalleryURL)
  59.                         innerGallery_thumbs = innerGallerySoup('img', "ImageFrame_none giThumbnail")
  60.                        
  61.                         #going into each image's page...
  62.                         for innerThumbImg in innerGallery_thumbs:
  63.                                 ImgPageSoup = image_importantHTMLsoup(BASE_URL + innerThumbImg.parent["href"])
  64.                                 #grab the url for the full size image
  65.                                 theImgURL = ImgPageSoup('a', title="Full Size")[0]['href']
  66.                                 theImgDesc = ImgPageSoup('img', "ImageFrame_none")[0]['alt']
  67.  
  68.                                 print theImgDesc
  69.                                 urllib.urlretrieve(BASE_URL + theImgURL, SCRAPE_DEST + '/' + innerGalleryName + '/' + theImgDesc)
  70.                                 print "done"
  71.                                 i+=1
  72.                         nextLinks = innerGallerySoup('a', title="Next")
  73.                         if (len(nextLinks) >= 1):
  74.                                 innerGalleryURL = nextLinks[0]['href']
  75.                         else:
  76.                                 there_is_another_page = 0
  77.         return
  78.  
  79. if __name__ == '__main__':
  80.     scrape_gallery()
Submit a correction or amendment below. Make A New Post
To highlight particular lines, prefix each line with @h@
Syntax highlighting:
Post expiration:
Post exposure:
Name / Title:
Email: