#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
    Scraper for Danbooru-like websites.
    Tested sites are:
        konachan.com
        danbooru.donmai.us
"""
import urllib
import xml.dom.minidom
import sys
import time
from optparse import OptionParser
# IPTC tagging is optional: without the TaggedImage helper module the
# images are still downloaded, just not tagged.
try:
    from TaggedImage import TaggedImage
except ImportError:
    __has_tagger__ = False
else:
    __has_tagger__ = True


# Seconds to sleep after each download (politeness delay).
WAIT_BETWEEN_PAGES = 1

def set_title(title):
    """
        Sets the terminal title via an xterm escape sequence.
    """
    sys.stdout.write("\x1b]2;" + title + "\x07")

def parse_xml_page(url):
    """
        Parses one XML index page and returns a list of
        (file_url, tags) tuples, one per post.
    """
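    # The index API of Danbooru-like sites returns XML shaped roughly like
    # this (only the attributes read below are shown; exact fields vary by
    # site and API version):
    #
    #   <posts>
    #     <post file_url="http://.../image.jpg" tags="tag_one tag_two"/>
    #     ...
    #   </posts>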
    try:
        f = urllib.urlopen(url)
        data = f.read()
        f.close()
        doc = xml.dom.minidom.parseString(data)
        urls = []
        posts = doc.getElementsByTagName("post")
        for post in posts:
            file_url = post.getAttribute("file_url")
            tags = post.getAttribute("tags")
            urls.append((file_url, tags))
        return urls
    except IOError as error:
        print error
        return []


def get_img_tuples(site_url, tags):
    """
        Downloads all index pages for a tag query, parses them and
        returns a list of (file_url, tags) tuples for every image found.
    """
    print "Getting image URLs..."
    set_title("Getting image URLs...")
    all_images = []
    page = 1
    while True:
        set_title("Parsing page " + str(page))
        url = "http://" + site_url + "/post/index.xml?page=" + str(page) + "&tags=" + tags + "&limit=100"
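        # A constructed URL looks like this (illustrative values):
        #   http://danbooru.donmai.us/post/index.xml?page=1&tags=touhou&limit=100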
        print ">>>", url
        page_images = parse_xml_page(url)
        if not page_images:
            # An empty page means we ran past the last one.
            page -= 1
            break
        all_images.extend(page_images)
        page += 1
    print "Found", page, "page(s)"

    return all_images


def create_filename(image_url, prefix):
    """ Creates a sane filename for the image from the last URL segment. """
    last_idx = image_url.rfind('/') + 1
    filename = image_url[last_idx:]
    if prefix:
        filename = prefix + "_" + filename
    # Decode percent-escapes (e.g. "%20") left over from the URL.
    filename = urllib.unquote(filename)
    return filename


def tag_image(filename, tags):
    """ Writes IPTC keyword tags [Iptc.Application2.Keywords] into the file. """
    if __has_tagger__:
        # Danbooru tag lists are space-separated, with underscores inside
        # each tag. Swap the separators so the list can be split on '%'
        # while underscores become spaces within each keyword.
        tags = tags.replace(' ', '%')
        tags = tags.replace('_', ' ')
        tags = tags.split('%')
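        # Worked example (sample tags): "long_hair blue_eyes"
        #   -> "long_hair%blue_eyes" -> "long hair%blue eyes"
        #   -> ["long hair", "blue eyes"]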
        image = TaggedImage(filename)
        image.add_tags(tags)
        image.write_tags()


def download_images(image_tuples, prefix):
    """ Downloads every (file_url, tags) tuple and tags the saved file. """
    print "Starting download..."
    total = len(image_tuples)
    for i, (url, tags) in enumerate(image_tuples, 1):
        print ">> ", i, "/", total, " > ", url
        set_title(str(i) + "/" + str(total))
        filename = create_filename(url, prefix)
        urllib.urlretrieve(url, filename)
        tag_image(filename, tags)
        time.sleep(WAIT_BETWEEN_PAGES)


def download_tag(site_url, tags, prefix=None):
    """
        Downloads all pictures matching a tag query. The optional prefix
        is prepended to each filename, for categorizing the files.
    """
    print "Downloading pics for tag(s): \"" + tags + "\" from " + site_url
    image_tuples = get_img_tuples(site_url, tags)
    print "Retrieved", len(image_tuples), "image URL(s)"
    download_images(image_tuples, prefix)

if __name__ == "__main__":
    usage = "usage: %prog [-s site] [-p prefix] tags"
    parser = OptionParser(version="0.4", usage=usage)
    parser.add_option('-s', '--site', metavar='SITE', dest='site_url',
                      help="URL of the site from which to download "
                           "[default is danbooru.donmai.us]",
                      default="danbooru.donmai.us")
    parser.add_option('-p', '--prefix', metavar='PREFIX', dest='prefix',
                      help="Prefix to give to the downloaded files",
                      default="")
    (opts, args) = parser.parse_args()

    if len(args) == 0:
        print "No tags given"
        parser.print_usage()
        sys.exit(1)
    site_url = opts.site_url
    prefix = opts.prefix
    # Multiple tags are joined with '+', the separator the index API expects.
    tags = "+".join(args)
    print "Site:", site_url
    print "Tags:", tags
    download_tag(site_url, tags, prefix=prefix)
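
# Example invocation (sample site and tags, for illustration; assumes the
# script is saved as danbooru_scraper.py and run under Python 2):
#
#   python danbooru_scraper.py -s konachan.com -p kona landscape sunset
#
# This downloads every post matching both tags and saves the files as
# "kona_<original name>", tagging them with IPTC keywords when the
# TaggedImage helper is available.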