spupy

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
    Scraper for Danbooru-like websites.
    Tested sites are:
        konachan.com
        danbooru.donmai.us
"""
import urllib
import xml.dom.minidom
import sys
import time
from optparse import OptionParser
try:
    from TaggedImage import TaggedImage
except ImportError:
    __has_tagger__ = False
else:
    __has_tagger__ = True


WAIT_BETWEEN_PAGES = 1


def set_title(title):
    """
        Sets the title of the terminal.
    """
    sys.stdout.write("\x1b]2;" + title + "\x07")


def parse_xml_page(url):
    """
        Parses one xml pages, find all images and tags in it.
    """
    try:
        f = urllib.urlopen(url)
        data = f.read()
        f.close()
        doc = xml.dom.minidom.parseString(data)
        urls = []
        posts = doc.getElementsByTagName("post")
        for post in posts:
            file_url = post.getAttribute("file_url")
            tags = post.getAttribute("tags")
            image_tuple = (file_url, tags)
            urls.append(image_tuple)
        return urls
    except IOError, error_code:
        print error_code
        return []


def get_img_tuples(site_url, tags):
    """
        Downloads all pages for a tag. Parses them and returns a list of the URLs of the images. Second parameter is the number of pages.
    """
    print "Getting image URLs..."
    set_title("Getting image URLs...")
    all_images = []
    page = 1
    while True:
        set_title("Parsing page " + str(page))
        url = "http://" + site_url + "/post/index.xml?page=" + str(page) + "&tags=" + tags + "&limit=100"
        print ">>>", url
        page_images = parse_xml_page(url)
        if len(page_images) == 0:
            page -= 1
            break
        if page_images:
            all_images.extend(page_images)
        page += 1
    print "Found", page, "page(s)"

    return all_images


def create_filename(image_url, prefix):
    """ Creates a sane filename for the image """
    last_idx = image_url.rfind('/') + 1
    filename = image_url[last_idx:]
    if prefix is not None and prefix != "":
        filename = prefix + "_" + filename
    filename = filename.replace("%20", " ")
    return filename


def tag_image(filename, tags):
    """ Put IPTC tags [Iptc.Application2.Keywords]. """
    if __has_tagger__:
        # Danbooru tags are space separated with _ in the tags
        # We want % separated with spaces in the tags
        tags = tags.replace(' ', '%')
        tags = tags.replace('_', ' ')
        tags = tags.split('%')
        image = TaggedImage(filename)
        image.add_tags(tags)
        image.write_tags()


def download_images(image_tuples, prefix):
    print "Starting download..."
    i = 0
    l = len(image_tuples)
    for tuple in image_tuples:
        i += 1
        url = tuple[0]
        tags = tuple[1]
        print ">> ", i, "/", l, " > ", url
        set_title(str(i) + "/" + str(l))
        filename = create_filename(url, prefix)
        urllib.urlretrieve(url, filename)
        tag_image(filename, tags)
        time.sleep(WAIT_BETWEEN_PAGES)


def download_tag(site_url, tags, prefix=None):
    """
        Downloads all pictures of a tag. Prefix is for categorizing the files.
    """
    print "Downloading pics for tag(s): \"" + tags + "\" from " + site_url
    image_tuples = get_img_tuples(site_url, tags)
    print "Retrieved ", len(image_tuples), "image URLs"
    download_images(image_tuples, prefix)

if __name__ == "__main__":
    usage = "usage: %prog [-s site] [-p prefix] tags"
    parser = OptionParser(version="0.4", usage=usage)
    parser.add_option('-s', '--site', metavar='SITE', dest='site_url', help="URL of the site from which to download [default is danbooru.donmai.us]", default="danbooru.donmai.us")
    parser.add_option('-p', '--prefix', metavar='PREFIX', dest='prefix', help="Prefix to give to the downloaded files", default="")
    (opts, args) = parser.parse_args()

    if len(args) == 0:
        print "No tags given"
        parser.print_usage()
        exit(1)
    site_url = opts.site_url
    prefix = opts.prefix
    tags = "+".join(args)
    print "Site:", site_url
    print "Tags:", tags
    download_tag(site_url, tags, prefix=prefix)