Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- """
- Scraper for Danbooru-like websites.
- Tested sites are:
- konachan.com
- danbooru.donmai.us
- """
import sys
import time
import urllib
import urllib.parse
import urllib.request
import xml.dom.minidom
from optparse import OptionParser
- try:
- from TaggedImage import TaggedImage
- except ImportError:
- __has_tagger__ = False
- else:
- __has_tagger__ = True
- WAIT_BETWEEN_PAGES = 1
- def set_title(title):
- """
- Sets the title of the terminal.
- """
- sys.stdout.write("\x1b]2;" + title + "\x07")
- def parse_xml_page(url):
- """
- Parses one xml pages, find all images and tags in it.
- """
- try:
- f = urllib.urlopen(url)
- data = f.read()
- f.close()
- doc = xml.dom.minidom.parseString(data)
- urls = []
- posts = doc.getElementsByTagName("post")
- for post in posts:
- file_url = post.getAttribute("file_url")
- tags = post.getAttribute("tags")
- image_tuple = (file_url, tags)
- urls.append(image_tuple)
- return urls
- except IOError, error_code:
- print error_code
- return []
- def get_img_tuples(site_url, tags):
- """
- Downloads all pages for a tag. Parses them and returns a list of the URLs of the images. Second parameter is the number of pages.
- """
- print "Getting image URLs..."
- set_title("Getting image URLs...")
- all_images = []
- page = 1
- while True:
- set_title("Parsing page " + str(page))
- url = "http://" + site_url + "/post/index.xml?page=" + str(page) + "&tags=" + tags + "&limit=100"
- print ">>>", url
- page_images = parse_xml_page(url)
- if len(page_images) == 0:
- page -= 1
- break
- if page_images:
- all_images.extend(page_images)
- page += 1
- print "Found", page, "page(s)"
- return all_images
- def create_filename(image_url, prefix):
- """ Creates a sane filename for the image """
- last_idx = image_url.rfind('/') + 1
- filename = image_url[last_idx:]
- if prefix is not None and prefix != "":
- filename = prefix + "_" + filename
- filename = filename.replace("%20", " ")
- return filename
- def tag_image(filename, tags):
- """ Put IPTC tags [Iptc.Application2.Keywords]. """
- if __has_tagger__:
- # Danbooru tags are space separated with _ in the tags
- # We want % separated with spaces in the tags
- tags = tags.replace(' ', '%')
- tags = tags.replace('_', ' ')
- tags = tags.split('%')
- image = TaggedImage(filename)
- image.add_tags(tags)
- image.write_tags()
- def download_images(image_tuples, prefix):
- print "Starting download..."
- i = 0
- l = len(image_tuples)
- for tuple in image_tuples:
- i += 1
- url = tuple[0]
- tags = tuple[1]
- print ">> ", i, "/", l, " > ", url
- set_title(str(i) + "/" + str(l))
- filename = create_filename(url, prefix)
- urllib.urlretrieve(url, filename)
- tag_image(filename, tags)
- time.sleep(WAIT_BETWEEN_PAGES)
- def download_tag(site_url, tags, prefix=None):
- """
- Downloads all pictures of a tag. Prefix is for categorizing the files.
- """
- print "Downloading pics for tag(s): \"" + tags + "\" from " + site_url
- image_tuples = get_img_tuples(site_url, tags)
- print "Retrieved ", len(image_tuples), "image URLs"
- download_images(image_tuples, prefix)
- if __name__ == "__main__":
- usage = "usage: %prog [-s site] [-p prefix] tags"
- parser = OptionParser(version="0.4", usage=usage)
- parser.add_option('-s', '--site', metavar='SITE', dest='site_url', help="URL of the site from which to download [default is danbooru.donmai.us]", default="danbooru.donmai.us")
- parser.add_option('-p', '--prefix', metavar='PREFIX', dest='prefix', help="Prefix to give to the downloaded files", default="")
- (opts, args) = parser.parse_args()
- if len(args) == 0:
- print "No tags given"
- parser.print_usage()
- exit(1)
- site_url = opts.site_url
- prefix = opts.prefix
- tags = "+".join(args)
- print "Site:", site_url
- print "Tags:", tags
- download_tag(site_url, tags, prefix=prefix)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement