# gettum.py

import re
import json
import shutil
import os
from requests import get
from bs4 import BeautifulSoup
from datetime import date as dt

def check_date(date):
    """Coerce *date* to a ``datetime.date``.

    Args:
        date: Either a ``datetime.date`` (returned unchanged), an instance
            of a ``date`` subclass such as ``datetime.datetime`` (reduced to
            a plain ``date`` with the same year, month and day), or any
            value whose ``str()`` is a year-month stamp: four year digits
            followed by the month number, e.g. ``201405`` or ``"201405"``.

    Returns:
        A ``datetime.date``. Values parsed from a year-month stamp are
        pinned to the first day of that month.

    Raises:
        ValueError: If the stamp cannot be parsed as integers or does not
            name a valid month.
    """
    if isinstance(date, dt):
        if type(date) is dt:
            return date
        # A subclass instance (e.g. datetime.datetime) is rebuilt as a plain
        # date so the result can be ordered against other plain dates;
        # mixed date/datetime comparisons raise TypeError.
        return dt(date.year, date.month, date.day)

    stamp = str(date)
    year = int(stamp[:4])
    month = int(stamp[4:])
    return dt(year, month, 1)


def month_list(date1, date2):
    """Return the first day of every month from one date to another.

    The two bounds may be given in either order; both are passed through
    ``check_date`` first. The range is inclusive at both ends.

    Args:
        date1: One bound of the range, in any form ``check_date`` accepts.
        date2: The other bound, in any form ``check_date`` accepts.

    Returns:
        A list of ``datetime.date`` objects in ascending order, one per
        month, each set to day 1.
    """
    d1, d2 = check_date(date1), check_date(date2)

    if d1 > d2:
        d1, d2 = d2, d1

    # Snap both bounds to the first of their month. Without this, an end
    # date such as the 10th makes the loop overshoot by one month, because
    # the 1st of the final month still compares less than the 10th.
    current = d1.replace(day=1)
    end = d2.replace(day=1)

    months = [current]
    while current < end:
        if current.month == 12:
            current = dt(current.year + 1, 1, 1)
        else:
            current = dt(current.year, current.month + 1, 1)
        months.append(current)

    return months


def get_posts(blog, archive_date):
    """Return the ids of the photo posts on one month of a blog's archive.

    Fetches ``http://<blog>.tumblr.com/archive/<year>/<month>`` and reads
    the ``data-json`` attribute of every ``div.post.is_photo`` element.

    Args:
        blog: Blog name, used as the subdomain of ``tumblr.com``.
        archive_date: The month to fetch, in any form ``check_date`` accepts.

    Returns:
        A list of ints, the ``post-id`` value of each matching element, in
        document order.
    """
    date = check_date(archive_date)
    archive_url = "http://%s.tumblr.com/archive/%s/%s" % (blog, date.year, date.month)
    response = get(archive_url)

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(response.text, "html.parser")

    return [int(json.loads(tag['data-json'])['post-id'])
            for tag in soup.select("div.post.is_photo")]


def get_images(blog, post_id):
    """Download the images referenced by one post into a directory named *blog*.

    The post page is scanned for Tumblr media URLs. For each distinct image
    the sizes in ``image_sizes`` are requested from largest to smallest, and
    the first one answered with HTTP 200 is saved; smaller sizes are then
    skipped. The directory *blog* is created if it does not exist.

    Args:
        blog: Blog name; used as the ``tumblr.com`` subdomain and as the
            output directory path.
        post_id: Id of the post whose page is scanned.

    Returns:
        A list with the final URL of every image that was saved.
    """
    post_url = "http://%s.tumblr.com/post/%s" % (blog, post_id)
    # Group 1 is the URL up to and including the underscore before the size;
    # group 2 is the file extension. Raw strings keep the regex escapes from
    # being read as (invalid) string escapes.
    image_url_re = r"(http://\d+\.media\.tumblr\.com/\w+/tumblr_\w+_)\d+\.(jpg|gif|png)"
    image_sizes = (1280, 500, 400, 250, 100)

    # A page usually references the same image at several sizes, and each
    # reference yields the same (prefix, extension) pair. Keep the first
    # occurrence only so every image is downloaded once.
    image_urls = []
    seen = set()
    for found in re.findall(image_url_re, get(post_url).text):
        if found not in seen:
            seen.add(found)
            image_urls.append(found)

    if not os.path.exists(blog):
        os.makedirs(blog)

    downloaded_images = []
    for prefix, extension in image_urls:
        for size in image_sizes:
            resp = get("%s%s.%s" % (prefix, size, extension), stream=True)
            try:
                if resp.status_code != 200:
                    # This size is not available; try the next smaller one.
                    continue
                file_name = re.search(r"(tumblr_\w+\.\w+)", resp.url).group(1)
                with open(os.path.join(blog, file_name), 'wb') as f:
                    for chunk in resp.iter_content(30000):
                        f.write(chunk)
                downloaded_images.append(resp.url)
            finally:
                # Streamed responses hold their connection until closed,
                # including the non-200 ones whose body is never read.
                resp.close()
            break

    return downloaded_images


def download(blog, date1, date2, progress=False):
    """Download the images of every photo post in a range of months.

    Walks each month returned by ``month_list(date1, date2)``, looks up that
    month's photo posts with ``get_posts`` and saves their images with
    ``get_images``.

    Args:
        blog: Blog name, passed through to ``get_posts`` and ``get_images``.
        date1: One bound of the month range.
        date2: The other bound of the month range.
        progress: When true, print each "year month" and, indented beneath
            it, each post id as it is processed.

    Returns:
        A list of the URLs returned by ``get_images`` for all posts, in the
        order they were processed.
    """
    downloaded_images = []
    for date in month_list(date1, date2):
        if progress:
            # Single-argument print(...) produces the same output whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s %s" % (date.year, date.month))
        for post in get_posts(blog, date):
            if progress:
                print("    " + str(post))
            downloaded_images.extend(get_images(blog, post))

    return downloaded_images