# alboget.py

'''
This library is a collection of Python 3.7.1 functions to download and archive images
from tumblr.com, before it is too late.   You can grab all the images from a specific
tumblr, or archive your own Likes.  Pic files are saved with a filename formatted like
mySFWtumblr-1234567-kitty-cat-cute.jpg, where 1234567 is the post ID and kitty-cat-cute
are tags.  It may or may not work with Python versions earlier than Python 3.7.1.

0.  Get the latest version of Python and have some hazy understanding of how it works.
    https://www.python.org/

1.  Get the pytumblr library and install somewhere in your python path https://github.com/tumblr/pytumblr.
    You want the pytumblr directory there not the pytumblr-master.

2.  Get OAuth credentials/keys here:   https://www.tumblr.com/docs/en/api/v2
    The four keys are strings of 50 chars, consumer_key, consumer_secret, oauth_token, oauth_token_secret
    in that order.   Each tumblr you have has its own set of keys.  You do this while logged in on a browser
    to the particular tumblr in question.  This is a requirement to call the tumblr API.

    Edit this file, or preferably, create a separate file to contain your actual keys below.

3.  Save this file as alboget.py to a directory in your Python path.

4.  Start python console.  Import os and alboget.  Navigate to the directory where you want your incoming pics
    to be saved, e.g.  >>>os.chdir('C:\\TUMBLR').   Otherwise the files will be saved to the CWD.

5.  For your tumblr at mySFWtumblr.tumblr.com, you can save your Liked pics like this

    >>>alboget.updateLikes('mySFWtumblr', 0)

    or

    >>>alboget.updateLikes('mySFWtumblr', 3560)

    where 3560 was the number of Likes you had when you last did an update.

6.  To get the post pics from your own or any other tumblr, use

    >>>alboget.picScrape('mySFWtumblr', 0, 'mySFWtumblr')

    or

    >>>alboget.picScrape('female-presenting-nipples', 0, 'mySFWtumblr')

    where 0 will get all available pics (careful) and another number will get pics back to
    the post count specified.  You have to use YOUR OWN tumblr name in the third field, which tells
    the function which keys to use.

    This backwards seeming behaviour was put in to make it easier to
    do incremental backups of both Likes and Posts, and the directories created for the saved data have
    names like female-presenting-nipples-44560, where 44560 was the most recent post count when called.

7.  Tumblr Limitations.   In practice, tumblr has changed the way the API works so that it will only
    effectively fetch the most recent ~1000 posts from one's Likes.   This is not documented.   The
    console at https://api.tumblr.com/console also is not 100% faithful to reality.  It does seem that the
    posts function still works backward into the arbitrary past.  So when calling the updateLikes function,
    you should specify the count field as a number about 1000 less than your current Likes count.  Also, the
    API allows only 1000 calls per hour, and 5000 calls per day, however each call fetches 20 posts, so you can
    get up to 20,000 pics/hour.    The pic file download from tumblr's servers is slow, and I suspect
    that they throttle this speed once they detect that a single IP is grabbing many pics per minute.  If you
    have multiple tumblrs and multiple key sets, that will help to get around this limit.

8.  Happy downloading.

'''

import os
import re
import urllib
from urllib.parse import urlparse
import requests
import pytumblr

# Replace the placeholder strings below with your real OAuth keys and tumblr names.
# These values are dummies.  Add one entry per tumblr you own; the four strings in
# each entry are consumer_key, consumer_secret, oauth_token and oauth_token_secret,
# in that order.
keys = {
    'mySFWtumblr': [
        'XMKyiiypJP0Kz5EkUOKVn0dmqGBBSezxSFEgJCBghRUqizGtu3',
        'lmoBq8zLwe99YqGHn0rJYilQHiwKcPkXJQwLJxQzWiRR5zj8xV',
        'NiPZMaurMnhAEinHEiXS7ncguRkgCb36Asu3C9IJpgX8LrZ0Dv',
        'mu0vgW6qQEfMqS42kAthbex9Eq54sDSM6ME6YxTLMfLJPJ9brg',
    ],
    'myPORNtumblr': [
        'izljviYTtjTZJc9ftvB0W1y66fVQC2O7IHuce8GPH5U4fCb4FX',
        'YEGXrrigdSpQbwE1GrgwBWodlWJ3fifYScBVkpUSSCTwxLW6yR',
        'kWpNZ5v60CtYbOHIE6aNUogw2UgcIJ1jnpaHn2GoQZQg1YInnK',
        'x675q63ESyypmXJgjMIyWokvStEHriVMruQ9dIWElFcD5UDRmt',
    ],
}

# Directory the download folders are created under; defaults to wherever the
# module was imported from.  You can hard code your preferred dir here if you like.
tumblrrootdir = os.getcwd()

# A module-level `global` statement has no effect; the name `client` is only
# bound once initclient() has been called.
global client

# Lowercase aliases for the Python constants.
# NOTE(review): presumably here so JSON-style literals can be pasted in the
# console -- confirm before removing.
false = False
true = True
null = None

def cleanFN(filename):
    """Return filename stripped down to ASCII letters, digits, '-' and '_'.

    Every run of any other character (spaces, dots, punctuation, non-ASCII
    letters) is deleted, not replaced.
    """
    disallowed = "[^a-zA-Z0-9-_]+"
    sanitized = re.sub(disallowed, "", filename)
    return sanitized

def initclient(tname):
    """Create the module-wide pytumblr client for the tumblr named tname.

    The credentials stored under tname in the module-level keys dict are
    passed positionally to pytumblr.TumblrRestClient, and the result is
    stored in the global client used by the other functions.

    Raises KeyError if tname has no entry in keys.
    """
    global client
    credentials = keys[tname]
    client = pytumblr.TumblrRestClient(*credentials)

def namesandURLsFromPosts(postlist):
    """Turn a list of post dicts into [url, filename] pairs for their photos.

    Only posts whose 'type' is 'photo' contribute.  Each filename is
    blogname-postid, followed by the post's tags (joined with '-', spaces
    turned into '_', cut to 50 characters), passed through cleanFN, with the
    extension taken from the URL path appended.  Posts that do not hold
    exactly one photo get a two-digit running number (-01, -02, ...) before
    the extension.

    Returns a list of [original_size_url, filename] lists in post order.
    """
    collected = []
    for post in postlist:
        stem = post['blog_name'] + '-' + str(post['id'])
        if post['type'] != 'photo':
            continue

        tagpart = '-'.join(post['tags']).replace(' ', '_')[0:50]
        if tagpart:
            tagpart = '-' + tagpart

        photos = post['photos']
        numbered = len(photos) != 1   # a lone photo gets no -NN suffix
        for position, photo in enumerate(photos, start=1):
            source = photo['original_size']['url']
            # If the URL path has no file extension this yields an empty one.
            extension = os.path.splitext(urlparse(source).path)[1]
            suffix = '-' + str(position).zfill(2) if numbered else ''
            collected.append([source, cleanFN(stem + tagpart + suffix) + extension])
    return collected

def getPosts(tumblrname, mylimit, myoffset):
    """Fetch one page of posts from a blog and return its photo download list.

    Args:
        tumblrname: blog name without the '.tumblr.com' suffix.
        mylimit: number of posts to request in this call.
        myoffset: index of the first post to request.

    Returns:
        The [url, filename] pairs produced by namesandURLsFromPosts for the
        returned posts, or an empty list when the response carries no
        'posts' key (the response is printed in that case).

    Requires initclient() to have been called first.
    """
    global client
    tumresp = client.posts(tumblrname + '.tumblr.com', limit=mylimit, offset=myoffset)
    if 'posts' not in tumresp:
        # Separate the blog name from the offset and close the parenthesis so
        # the log line is readable.
        print('tumblr call fail at getPosts(' + tumblrname + ', myoffset = ' + str(myoffset) + ')')
        print(tumresp)
        return []
    return namesandURLsFromPosts(tumresp['posts'])

def getSelfLiked(mylimit, myoffset):
    """Fetch one page of the authenticated user's Likes as [url, filename] pairs.

    mylimit and myoffset are passed straight to client.likes().  When the
    response has no 'liked_posts' key, the response is printed and an empty
    list is returned.  A page that yields no photo entries is reported on
    the console but is not treated as an error.

    Requires initclient() to have been called first.
    """
    global client
    response = client.likes(limit=mylimit, offset=myoffset)
    if 'liked_posts' not in response:
        print('tumblr call fail at getSelfLiked at myoffset = ' + str(myoffset))
        print(response)
        return []

    pairs = namesandURLsFromPosts(response['liked_posts'])
    if pairs == []:
        print('empty list returned from getSelfLiked at (mylimit, myoffset) ' + str(mylimit) + ' ' + str(myoffset))
    return pairs

def getNUSincePrevious(tumblrname, previousCount):
    """Collect [url, filename] pairs for a blog's posts made since previousCount.

    The blog's current post count is read from blog_info; the difference to
    previousCount is walked in pages of 20 via getPosts, newest first.
    Pass previousCount = 0 to get them all.

    Requires initclient() to have been called first.
    """
    global client
    bloginfo = client.blog_info(tumblrname + '.tumblr.com')
    currentPostCount = bloginfo['blog']['posts']
    print('For ' + tumblrname + ' current post count = ' + str(currentPostCount))
    remaining = currentPostCount - previousCount
    return [pair
            for pageoffset in range(0, remaining, 20)
            for pair in getPosts(tumblrname, 20, pageoffset)]

def getLikedNUSincePrevious(previousCount):
    """Collect [url, filename] pairs for Likes added since previousCount.

    The current Likes count and user name are read from client.info(); the
    difference to previousCount is walked in pages of 20 via getSelfLiked.
    Pass previousCount = 0 to get them all.

    Requires initclient() to have been called first.
    """
    global client
    userinfo = client.info()['user']
    currentLikedCount = userinfo['likes']
    print('For ' + userinfo['name'] + ' current liked count = ' + str(currentLikedCount))
    remaining = currentLikedCount - previousCount
    return [pair
            for pageoffset in range(0, remaining, 20)
            for pair in getSelfLiked(20, pageoffset)]

def fetchPicsToCWD(NUlist, timeout=5.0):
    """Download every [url, filename] entry into the current working directory.

    Args:
        NUlist: iterable of [url, filename] entries, as built by
            namesandURLsFromPosts.
        timeout: seconds handed to requests.get for each download.

    Returns:
        The entries that could not be downloaded (request timed out, request
        raised any other exception, or the server answered with a non-OK
        status), so that the caller can retry them.
    """
    badcalls = []
    for entry in NUlist:
        url, filename = entry[0], entry[1]
        try:
            r = requests.get(url, timeout=timeout)
        except requests.exceptions.Timeout:
            badcalls.append(entry)
            print('requests.exceptions.Timeout at getting ' + url)
        except Exception as e:
            # Any other request failure (e.g. a dropped connection) is recorded
            # as well, so it shows up in the bad-call count and gets retried
            # instead of being silently lost.
            badcalls.append(entry)
            print('Caught an exception as some sub call of fetchPicsToCWD ')
            print(e)
        else:
            if r.status_code != requests.codes.ok:
                badcalls.append(entry)
                print('Bad call made with r.status_code = ' + str(r.status_code) + '\n     while getting ' + url)
            else:
                # NOTE: an existing file with the same name is overwritten.
                with open(filename, 'wb') as f:
                    f.write(r.content)
    print(str(len(badcalls)) + ' bad calls made in this pass of pic gets')
    return badcalls

def updateLikes(mytumblrname, previous):
    """Download the photos from your Likes into a fresh folder under tumblrrootdir.

    Args:
        mytumblrname: name of your tumblr; must have an entry in keys.
        previous: Likes count at the time of the last update; 0 fetches
            everything the API will return.

    The folder is named <mytumblrname>-LIKES-<current likes count>.  Failed
    downloads are retried once.  The working directory is always reset to
    tumblrrootdir before returning, even when a download step raises.
    """
    global client
    if mytumblrname not in keys:
        print('No keys in tumblrkeys.py available for ', mytumblrname)
        return

    initclient(mytumblrname)
    cinfo = client.info()
    currentLikedCount = cinfo['user']['likes']
    os.chdir(tumblrrootdir)
    newLikesDir = mytumblrname + '-LIKES-' + str(currentLikedCount)
    try:
        os.mkdir(newLikesDir)
    except FileExistsError:
        print('directory already exists, and it is fine')
    os.chdir(newLikesDir)
    try:
        print('starting to get ' + mytumblrname + ' photolinks likes ' + str(currentLikedCount) + ' down to ' + str(previous))
        nu = getLikedNUSincePrevious(previous)
        print('starting to download ' + str(len(nu)) + ' photo files to ' + newLikesDir)
        bc = fetchPicsToCWD(nu)
        if bc:
            print('Starting second pass on bad calls')
            fetchPicsToCWD(bc)
        print('updateLikes completed.')
    finally:
        # Do not leave the console stranded inside the download folder if an
        # API call or download raises.
        os.chdir(tumblrrootdir)

def picScrape(tumblrname, previous, myKeyName):
    """Download the post photos of any tumblr into a fresh folder under tumblrrootdir.

    Args:
        tumblrname: the blog to archive (without '.tumblr.com').
        previous: post count at the time of the last scrape; 0 fetches all
            available posts.
        myKeyName: name of YOUR tumblr whose credentials in keys are used
            for the API calls.

    The folder is named <tumblrname>-<current post count>.  Failed downloads
    are retried once.  The working directory is always reset to
    tumblrrootdir before returning, even when a download step raises.
    """
    global client
    if myKeyName not in keys:
        print('No keys in tumblrkeys.py available for ', myKeyName)
        return

    initclient(myKeyName)
    currentPostCount = client.blog_info(tumblrname + '.tumblr.com')['blog']['posts']
    os.chdir(tumblrrootdir)
    newPostsDir = tumblrname + '-' + str(currentPostCount)
    try:
        os.mkdir(newPostsDir)
    except FileExistsError:
        print('directory already exists, and it is fine')
    os.chdir(newPostsDir)
    try:
        print('starting to get ' + tumblrname + ' photolinks ' + str(currentPostCount) + ' down to ' + str(previous))
        nu = getNUSincePrevious(tumblrname, previous)
        print('starting to download ' + str(len(nu)) + ' photo files to ' + newPostsDir)
        bc = fetchPicsToCWD(nu)
        if bc:
            print('Starting second pass on bad calls')
            fetchPicsToCWD(bc)
        print('picScrape completed.')
    finally:
        # Do not leave the console stranded inside the download folder if an
        # API call or download raises.
        os.chdir(tumblrrootdir)