#!/usr/bin/env python3
# encoding: utf-8
from urllib.request import urlopen
from urllib.parse import urlparse, urlencode
from http.client import HTTPException
from shutil import copyfileobj
from datetime import datetime
import logging
import errno
import json
import sys
import re
import os

BASE_URL = "https://feedly.com/v3/streams/contents"
# Extracts the numeric post id from a Tumblr permalink such as ".../post/123456789/slug".
TUMBLR_ID = re.compile(r"/(\d+)(?:/|\Z)")
# Collects src URLs from tags like <img src="https://...jpg"/>.
IMG_SRC = re.compile(r"<img\s+src=\"([^\"]*)\"\s*/?>")
RFC3339_STRING = "%Y-%m-%dT%H:%M:%S.%fZ"
MAX_ENTRIES = 999
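
# The code below assumes the Feedly streams endpoint returns JSON shaped
# roughly like this. The shape is inferred from the fields read in
# map_post() and fetch_json(); it is not an official schema, and the
# example values are illustrative only:
#
# {
#   "continuation": "...",
#   "items": [
#     {
#       "id": "<feedly entry id>",
#       "originId": "https://example.tumblr.com/post/123456789/slug",
#       "title": "...",
#       "published": 1500000000000,
#       "summary": {"content": "<p>... <img src=\"...\"/> ...</p>"},
#       "alternate": [{"href": "https://example.tumblr.com/post/123456789"}]
#     }
#   ]
# }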
def run(feed_url):
    count = MAX_ENTRIES
    last_id = ""
    all_posts = []
    # Keep paging while Feedly returns full pages; a short page means the
    # stream is exhausted.
    while count >= MAX_ENTRIES:
        items = fetch_page(last_id)
        count = len(items)
        if count:
            # Page with the raw Feedly entry id: it doubles as the
            # continuation token, which the Tumblr id produced by
            # map_post() cannot.
            last_id = items[-1]["id"]
            all_posts.extend(map_post(item) for item in items)
    try:
        root_path = os.path.join(os.getcwd(), sanitize_uri(feed_url))
        file_path = os.path.join(root_path, "posts.json")
        make_dir(root_path, True)
        with open(file_path, mode='w', encoding="UTF-8") as out_file:
            json.dump(all_posts, out_file, ensure_ascii=True, indent=2, sort_keys=True)
    except OSError as err:
        log.error("Error writing blog info: {err}".format(err=err))
def make_dir(dir_path, recursive=False):
    if os.path.exists(dir_path):
        if os.path.isdir(dir_path):
            return
        # A non-directory is squatting on the target path; replace it.
        os.remove(dir_path)
    try:
        if recursive:
            os.makedirs(dir_path)
        else:
            os.mkdir(dir_path)
    except OSError as err:
        # A concurrent creation of the same directory is fine.
        if err.errno != errno.EEXIST:
            raise
def get_logger(name, level=logging.INFO):
    log_format = logging.Formatter('[%(asctime)s] (%(name)s) %(levelname)s: %(message)s')
    std_output = logging.StreamHandler(sys.stdout)
    std_output.setFormatter(log_format)
    std_output.setLevel(level)
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(std_output)
    return logger
def fetch_page(last_id=""):
    # Relies on the module-level feed_url set in the __main__ block.
    params = {
        "streamId": "feed/{uri}".format(uri=feed_url),
        "count": MAX_ENTRIES
    }
    if last_id:
        params["continuation"] = last_id
    request = "{base_url}?{params}".format(base_url=BASE_URL, params=urlencode(params))
    items = fetch_json(request)
    if not items:
        return items
    # Feedly sometimes echoes the continuation entry back as the first item
    # of the next page; drop it so posts are not duplicated.
    if last_id and items[0]["id"] == last_id:
        items = items[1:]
    return items
def sanitize_uri(url):
    """Return the host part of a URL, e.g. "https://example.com:8080/rss" -> "example.com"."""
    uri = urlparse(url).netloc
    port_separator = uri.find(':')
    if port_separator > -1:
        return uri[:port_separator]
    return uri
def fetch_json(request):
    try:
        response = urlopen(request)
    except (HTTPException, OSError) as err:
        log.error("Error fetching blog info: {err}".format(err=err))
        return []
    if response.getcode() != 200:
        data = response.read().decode("UTF-8")
        log.error("Error processing {url}: {data}".format(url=request, data=data))
        return []
    content_type = response.info().get_content_type()
    if content_type != "application/json":
        log.error("Unexpected Content-Type: {type}".format(type=content_type))
        return []
    try:
        data = json.load(response)
    except json.JSONDecodeError as err:
        log.error("Error parsing blog info: {err}".format(err=err))
        return []
    return data.get("items", [])
def map_post(data):
    # Prefer the original Tumblr permalink; fall back to the alternate link.
    source = data.get("originId", data.get("alternate", [{}])[0].get("href", ""))
    info = {}
    match = TUMBLR_ID.search(source)
    info["id"] = int(match.group(1)) if match else 0
    info["title"] = data.get("title", data["id"])
    info["contents"] = data.get("summary", {}).get("content", "")
    # Feedly publishes timestamps in milliseconds since the epoch.
    timestamp = int(data.get("published", 0)) // 1000
    info["created_at"] = datetime.utcfromtimestamp(timestamp).strftime(RFC3339_STRING)
    images = IMG_SRC.findall(info["contents"])
    if images:
        info["images"] = images
    for image in info.get("images", []):
        download_image(image)
    return info
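
# For reference, a mapped post comes out of map_post() looking roughly like
# this (illustrative values only, matching the fields assigned above):
#
# {
#   "id": 123456789,
#   "title": "...",
#   "contents": "<p>...</p>",
#   "created_at": "2017-07-14T02:40:00.000000Z",
#   "images": ["https://.../tumblr_abc123_500.jpg"]
# }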
def download_image(url):
    root_path = os.path.join(os.getcwd(), sanitize_uri(feed_url), "images")
    make_dir(root_path, True)
    file_name = os.path.join(root_path, url.split('/')[-1])
    # Normalize the local name so the Tumblr size suffix always reads "_1280",
    # e.g. "tumblr_abc_500.jpg" is saved as "tumblr_abc_1280.jpg". Note this
    # only renames the saved file; the download still uses the original URL.
    breadcrumbs = file_name.split('_')
    breadcrumbs[-1] = "1280{ext_name}".format(ext_name=os.path.splitext(file_name)[1])
    file_name = '_'.join(breadcrumbs)
    try:
        response = urlopen(url)
    except (HTTPException, OSError) as err:
        log.error("Error fetching image {name}: {err}".format(err=err, name=file_name))
        return
    with open(file_name, 'wb') as out_file:
        copyfileobj(response, out_file)
if __name__ == "__main__":
    try:
        feed_url = sys.argv[1]
    except IndexError:
        print("You must provide a tumblr feed link!")
        sys.exit(1)
    log = get_logger(feed_url)
    run(feed_url)
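
# Example run (hypothetical script name and feed URL):
#
#   $ python3 feedly_backup.py https://example.tumblr.com/rss
#
# This writes ./example.tumblr.com/posts.json and saves any images
# referenced in post bodies under ./example.tumblr.com/images/.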