imcrazytwkr

Feedly dumper for tumblr blogs

Dec 9th, 2018
#!/usr/bin/env python3
# encoding: utf-8

from urllib.request import urlopen
from urllib.parse import urlparse, urlencode
from http.client import HTTPException
from shutil import copyfileobj
from datetime import datetime

import errno
import logging
import json
import sys
import re
import os

BASE_URL = "https://feedly.com/v3/streams/contents"
TUMBLR_ID = re.compile(r"/(\d+)(?:/|\Z)")
IMG_SRC = re.compile(r'<img\s+src="([^"]*)"\s*/?>')
RFC3339_STRING = "%Y-%m-%dT%H:%M:%S.%fZ"
MAX_ENTRIES = 999  # entries requested per Feedly page

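# Feedly's stream API (BASE_URL above) is paged: each request names the feed
# via streamId=feed/<feed url>, asks for up to "count" entries, and resumes
# from the previous page by passing a "continuation" cursor. A first-page
# request as built by fetch_page() below looks roughly like this
# (illustrative feed URL, not real data):
#
#   https://feedly.com/v3/streams/contents?streamId=feed%2Fhttps%3A%2F%2Fexample.tumblr.com%2Frss&count=999
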
def run(feed_url):
    count = MAX_ENTRIES
    last_id = ""
    all_posts = []

    # Page through the stream until Feedly returns a short page. A "full"
    # page may come back one entry short of MAX_ENTRIES because fetch_page()
    # strips the duplicated continuation entry.
    while count >= MAX_ENTRIES - 1:
        items, last_id = fetch_page(last_id)
        count = len(items)

        if not count:
            break

        all_posts.extend(items)

    try:
        root_path = os.path.join(os.getcwd(), sanitize_uri(feed_url))
        file_path = os.path.join(root_path, "posts.json")
        make_dir(root_path, True)

        with open(file_path, mode='w', encoding="UTF-8") as out_file:
            json.dump(all_posts, out_file, ensure_ascii=True, indent=2, sort_keys=True)
    except OSError as err:
        log.error("Error writing blog info: {err}".format(err=err))

def make_dir(dir_path, recursive=False):
    if os.path.exists(dir_path):
        if os.path.isdir(dir_path):
            return
        # Something other than a directory occupies the path; replace it
        os.remove(dir_path)

    try:
        if recursive:
            os.makedirs(dir_path)
        else:
            os.mkdir(dir_path)
    except OSError as err:
        # Tolerate a directory created by somebody else in the meantime
        if err.errno != errno.EEXIST:
            raise

def get_logger(name, level=logging.INFO):
    log_format = logging.Formatter('[%(asctime)s] (%(name)s) %(levelname)s: %(message)s')

    std_output = logging.StreamHandler(sys.stdout)
    std_output.setFormatter(log_format)
    std_output.setLevel(level)

    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(std_output)
    return logger

def fetch_page(last_id=""):
    # Relies on the module-level feed_url set in __main__
    params = {
        "streamId": "feed/{uri}".format(uri=feed_url),
        "count": MAX_ENTRIES
    }

    if last_id:
        params["continuation"] = last_id

    request = "{base_url}?{params}".format(base_url=BASE_URL, params=urlencode(params))
    items = fetch_json(request)

    if not items:
        return [], ""

    # Sometimes Feedly repeats the continuation entry as the first item,
    # sometimes not; drop it when it shows up
    if last_id and items[0]["id"] == last_id:
        items = items[1:]

    if not items:
        return [], ""

    # Hand back the raw Feedly id of the last entry as the next continuation
    # cursor; map_post() replaces "id" with the numeric tumblr post id, so
    # the caller cannot recover the cursor from the mapped posts
    return list(map(map_post, items)), items[-1]["id"]

def sanitize_uri(url):
    uri = urlparse(url).netloc

    port_separator = uri.find(':')
    if port_separator > -1:
        return uri[0:port_separator]

    return uri

def fetch_json(request):
    try:
        response = urlopen(request)
    except (HTTPException, OSError) as err:
        # HTTPError (non-2xx statuses) is an OSError subclass, so it lands here too
        log.error("Error fetching blog info: {err}".format(err=err))
        return []

    if response.getcode() != 200:
        data = response.read().decode("UTF-8")
        log.error("Error processing {url}: {data}".format(url=request, data=data))
        return []

    content_type = response.info().get_content_type()

    if content_type != "application/json":
        log.error("Unexpected Content-Type: {type}".format(type=content_type))
        return []

    try:
        data = json.load(response)
    except json.JSONDecodeError as err:
        log.error("Error parsing blog info: {err}".format(err=err))
        return []

    return data.get("items", [])

def map_post(data):
    # originId usually holds the canonical tumblr URL; fall back to the
    # first alternate link
    source = data.get("originId", data.get("alternate", [{}])[0].get("href", ""))
    info = {}

    # Extract the numeric tumblr post id from the source URL
    match = TUMBLR_ID.search(source)
    if match:
        info["id"] = int(match.group(1))
    else:
        info["id"] = 0

    info["title"] = data.get("title", data["id"])
    info["contents"] = data.get("summary", {}).get("content", "")

    # Feedly timestamps are in milliseconds
    timestamp = int(int(data.get("published", 0)) / 1000)
    info["created_at"] = datetime.utcfromtimestamp(timestamp).strftime(RFC3339_STRING)

    match = IMG_SRC.findall(info["contents"])
    if match:
        info["images"] = list(match)

    for image in info.get("images", []):
        download_image(image)

    return info

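# For reference, a mapped post comes out shaped like this (values are
# illustrative, not real data):
#
#   {
#     "id": 180900000000,
#     "title": "Post title",
#     "contents": "<p>...</p>",
#     "created_at": "2018-12-09T12:00:00.000000Z",
#     "images": ["https://66.media.tumblr.com/.../tumblr_xxx_1280.png"]
#   }
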
def download_image(url):
    # Relies on the module-level feed_url set in __main__
    root_path = os.path.join(os.getcwd(), sanitize_uri(feed_url), "images")
    make_dir(root_path, True)

    # Rewrite the size suffix (e.g. _500) to _1280 so the largest variant
    # tumblr serves is fetched; leave URLs without a size suffix alone
    if '_' in url.split('/')[-1]:
        breadcrumbs = url.split('_')
        breadcrumbs[-1] = "1280{ext_name}".format(ext_name=os.path.splitext(url)[1])
        url = '_'.join(breadcrumbs)

    file_name = os.path.join(root_path, url.split('/')[-1])

    try:
        response = urlopen(url)
    except (HTTPException, OSError) as err:
        log.error("Error fetching image {name}: {err}".format(err=err, name=file_name))
        return

    with open(file_name, 'wb') as out_file:
        copyfileobj(response, out_file)

if __name__ == "__main__":
    try:
        feed_url = sys.argv[1]
    except IndexError:
        print("You must provide a tumblr feed link!")
        sys.exit(1)

    log = get_logger(feed_url)
    run(feed_url)
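
Usage sketch (the file name feedly_dump.py is just an example; use whatever
you saved the paste as). Pass the blog's feed URL as the only argument:

    python3 feedly_dump.py https://staff.tumblr.com/rss

Posts are written to ./staff.tumblr.com/posts.json and the images they embed
are saved under ./staff.tumblr.com/images/.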