#!/usr/bin/env python3
# encoding: utf-8
from urllib.request import urlopen
from urllib.parse import urlparse, urlencode
from http.client import HTTPException
from shutil import copyfileobj
from datetime import datetime
import logging
import errno
import json
import sys
import re
import os

BASE_URL = "https://feedly.com/v3/streams/contents"
# Extracts the numeric post id from a Tumblr permalink such as ".../post/123456789/slug".
TUMBLR_ID = re.compile(r"/(\d+)(?:/|\Z)")
# Collects src URLs from tags like <img src="https://...jpg"/>.
IMG_SRC = re.compile(r"<img\s+src=\"([^\"]*)\"\s*/?>")
RFC3339_STRING = "%Y-%m-%dT%H:%M:%S.%fZ"
MAX_ENTRIES = 999
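
# The code below assumes the Feedly streams endpoint returns JSON shaped
# roughly like this. The shape is inferred from the fields read in
# map_post() and fetch_json(); it is not an official schema, and the
# example values are illustrative only:
#
# {
#   "continuation": "...",
#   "items": [
#     {
#       "id": "<feedly entry id>",
#       "originId": "https://example.tumblr.com/post/123456789/slug",
#       "title": "...",
#       "published": 1500000000000,
#       "summary": {"content": "<p>... <img src=\"...\"/> ...</p>"},
#       "alternate": [{"href": "https://example.tumblr.com/post/123456789"}]
#     }
#   ]
# }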
def run(feed_url):
    count = MAX_ENTRIES
    last_id = ""
    all_posts = []
    # Keep paging while Feedly returns full pages; a short page means the
    # stream is exhausted.
    while count >= MAX_ENTRIES:
        items = fetch_page(last_id)
        count = len(items)
        if count:
            # Page with the raw Feedly entry id: it doubles as the
            # continuation token, which the Tumblr id produced by
            # map_post() cannot.
            last_id = items[-1]["id"]
            all_posts.extend(map_post(item) for item in items)
    try:
        root_path = os.path.join(os.getcwd(), sanitize_uri(feed_url))
        file_path = os.path.join(root_path, "posts.json")
        make_dir(root_path, True)
        with open(file_path, mode='w', encoding="UTF-8") as out_file:
            json.dump(all_posts, out_file, ensure_ascii=True, indent=2, sort_keys=True)
    except OSError as err:
        log.error("Error writing blog info: {err}".format(err=err))
def make_dir(dir_path, recursive=False):
    if os.path.exists(dir_path):
        if os.path.isdir(dir_path):
            return
        # A non-directory is squatting on the target path; replace it.
        os.remove(dir_path)
    try:
        if recursive:
            os.makedirs(dir_path)
        else:
            os.mkdir(dir_path)
    except OSError as err:
        # A concurrent creation of the same directory is fine.
        if err.errno != errno.EEXIST:
            raise
def get_logger(name, level=logging.INFO):
    log_format = logging.Formatter('[%(asctime)s] (%(name)s) %(levelname)s: %(message)s')
    std_output = logging.StreamHandler(sys.stdout)
    std_output.setFormatter(log_format)
    std_output.setLevel(level)
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(std_output)
    return logger
def fetch_page(last_id=""):
    # Relies on the module-level feed_url set in the __main__ block.
    params = {
        "streamId": "feed/{uri}".format(uri=feed_url),
        "count": MAX_ENTRIES
    }
    if last_id:
        params["continuation"] = last_id
    request = "{base_url}?{params}".format(base_url=BASE_URL, params=urlencode(params))
    items = fetch_json(request)
    if not items:
        return items
    # Feedly sometimes echoes the continuation entry back as the first item
    # of the next page; drop it so posts are not duplicated.
    if last_id and items[0]["id"] == last_id:
        items = items[1:]
    return items
def sanitize_uri(url):
    """Return the host part of a URL, e.g. "https://example.com:8080/rss" -> "example.com"."""
    uri = urlparse(url).netloc
    port_separator = uri.find(':')
    if port_separator > -1:
        return uri[:port_separator]
    return uri
def fetch_json(request):
    try:
        response = urlopen(request)
    except (HTTPException, OSError) as err:
        log.error("Error fetching blog info: {err}".format(err=err))
        return []
    if response.getcode() != 200:
        data = response.read().decode("UTF-8")
        log.error("Error processing {url}: {data}".format(url=request, data=data))
        return []
    content_type = response.info().get_content_type()
    if content_type != "application/json":
        log.error("Unexpected Content-Type: {type}".format(type=content_type))
        return []
    try:
        data = json.load(response)
    except json.JSONDecodeError as err:
        log.error("Error parsing blog info: {err}".format(err=err))
        return []
    return data.get("items", [])
def map_post(data):
    # Prefer the original Tumblr permalink; fall back to the alternate link.
    source = data.get("originId", data.get("alternate", [{}])[0].get("href", ""))
    info = {}
    match = TUMBLR_ID.search(source)
    info["id"] = int(match.group(1)) if match else 0
    info["title"] = data.get("title", data["id"])
    info["contents"] = data.get("summary", {}).get("content", "")
    # Feedly publishes timestamps in milliseconds since the epoch.
    timestamp = int(data.get("published", 0)) // 1000
    info["created_at"] = datetime.utcfromtimestamp(timestamp).strftime(RFC3339_STRING)
    images = IMG_SRC.findall(info["contents"])
    if images:
        info["images"] = images
    for image in info.get("images", []):
        download_image(image)
    return info
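
# For reference, a mapped post comes out of map_post() looking roughly like
# this (illustrative values only, matching the fields assigned above):
#
# {
#   "id": 123456789,
#   "title": "...",
#   "contents": "<p>...</p>",
#   "created_at": "2017-07-14T02:40:00.000000Z",
#   "images": ["https://.../tumblr_abc123_500.jpg"]
# }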
def download_image(url):
    root_path = os.path.join(os.getcwd(), sanitize_uri(feed_url), "images")
    make_dir(root_path, True)
    file_name = os.path.join(root_path, url.split('/')[-1])
    # Normalize the local name so the Tumblr size suffix always reads "_1280",
    # e.g. "tumblr_abc_500.jpg" is saved as "tumblr_abc_1280.jpg". Note this
    # only renames the saved file; the download still uses the original URL.
    breadcrumbs = file_name.split('_')
    breadcrumbs[-1] = "1280{ext_name}".format(ext_name=os.path.splitext(file_name)[1])
    file_name = '_'.join(breadcrumbs)
    try:
        response = urlopen(url)
    except (HTTPException, OSError) as err:
        log.error("Error fetching image {name}: {err}".format(err=err, name=file_name))
        return
    with open(file_name, 'wb') as out_file:
        copyfileobj(response, out_file)
if __name__ == "__main__":
    try:
        feed_url = sys.argv[1]
    except IndexError:
        print("You must provide a tumblr feed link!")
        sys.exit(1)
    log = get_logger(feed_url)
    run(feed_url)
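
# Example run (hypothetical script name and feed URL):
#
#   $ python3 feedly_backup.py https://example.tumblr.com/rss
#
# This writes ./example.tumblr.com/posts.json and saves any images
# referenced in post bodies under ./example.tumblr.com/images/.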