imcrazytwkr

Stale tumblr checker

Dec 12th, 2018
215
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.98 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # encoding: utf-8
  3.  
  4. from urllib.request import urlopen
  5. from urllib.parse import urlparse, urlencode
  6. from http.client import HTTPException
  7. from datetime import datetime
  8. from sys import stdout, argv
  9. from os import getenv
  10.  
  11. import logging
  12. import json
  13.  
# Tumblr API consumer key; override via the TUMBLR_KEY environment variable.
# NOTE(review): a real-looking key is hard-coded as the fallback — it should be
# rotated and removed from source control.
API_KEY = getenv("TUMBLR_KEY", "8YUsKJvcJxo2MDwmWMDiXZGuMuIbeCwuQGP5ZHSEA4jBJPMnJT")
# Query-string fragment appended to every API request.
PARAMS = urlencode({ "api_key": API_KEY })

# strftime pattern producing an RFC 3339 / ISO-8601 UTC timestamp.
RFC3339_STRING = "%Y-%m-%dT%H:%M:%S.%fZ"
  18.  
  19. def get_logger(name, level=logging.INFO):
  20.     log_format = logging.Formatter('[%(asctime)s] (%(name)s) %(levelname)s: %(message)s')
  21.  
  22.     std_output = logging.StreamHandler(stdout)
  23.     std_output.setFormatter(log_format)
  24.     std_output.setLevel(level)
  25.  
  26.     logger = logging.getLogger(name)
  27.     logger.setLevel(logging.DEBUG)
  28.     logger.addHandler(std_output)
  29.     return logger
  30.  
  31. # Source file parsing
  32. def parse_list(filename):
  33.     result = []
  34.  
  35.     with open(filename, 'r', encoding="UTF-8") as reader:
  36.         for line in reader:
  37.             clean_line = line.strip()
  38.             if clean_line:
  39.                 result.append(urlparse(clean_line).netloc or clean_line)
  40.  
  41.     return frozenset(result)
  42.  
  43. def parse_blog(blog_url):
  44.     log = get_logger(blog_url)
  45.  
  46.     request = "https://api.tumblr.com/v2/blog/{uri}/info?{params}".format(
  47.         params = PARAMS,
  48.         uri = blog_url
  49.     )
  50.  
  51.     try:
  52.         response = urlopen(request)
  53.     except (HTTPException, OSError) as err:
  54.         log.error("Error fetching blog info: {err}".format(err=err))
  55.         return None
  56.  
  57.     content_type = response.info().get_content_type()
  58.  
  59.     if content_type != "application/json":
  60.         log.error("Unexpected Content-Type: {type}".format(type=content_type))
  61.         return None
  62.  
  63.     try:
  64.         data = json.load(response)
  65.     except JSONDecodeError as err:
  66.         log.error("Error parsing blog info: {err}".format(err=err))
  67.         return None
  68.  
  69.     status = data.get("meta", {}).get("status", response.getcode())
  70.  
  71.     if status != 200:
  72.         log.error("Error processing {url}: {data}".format(url=url, data=data))
  73.         return None
  74.  
  75.     info = data.get("response", {}).get("blog", {})
  76.     post_count = info.get("posts", 0)
  77.  
  78.     if post_count < 1:
  79.         return None
  80.  
  81.     return ','.join([
  82.         blog_url,
  83.         str(post_count),
  84.         datetime.utcfromtimestamp(int(info.get("updated", 0))).strftime(RFC3339_STRING)
  85.     ])
  86.  
  87.  
  88. if __name__ == "__main__":
  89.     # Parsing arguments
  90.     try:
  91.        source_file = argv[1]
  92.     except IndexError:
  93.        source_file = "tumblrs_raw.txt"
  94.  
  95.     try:
  96.         dest_file = argv[2]
  97.     except IndexError:
  98.         dest_file = "tumblrs_alive.txt"
  99.  
  100.     # Checking blogs
  101.     with open(dest_file, 'a', encoding="UTF-8") as out_file:
  102.         print("url,post_count,last_updated", file=out_file)
  103.         for url in sorted(parse_list(source_file)):
  104.             if url:
  105.                 csv_line = parse_blog(url)
  106.                 if csv_line:
  107.                     print(blog_url, file=out_file)
Advertisement
Add Comment
Please, Sign In to add comment