basic Substack scraper in Python

a guest
Oct 13th, 2022
# Simple Python script to download the text of public Substack posts and comments.
# Sole parameter is the name of the Substack blog to download; output consists of a
# CSV file of post metadata, a CSV file of comments, and a directory of HTML
# files containing the full text of each post. Future changes to Substack may
# cause this script to break.
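#
# Usage sketch (the filename is whatever you save this paste as; "substack_scraper.py"
# and the blog name "exampleblog" below are placeholders, not part of the original):
#     python substack_scraper.py exampleblog
# This writes exampleblog_substack_archive.csv, exampleblog_substack_comments.csv,
# and a directory exampleblog_substack/ containing one HTML file per post.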

import collections
import os
import pandas as pd
import requests
import time
import random
import json
import sys

blogger = sys.argv[1]

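# Each column spec below is [source_key], [source_key, output_name], or
# [source_key, output_name, transform]; keys missing from a record are stored as None.
# chr(10084) is the heart emoji key used for like counts in the API's "reactions" dict.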
archive_columns = [["id"],
                   ["publication_id", "publicationID"],
                   ["title"],
                   ["subtitle"],
                   ["type"],
                   ["slug"],
                   ["post_date", "utcTime", lambda x: pd.to_datetime(x)],
                   ["audience"],
                   ["write_comment_permissions", "commentPermissions"],
                   ["canonical_url", "url"],
                   ["section_id", "sectionID"],
                   ["reactions", "likes", lambda x: x[chr(10084)]],
                   ["comment_count", "comments"],
                   ["description"],
                   ["truncated_body_text", "previewText"],
                   ["wordcount", "wordCount"],
                   ["publishedBylines", "authorID", lambda x: x[0]["id"]],
                   ["publishedBylines", "authorName", lambda x: x[0]["name"]]]
comment_columns = [["id"],
                   ["name", "handle"],
                   ["body", "text"],
                   ["post_id", "postID"],
                   ["user_id", "userID"],
                   ["ancestor_path", "ancestorPath"],
                   ["type"],
                   ["deleted"],
                   ["date", "utcTime", lambda x: pd.to_datetime(x)],
                   ["edited_at", "editTime", lambda x: pd.to_datetime(x)],
                   ["reactions", "likes", lambda x: x[chr(10084)]],
                   ["user_banned", "userBanned"],
                   ["user_banned_for_comment", "bannedForComment"],
                   ["score"]]

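# parse_comments walks a post's nested comment tree depth-first, appending one row
# per comment (including replies) and returning the total number of comments seen.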
def parse_comments(comments, rows, blogger, comment_columns, depth):
    count = len(comments)
    for r in comments:
        row = collections.OrderedDict([("substack", blogger)])
        for col in comment_columns:
            key = col[0] if len(col) == 1 else col[1]
            try:
                item = r[col[0]]
                if len(col) > 2:
                    item = col[2](item)
            except:
                item = None
            row[key] = item
        rows.append(row)
        count = count + parse_comments(r["children"], rows, blogger, comment_columns, depth + 1)
    return count

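# Page through the archive API (page_size posts per request) until an empty page is
# returned, collecting one metadata row per post; failed requests are retried after
# a 60-second pause.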
page_size = 12
offset = 0
archive_url = "https://" + blogger + ".substack.com/api/v1/archive?sort=new&search=&offset="
archive_url_end = "&limit=" + str(page_size)
comments_url = "https://" + blogger + ".substack.com/api/v1/post/"
post_url = "https://" + blogger + ".substack.com/api/v1/posts/"
more = True
rows = []
while more:
    url = archive_url + str(offset) + archive_url_end
    print(url)
    try:
        r = requests.get(url)
        data = r.json()
    except:
        print("error, sleeping for 60 seconds...")
        time.sleep(60)
        data = None
    if data is not None:
        for r in data:
            row = collections.OrderedDict([("handle", blogger)])
            for col in archive_columns:
                key = col[0] if len(col) == 1 else col[1]
                try:
                    item = r[col[0]]
                    if len(col) > 2:
                        item = col[2](item)
                except:
                    item = None
                    print("error: " + key)
                row[key] = item
            rows.append(row)
        if len(data) == 0:
            more = False
        else:
            offset = offset + len(data)
    time.sleep(random.randint(2, 7))
print(len(rows))
df = pd.DataFrame(rows)
print(len(set(df["id"])))
df.to_csv(blogger + "_substack_archive.csv", index=False)

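# Second pass over the archive: fetch each post's full HTML body and then its comment
# thread, retrying after 60 seconds on errors and pausing a few seconds between posts.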
path = blogger + "_substack/"
fpath = os.path.dirname(path)
if not os.path.exists(fpath):
    os.makedirs(fpath)
rows = []
for ix, row in df.iterrows():
    post = row["id"]
    slug = row["slug"]
    url = post_url + slug
    print(url)
    going = True
    while going:
        try:
            r = requests.get(url)
            data = json.loads(r.text)
            going = False
            html = data["body_html"]
        except:
            html = None
        if going:
            print("error, sleeping for 60 seconds...")
            time.sleep(60)
    if html is not None:
        with open(path + str(post) + "-" + slug + ".html", "w") as f:
            f.write(html)
    latest = None
    more = True
    url = comments_url + str(post) + "/comments?token=&all_comments=true&sort=most_recent_first"
    print(url)
    data = None
    while data is None:
        try:
            r = requests.get(url)
            data = r.json()
            if data is None:
                break
        except:
            print("error, sleeping for 60 seconds...")
            time.sleep(60)
            data = None
    if data is not None:
        print(str(post) + ": " + str(
            parse_comments(data["comments"], rows, blogger, comment_columns, 0)))
    time.sleep(random.randint(2, 6))
    df = pd.DataFrame(rows)
    df.to_csv(blogger + "_substack_comments.csv", index=False)
df = pd.DataFrame(rows)
print(str(len(set(df["id"]))) + " comments downloaded")
df.to_csv(blogger + "_substack_comments.csv", index=False)
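#
# A minimal sketch of loading the output afterwards (assumes the blog name given on the
# command line was "exampleblog"; adjust the filenames to match your run):
#     archive = pd.read_csv("exampleblog_substack_archive.csv", parse_dates=["utcTime"])
#     comments = pd.read_csv("exampleblog_substack_comments.csv",
#                            parse_dates=["utcTime", "editTime"])
#     print(archive[["title", "utcTime", "likes", "comments"]].head())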