# Simple python script to download the text of public Substack posts and comments.
# Sole parameter is the name of the Substack blog to download; output consists of a
# CSV file of post metadata, a CSV file of comments, and a directory of HTML
# files containing the full text of each post. Future changes to Substack may
# cause this script to break.
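#
# Example invocation (a sketch; the script filename and blog name below are
# illustrative, not taken from the original):
#     python substack_download.py exampleblog
# would write exampleblog_substack_archive.csv and exampleblog_substack_comments.csv,
# plus a directory exampleblog_substack/ with one HTML file per post.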

import collections
import json
import os
import random
import sys
import time

import pandas as pd
import requests

# Substack subdomain to download, e.g. "example" for example.substack.com.
blogger = sys.argv[1]

# Each column spec is [api_key], [api_key, output_name], or
# [api_key, output_name, transform]; the transform is applied to the raw value.
archive_columns = [["id"],
                   ["publication_id", "publicationID"],
                   ["title"],
                   ["subtitle"],
                   ["type"],
                   ["slug"],
                   ["post_date", "utcTime", lambda x: pd.to_datetime(x)],
                   ["audience"],
                   ["write_comment_permissions", "commentPermissions"],
                   ["canonical_url", "url"],
                   ["section_id", "sectionID"],
                   ["reactions", "likes", lambda x: x[chr(10084)]],  # chr(10084) is the "❤" reaction key
                   ["comment_count", "comments"],
                   ["description"],
                   ["truncated_body_text", "previewText"],
                   ["wordcount", "wordCount"],
                   ["publishedBylines", "authorID", lambda x: x[0]["id"]],
                   ["publishedBylines", "authorName", lambda x: x[0]["name"]]]

comment_columns = [["id"],
                   ["name", "handle"],
                   ["body", "text"],
                   ["post_id", "postID"],
                   ["user_id", "userID"],
                   ["ancestor_path", "ancestorPath"],
                   ["type"],
                   ["deleted"],
                   ["date", "utcTime", lambda x: pd.to_datetime(x)],
                   ["edited_at", "editTime", lambda x: pd.to_datetime(x)],
                   ["reactions", "likes", lambda x: x[chr(10084)]],
                   ["user_banned", "userBanned"],
                   ["user_banned_for_comment", "bannedForComment"],
                   ["score"]]

def parse_comments(comments, rows, blogger, comment_columns, depth):
    """Recursively flatten a comment tree into rows; returns the number of comments seen."""
    count = len(comments)
    for r in comments:
        row = collections.OrderedDict([("substack", blogger)])
        for col in comment_columns:
            key = col[0] if len(col) == 1 else col[1]
            try:
                item = r[col[0]]
                if len(col) > 2:
                    item = col[2](item)
            except:
                item = None
            row[key] = item
        rows.append(row)
        count = count + parse_comments(r["children"], rows, blogger, comment_columns, depth + 1)
    return count

page_size = 12
offset = 0
archive_url = "https://" + blogger + ".substack.com/api/v1/archive?sort=new&search=&offset="
archive_url_end = "&limit=" + str(page_size)
comments_url = "https://" + blogger + ".substack.com/api/v1/post/"
post_url = "https://" + blogger + ".substack.com/api/v1/posts/"

# Page through the archive endpoint, collecting one metadata row per post.
more = True
rows = []
while more:
    url = archive_url + str(offset) + archive_url_end
    print(url)
    try:
        r = requests.get(url)
        data = r.json()
    except:
        print("error, sleeping for 60 seconds...")
        time.sleep(60)
        data = None
    if data is not None:
        for r in data:
            row = collections.OrderedDict([("handle", blogger)])
            for col in archive_columns:
                key = col[0] if len(col) == 1 else col[1]
                try:
                    item = r[col[0]]
                    if len(col) > 2:
                        item = col[2](item)
                except:
                    item = None
                    print("error: " + key)
                row[key] = item
            rows.append(row)
        # An empty page means the end of the archive has been reached.
        if len(data) == 0:
            more = False
        else:
            offset = offset + len(data)
        time.sleep(random.randint(2, 7))

print(len(rows))
df = pd.DataFrame(rows)
# Sanity check: distinct post ids vs. total rows collected.
print(len(set(df["id"])))
df.to_csv(blogger + "_substack_archive.csv", index=False)

# Directory for the full HTML text of each post.
path = blogger + "_substack/"
fpath = os.path.dirname(path)
if not os.path.exists(fpath):
    os.makedirs(fpath)

# Download each post's full HTML and its comment thread.
rows = []
for ix, row in df.iterrows():
    post = row["id"]
    slug = row["slug"]
    url = post_url + slug
    print(url)
    going = True
    while going:
        try:
            r = requests.get(url)
            data = json.loads(r.text)
            going = False
            html = data["body_html"]
        except:
            html = None
        if going:
            print("error, sleeping for 60 seconds...")
            time.sleep(60)
    if html is not None:
        with open(path + str(post) + "-" + slug + ".html", "w", encoding="utf-8") as f:
            f.write(html)
    url = comments_url + str(post) + "/comments?token=&all_comments=true&sort=most_recent_first"
    print(url)
    data = None
    while data is None:
        try:
            r = requests.get(url)
            data = r.json()
            if data is None:
                break
        except:
            print("error, sleeping for 60 seconds...")
            time.sleep(60)
            data = None
    if data is not None:
        print(str(post) + ": " + str(
            parse_comments(data["comments"], rows, blogger, comment_columns, 0)))
    time.sleep(random.randint(2, 6))
    # Incremental save so partial progress survives an interruption.
    df = pd.DataFrame(rows)
    df.to_csv(blogger + "_substack_comments.csv", index=False)

# Final save of the full comments table.
df = pd.DataFrame(rows)
print(str(len(set(df["id"]))) + " comments downloaded")
df.to_csv(blogger + "_substack_comments.csv", index=False)