basic Substack scraper in Python

a guest
Oct 13th, 2022
# Simple Python script to download the text of public Substack posts and comments.
# Sole parameter is the name of the Substack blog to download; output consists of a
# CSV file of post metadata, a CSV file of comments, and a directory of HTML
# files containing the full text of each post. Future changes to Substack may
# cause this script to break.
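#
# Usage sketch (the filename is whatever you save this paste as; "substack_scraper.py"
# and the blog name "exampleblog" below are placeholders, not part of the original):
#     python substack_scraper.py exampleblog
# This writes exampleblog_substack_archive.csv, exampleblog_substack_comments.csv,
# and a directory exampleblog_substack/ containing one HTML file per post.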

import collections
import os
import pandas as pd
import requests
import time
import random
import json
import sys

blogger = sys.argv[1]

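# Each column spec below is [source_key], [source_key, output_name], or
# [source_key, output_name, transform]; keys missing from a record are stored as None.
# chr(10084) is the heart emoji key used for like counts in the API's "reactions" dict.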
archive_columns = [["id"],
                   ["publication_id", "publicationID"],
                   ["title"],
                   ["subtitle"],
                   ["type"],
                   ["slug"],
                   ["post_date", "utcTime", lambda x: pd.to_datetime(x)],
                   ["audience"],
                   ["write_comment_permissions", "commentPermissions"],
                   ["canonical_url", "url"],
                   ["section_id", "sectionID"],
                   ["reactions", "likes", lambda x: x[chr(10084)]],
                   ["comment_count", "comments"],
                   ["description"],
                   ["truncated_body_text", "previewText"],
                   ["wordcount", "wordCount"],
                   ["publishedBylines", "authorID", lambda x: x[0]["id"]],
                   ["publishedBylines", "authorName", lambda x: x[0]["name"]]]
comment_columns = [["id"],
                   ["name", "handle"],
                   ["body", "text"],
                   ["post_id", "postID"],
                   ["user_id", "userID"],
                   ["ancestor_path", "ancestorPath"],
                   ["type"],
                   ["deleted"],
                   ["date", "utcTime", lambda x: pd.to_datetime(x)],
                   ["edited_at", "editTime", lambda x: pd.to_datetime(x)],
                   ["reactions", "likes", lambda x: x[chr(10084)]],
                   ["user_banned", "userBanned"],
                   ["user_banned_for_comment", "bannedForComment"],
                   ["score"]]

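# parse_comments walks a post's nested comment tree depth-first, appending one row
# per comment (including replies) and returning the total number of comments seen.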
def parse_comments(comments, rows, blogger, comment_columns, depth):
    count = len(comments)
    for r in comments:
        row = collections.OrderedDict([("substack", blogger)])
        for col in comment_columns:
            key = col[0] if len(col) == 1 else col[1]
            try:
                item = r[col[0]]
                if len(col) > 2:
                    item = col[2](item)
            except:
                item = None
            row[key] = item
        rows.append(row)
        count = count + parse_comments(r["children"], rows, blogger, comment_columns, depth + 1)
    return count

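# Page through the archive API (page_size posts per request) until an empty page is
# returned, collecting one metadata row per post; failed requests are retried after
# a 60-second pause.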
page_size = 12
offset = 0
archive_url = "https://" + blogger + ".substack.com/api/v1/archive?sort=new&search=&offset="
archive_url_end = "&limit=" + str(page_size)
comments_url = "https://" + blogger + ".substack.com/api/v1/post/"
post_url = "https://" + blogger + ".substack.com/api/v1/posts/"
more = True
rows = []
while more:
    url = archive_url + str(offset) + archive_url_end
    print(url)
    try:
        r = requests.get(url)
        data = r.json()
    except:
        print("error, sleeping for 60 seconds...")
        time.sleep(60)
        data = None
    if data is not None:
        for r in data:
            row = collections.OrderedDict([("handle", blogger)])
            for col in archive_columns:
                key = col[0] if len(col) == 1 else col[1]
                try:
                    item = r[col[0]]
                    if len(col) > 2:
                        item = col[2](item)
                except:
                    item = None
                    print("error: " + key)
                row[key] = item
            rows.append(row)
        if len(data) == 0:
            more = False
        else:
            offset = offset + len(data)
    time.sleep(random.randint(2, 7))
print(len(rows))
df = pd.DataFrame(rows)
print(len(set(df["id"])))
df.to_csv(blogger + "_substack_archive.csv", index=False)

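# Second pass over the archive: fetch each post's full HTML body and then its comment
# thread, retrying after 60 seconds on errors and pausing a few seconds between posts.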
path = blogger + "_substack/"
fpath = os.path.dirname(path)
if not os.path.exists(fpath):
    os.makedirs(fpath)
rows = []
for ix, row in df.iterrows():
    post = row["id"]
    slug = row["slug"]
    url = post_url + slug
    print(url)
    going = True
    while going:
        try:
            r = requests.get(url)
            data = json.loads(r.text)
            going = False
            html = data["body_html"]
        except:
            html = None
        if going:
            print("error, sleeping for 60 seconds...")
            time.sleep(60)
    if html is not None:
        with open(path + str(post) + "-" + slug + ".html", "w") as f:
            f.write(html)
    latest = None
    more = True
    url = comments_url + str(post) + "/comments?token=&all_comments=true&sort=most_recent_first"
    print(url)
    data = None
    while data is None:
        try:
            r = requests.get(url)
            data = r.json()
            if data is None:
                break
        except:
            print("error, sleeping for 60 seconds...")
            time.sleep(60)
            data = None
    if data is not None:
        print(str(post) + ": " + str(
            parse_comments(data["comments"], rows, blogger, comment_columns, 0)))
    time.sleep(random.randint(2, 6))
    df = pd.DataFrame(rows)
    df.to_csv(blogger + "_substack_comments.csv", index=False)
df = pd.DataFrame(rows)
print(str(len(set(df["id"]))) + " comments downloaded")
df.to_csv(blogger + "_substack_comments.csv", index=False)
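#
# A minimal sketch of loading the output afterwards (assumes the blog name given on the
# command line was "exampleblog"; adjust the filenames to match your run):
#     archive = pd.read_csv("exampleblog_substack_archive.csv", parse_dates=["utcTime"])
#     comments = pd.read_csv("exampleblog_substack_comments.csv",
#                            parse_dates=["utcTime", "editTime"])
#     print(archive[["title", "utcTime", "likes", "comments"]].head())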