Wayback Machine tweet scraper

  1. """
  2.  
  3. This is a very basic scraper for tweets archived on Wayback Machine that handles most tweets from
  4. November 2011 onward. It is rather slow and clunky and has plenty of room for improvement.
  5. To download an account's archived tweets:
  6.  
  7. python3 wayback.py [account]
  8.  
  9. i.e. to download archived tweets for @DrunkAlexJones:
  10.  
  11. python3 wayback.py DrunkAlexJones
  12.  
  13. This will create a DrunkAlexJones_wayback_tweets.csv file containing the archived tweets, as well as
  14. a DrunkAlexJones_wayback/ directory containing the raw HTML of the archives (which can get large).
  15.  
  16. Required libraries: bs4, pandas, requests
  17.  
  18. """
  19.  
  20.  
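# The dependencies can typically be installed with, e.g.:
#   pip install beautifulsoup4 pandas requests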
import bs4
import json
import os
import pandas as pd
import requests
import sys
import time

# utility functions
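# Tweet IDs issued since November 2010 are "snowflakes": the upper bits encode a
# millisecond timestamp relative to Twitter's epoch (1288834974657 ms, i.e.
# 2010-11-04T01:42:54.657Z). Shifting right by 22 bits discards the worker and
# sequence bits, and adding the epoch gives a Unix timestamp in milliseconds.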
def snowflake2utc (sf):
    return (sf >> 22) + 1288834974657

def get_retweet (text):
    if text.startswith ("RT @"):
        handle = text.split ()[1][1:]
        if handle.endswith (":"):
            return handle[:-1]
    return ""
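# e.g. get_retweet ("RT @jack: just setting up my twttr") returns "jack";
# anything that is not an old-style retweet returns "".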

# handlers for tweet HTML from a few different eras
def html1 (soup, elements, row):
    emoji = elements[0].find_all ("img", {"class" : "Emoji"})
    for e in emoji:
        e.replaceWith (e["alt"])
    text = elements[0].text.strip ()
    if len (text) == 0:
        return None
    else:
        row["text"] = text
    quote = elements[0].parent.find_next_sibling ("div")
    if quote is not None:
        try:
            qt_handle = quote.find ("span", {"class" : "username"}).text
            qt_text = quote.find ("div", {"class" : "QuoteTweet-text"}).text.strip ()
            row["quotedHandle"] = qt_handle.replace ("@", "")
            row["quotedText"] = qt_text
        except:
            row["quotedHandle"] = ""
            row["quotedText"] = ""
    reply_to = elements[0].parent.find_previous_sibling ("div")
    if reply_to is not None and "Replying to" in reply_to.text:
        try:
            reply_to_handle = reply_to.find ("span", {"class" : "username"}).text
            row["replyToHandle"] = reply_to_handle.replace ("@", "")
        except:
            row["replyToHandle"] = ""
    return row

def html2 (soup, elements, row):
    emoji = elements[0].find_all ("img", {"class" : "Emoji"})
    for e in emoji:
        e.replaceWith (e["alt"])
    text = elements[0].text.strip ()
    row["text"] = text
    row["quotedHandle"] = ""
    row["quotedText"] = ""
    if text.startswith ("@"):
        row["replyToHandle"] = text.split ()[0].replace ("@", "")
    return row

def html3 (soup, elements, row):
    element = elements[0].find ("p", {"class" : "js-tweet-text"})
    emoji = element.find_all ("img", {"class" : "Emoji"})
    for e in emoji:
        e.replaceWith (e["alt"])
    text = element.text.strip ()
    row["text"] = text
    row["quotedHandle"] = ""
    row["quotedText"] = ""
    if text.startswith ("@"):
        row["replyToHandle"] = text.split ()[0].replace ("@", "")
    return row

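# Each entry below is [CSS class, tag name, handler function]. parse_html tries the
# entries in order and hands the page to the first handler whose class/tag pair
# matches exactly one element, so another layout era can be supported by appending
# a new entry here.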
html_handlers = [["TweetTextSize--jumbo", "p", html1],
                 ["TweetTextSize--26px", "p", html2],
                 ["opened-tweet", "div", html3],
                 ["preexpanded", "div", html3]]

def parse_html (text, row):
    soup = bs4.BeautifulSoup (text, "html.parser")  # explicit parser avoids bs4's parser-guessing warning
    for handler in html_handlers:
        elements = soup.find_all (handler[1], {"class" : handler[0]})
        count = len (elements)
        if count == 1:
            try:
                row = handler[2] (soup, elements, row)
                if row is not None:
                    return row
            except:
                pass
    return None

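# JSON captures appear to be archived API/embed responses in Twitter's classic
# (v1.1-style) format, where "text", "in_reply_to_screen_name", and "quoted_status"
# are the standard field names.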
def parse_json (text, row):
    data = json.loads (text)
    row["text"] = data["text"]
    if "in_reply_to_screen_name" in data:
        row["replyToHandle"] = data["in_reply_to_screen_name"]
    if "quoted_status" in data:
        quote = data["quoted_status"]
        row["quotedText"] = quote["text"]
        row["quotedHandle"] = quote["user"]["screen_name"]
    return row

parsers = {"html" : parse_html, "json" : parse_json}

handle = sys.argv[1]

# read list of captured URLs from the Wayback Machine
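# Roughly, the timemap/CDX query below asks for every capture of a URL starting with
# https://twitter.com/<handle> (matchType=prefix), one row per distinct URL
# (collapse=urlkey), as JSON, returning only the listed fields (fl=...) and excluding
# captures that returned a 4xx/5xx status (filter=!statuscode:[45]..); the trailing
# "_=" parameter is just a cache-busting timestamp.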
url = "https://web.archive.org/web/timemap/json?url=https://twitter.com/" + handle \
      + "&matchType=prefix&collapse=urlkey&output=json&fl=original,mimetype," \
      + "timestamp,endtimestamp,groupcount,uniqcount&filter=!statuscode:[45].." \
      + "&limit=1000000&_=" + str (int (time.time () * 1000))
r = requests.get (url)
arc_file = handle + "_wayback.json"
with open (arc_file, "w") as file:
    file.write (r.text)

# create a data frame of captured tweets
json_data = json.loads (r.text)
df = pd.DataFrame ([{"tweetURL" : j[0],
                     "mime" : j[1],
                     "t" : j[2]} for j in json_data])
df = df[df["tweetURL"].str.contains ("status", regex=False)]
df["id"] = df["tweetURL"].apply (lambda u: u.split ("/status/")[-1])
df = df[df["id"].str.isnumeric ()]
df["id"] = df["id"].astype (int)
df = df[df["id"] > 292000000000000] # snowflake IDs only
df["archiveURL"] = df.apply (lambda r: "https://web.archive.org/web/" \
                             + r["t"] + "/" + r["tweetURL"], axis=1)
print (str (len (df.index)) + " archived tweets to try")
df.to_csv (handle + "_wayback_urls.csv", index=False)

# make directory, loop through, and download captures
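# The string checks below are a crude way of spotting the Wayback Machine's
# rate-limit and gateway-timeout responses; when they appear the loop sleeps and
# tries the same capture again, and network errors are retried up to 3 times.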
path = handle + "_wayback/"
if not os.path.exists (path):
    os.makedirs (path)
count = 0
errors = 0
for ix, row in df.iterrows ():
    retries = 3
    tweet_id = str (row["id"])
    mime = row["mime"]
    ftype = mime[mime.find ("/") + 1:]
    fname = path + tweet_id + "." + ftype
    if not os.path.exists (fname):
        while retries > 0:
            try:
                archive = row["archiveURL"]
                r = requests.get (archive, timeout=15)
                if len (r.text) == 0 or "<p>Job failed</p>" in r.text or \
                   "<p>The Wayback Machine has not archived that URL.</p>" in r.text:
                    print ("redirect or no content available " + tweet_id)
                    errors = errors + 1
                    retries = 0
                elif "<p>You have already reached the limit of active sessions.</p>" in r.text or \
                     "<h1>504 Gateway Time-out</h1>" in r.text:
                    print ("too many recent requests, sleeping...")
                    time.sleep (15)
                else:
                    with open (fname, "w") as file:
                        file.write (r.text)
                    retries = 0
                    count = count + 1
                    if count % 100 == 0:
                        print (str (count) + " retrieved so far")
            except:
                time.sleep (5)
                retries = retries - 1
                if retries == 0:
                    errors = errors + 1
                    print ("error retrieving " + tweet_id)
    else:
        count = count + 1

# loop through captures and build CSV
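# Each saved capture is matched to a parser by its file extension (anything other
# than .json is treated as HTML), and the tweet's UTC time is recovered from the
# snowflake ID in the filename.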
count = 0
rows = []
for file in os.listdir (path):
    try:
        mime = file[file.find (".") + 1:]
        if mime not in parsers:
            mime = "html"
        tweet_id = int (file.replace ("." + mime, ""))
        utc = snowflake2utc (tweet_id)
        row = {"id" : tweet_id, "utcTime" : utc, "type" : mime}
        with open (path + file) as fp:
            try:
                row = parsers[mime] (fp.read (), row)
            except:
                row = None
        if row is not None:
            rows.append (row)
            count = count + 1
            if count % 100 == 0:
                print (str (count) + " parsed successfully")
        else:
            print ("error: " + file)
            errors = errors + 1
    except:
        print ("error: " + file)
        errors = errors + 1
df = pd.DataFrame (rows)
df["handle"] = handle
df["utcTime"] = pd.to_datetime (df["utcTime"], unit="ms")
print (str (len (df.index)) + " Wayback Machine captures parsed successfully")
print (str (errors) + " errors or missing captures")
columns = ["handle", "tweetID", "utcTime", "archiveURL", "text", "type",
           "quotedHandle", "quotedText", "replyToHandle", "retweetHandle"]
df["id"] = df["id"].astype (int)
df["tweetID"] = df["id"]
df0 = pd.read_csv (handle + "_wayback_urls.csv")
df0["id"] = df0["id"].astype (int)
df = df.merge (df0[["id", "archiveURL"]], on="id")
df["retweetHandle"] = df["text"].apply (get_retweet)
df = df.sort_values ("tweetID")[columns]
df.to_csv (handle + "_wayback_tweets.csv", index=False)
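# A quick (hypothetical) way to sanity-check the finished CSV afterwards:
#
#   import pandas as pd
#   tweets = pd.read_csv ("DrunkAlexJones_wayback_tweets.csv")
#   print (tweets[["utcTime", "text"]].head ())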