FaceDeer

Google BigQuery reddit comment parser

Jun 13th, 2016
125
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.80 KB | None | 0 0
  1. import json
  2. import re
  3.  
  4. with open("FaceDeer.json") as data:
  5.     jsondata = json.load(data)
  6.  
  7. with open("reddit archive data.txt") as newerdata:
  8.     newerjsondata = json.load(newerdata)
  9.  
  10. fullcomments = dict()
  11.  
  12. for key, comment in newerjsondata.items():
  13.     comdict = dict()
  14.     comdict['id'] = comment['id']
  15.     comdict['link_url'] = comment['link']
  16.     comdict['body'] = comment['body']
  17.     comdict['created'] = int(comment['created'])
  18.     comdict['link_title'] = comment['link_title']
  19.     comdict['subreddit'] = comment['subreddit']
  20.     comdict['link_id'] = None
  21.     comdict['author'] = None
  22.     comdict['fromBigQuery'] = False
  23.     fullcomments[comment['id']] = comdict
  24.  
  25. for comment in jsondata:
  26.  
  27.     if comment['id'] in fullcomments:
  28.         fullcomments[comment['id']]['link_id'] = comment['link_id']
  29.         fullcomments[comment['id']]['author'] = comment['author']
  30.     else:
  31.         link = "http://reddit.com/r/" + comment['subreddit'] + "/comments/" + re.sub('t[0-9]_','', comment['link_id']) + "/c/" + comment['id']
  32.         comdict = dict()
  33.         comdict['id'] = comment['id']
  34.         comdict['link_url'] = link
  35.         comdict['body'] = comment['body']
  36.         comdict['created'] = int(comment['created_utc'])
  37.         comdict['link_title'] = None
  38.         comdict['subreddit'] = comment['subreddit']
  39.         comdict['link_id'] = comment['link_id']
  40.         comdict['author'] = comment['author']
  41.         comdict['fromBigQuery'] = True
  42.         fullcomments[comment['id']] = comdict
  43.  
  44. ##with open("combined archive.txt", "w") as outfile:
  45. ##    json.dump(fullcomments, outfile)
  46.  
  47. commentlist = list(fullcomments.values())
  48.  
  49. result = sorted(commentlist, key= lambda comment: comment['created'])
  50.  
  51. print(result[0])
  52.  
  53. with open("sorted archive.txt", "w") as outfile:
  54.     json.dump(result, outfile)
Add Comment
Please, Sign In to add comment