"""Split a line-delimited JSON comment dump into smaller plain-text files.

Each input line is parsed as one JSON object and only its ``body`` field is
written out. Output files are named ``<basename_out><N>.txt`` with N counting
up from 1.

Reddit post here: http://www.reddit.com/r/datasets/comments/1mbsa2/155m_reddit_comments_over_15_days/
"""
import io
import json

# Input file: one JSON object per line, each with a 'body' key.
json_comments = 'dedupped1.json'
# Prefix (including directory) of every output file; the directory must exist.
basename_out = 'folder1/comments_'
# Number of comments written to each output file.
split_size = 250000


def split_comments(json_path, out_basename, chunk_size, separator=''):
    """Write the ``body`` of every JSON line in *json_path* to chunk files.

    Args:
        json_path: Path of the line-delimited JSON input file (UTF-8).
        out_basename: Prefix of the output files; ``<N>.txt`` is appended.
        chunk_size: Maximum number of comments per output file (>= 1).
        separator: Text appended after each comment body. Defaults to the
            empty string, i.e. bodies are concatenated back to back.

    Returns:
        The number of output files created (always at least 1, because the
        first file is created even when the input is empty).

    Raises:
        ValueError: If *chunk_size* is smaller than 1.
        KeyError: If a JSON object has no ``body`` key.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    counter = 1
    # io.open gives an explicit UTF-8 text layer on both Python 2 and 3, so
    # the decoded body can be written directly without a manual .encode().
    with io.open(json_path, encoding='utf-8') as f:
        outfile = io.open(out_basename + str(counter) + '.txt', 'w',
                          encoding='utf-8')
        try:
            for pos, line in enumerate(f):
                # Rotate *before* writing so that every file holds exactly
                # chunk_size comments and no empty trailing file is created.
                if pos and pos % chunk_size == 0:
                    outfile.close()
                    counter += 1
                    outfile = io.open(out_basename + str(counter) + '.txt',
                                      'w', encoding='utf-8')
                    print(pos)  # progress: number of comments handled so far
                outfile.write(json.loads(line)['body'] + separator)
        finally:
            # Closing an already-closed file is a no-op, so this is safe even
            # if reopening during a rotation failed.
            outfile.close()
    return counter


if __name__ == '__main__':
    split_comments(json_comments, basename_out, split_size)
    print('Done')