import json
# Simple script that splits the line-delimited .json dump into many smaller .txt files containing only the comment bodies
# Reddit post here: http://www.reddit.com/r/datasets/comments/1mbsa2/155m_reddit_comments_over_15_days/
json_comments = 'dedupped1.json'
basename_out = 'folder1/comments_'
split_size = 250000


def split_comments(json_path, out_basename, chunk_size):
    """Split a line-delimited JSON file into numbered text files of comment bodies.

    Each non-blank line of ``json_path`` is parsed as a JSON object and its
    ``'body'`` value is appended to the current output file.  Output files are
    named ``out_basename + <n> + '.txt'`` with ``n`` starting at 1, and each
    holds at most ``chunk_size`` bodies.  No separator is inserted between
    bodies; they are written back-to-back.

    Args:
        json_path: Path of the input file, one JSON object per line.
        out_basename: Prefix (directory plus file stem) of the output files.
            The target directory must already exist.
        chunk_size: Maximum number of comment bodies per output file.

    Returns:
        The number of output files created.  The first file is always
        created, even when the input contains no comments.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        KeyError: If a parsed line has no ``'body'`` key.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    def open_chunk(number):
        # Text mode with an explicit encoding: the bodies are str, so the
        # encoding happens once at the I/O boundary instead of per comment.
        return open(out_basename + str(number) + '.txt', 'w', encoding='utf-8')

    file_number = 1
    written = 0
    outfile = open_chunk(file_number)
    try:
        with open(json_path, encoding='utf-8') as infile:
            for line in infile:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no comment.
                    continue
                # Parse before rolling over so a malformed line never leaves
                # an empty output file behind.
                body = json.loads(line)['body']
                # Roll over *before* writing: every file, including the first,
                # gets exactly chunk_size bodies, and no empty trailing file
                # is created when the input ends on a chunk boundary.
                if written and written % chunk_size == 0:
                    outfile.close()
                    file_number += 1
                    outfile = open_chunk(file_number)
                    print(written)  # progress: comments written so far
                outfile.write(body)
                written += 1
    finally:
        # Close the current chunk even if parsing or writing failed.
        outfile.close()
    return file_number


if __name__ == '__main__':
    split_comments(json_comments, basename_out, split_size)
    print('Done')