import json

# Simple script to split the .json file into many smaller .txt files containing only the bodies of the comments
# Reddit post here: http://www.reddit.com/r/datasets/comments/1mbsa2/155m_reddit_comments_over_15_days/

json_comments = 'dedupped1.json'
basename_out = 'folder1/comments_'
split_size = 250000


def split_comments(json_path=json_comments, out_basename=basename_out,
                   chunk_size=split_size):
    """Split a line-delimited JSON file into numbered .txt files of bodies.

    Each non-blank line of ``json_path`` is parsed as a JSON object and the
    UTF-8 encoded value of its ``'body'`` key is appended to the current
    output file.  Output files are named ``out_basename + '<n>.txt'`` with
    ``n`` starting at 1, and every file holds at most ``chunk_size`` bodies.

    Args:
        json_path: Path of the input file, one JSON object per line.
        out_basename: Prefix (directory included) of the output file names.
        chunk_size: Maximum number of bodies per output file; must be > 0.

    Returns:
        The number of output files created (at least 1, even for empty
        input, because the first file is opened before any line is read).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or a non-blank line
            is not valid JSON.
        KeyError: If a parsed object has no ``'body'`` key.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    counter = 1
    written = 0
    # Input is read as bytes and decoded explicitly so the result does not
    # depend on the platform's default text encoding.
    with open(json_path, 'rb') as f:
        # Binary mode: the bodies are encoded to UTF-8 bytes before writing,
        # which a text-mode handle rejects on Python 3.
        outfile = open(out_basename + str(counter) + '.txt', 'wb')
        try:
            for line in f:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue
                # Rotate *before* writing so every file, including the first,
                # holds exactly chunk_size bodies and no empty trailing file
                # is created when the input ends on a chunk boundary.
                if written and written % chunk_size == 0:
                    outfile.close()
                    counter += 1
                    outfile = open(out_basename + str(counter) + '.txt', 'wb')
                    print(written)
                body = json.loads(line.decode('utf-8'))['body']
                # NOTE(review): bodies are concatenated with no separator, so
                # a body without a trailing newline runs into the next one --
                # confirm this is intended.
                outfile.write(body.encode('utf-8'))
                written += 1
        finally:
            # Close the current output file even if parsing a line fails.
            outfile.close()
    return counter


if __name__ == '__main__':
    split_comments()
    print('Done')