"""Split a line-delimited JSON comment dump into many smaller .txt files.

Each input line is expected to be one JSON object with a ``body`` field; only
that body text is written to the output files.

Reddit post describing the dataset:
http://www.reddit.com/r/datasets/comments/1mbsa2/155m_reddit_comments_over_15_days/
"""
import json
import os

json_comments = 'dedupped1.json'
basename_out = 'folder1/comments_'
split_size = 250000


def split_comments(json_path: str = json_comments,
                   basename: str = basename_out,
                   chunk_size: int = split_size,
                   separator: str = '') -> int:
    """Write the ``body`` of every comment in *json_path* to numbered files.

    Output files are named ``<basename><n>.txt`` with ``n`` starting at 1, and
    each file holds at most *chunk_size* comment bodies.

    Args:
        json_path: Path of the input file, one JSON object per line.
        basename: Output path prefix; the file number and ``.txt`` are
            appended. Its directory part is created if it does not exist.
        chunk_size: Maximum number of comments per output file (>= 1).
        separator: Text written after every comment body. Defaults to the
            empty string, i.e. bodies are written back to back.

    Returns:
        The number of output files created (always at least 1, because the
        first file is created even when the input holds no comments).

    Raises:
        ValueError: If *chunk_size* is smaller than 1, or a non-blank line is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If a JSON object has no ``body`` field.
        OSError: If the input cannot be read or an output cannot be written.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be at least 1')

    out_dir = os.path.dirname(basename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    counter = 1
    written = 0
    # Text mode with an explicit encoding: the bodies are ``str`` objects, so
    # they must not be encoded by hand before being written.
    outfile = open(basename + str(counter) + '.txt', 'w', encoding='utf-8')
    try:
        with open(json_path, encoding='utf-8') as infile:
            for line in infile:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                body = json.loads(line)['body']
                # Rotate *before* writing so that every file holds exactly
                # chunk_size comments and no empty trailing file is created.
                if written and written % chunk_size == 0:
                    outfile.close()
                    counter += 1
                    outfile = open(basename + str(counter) + '.txt', 'w',
                                   encoding='utf-8')
                    print(written)  # progress: comments written so far
                outfile.write(body)
                if separator:
                    outfile.write(separator)
                written += 1
    finally:
        # Closing twice is harmless, so this is safe on every exit path.
        outfile.close()
    return counter


if __name__ == '__main__':
    split_comments()
    print('Done')