Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json

# Simple script to split the .json file into many smaller .txt's only
# containing the body of the comments.
# Reddit post here: http://www.reddit.com/r/datasets/comments/1mbsa2/155m_reddit_comments_over_15_days/

# Input file: one JSON object per line, each with a 'body' key.
json_comments = 'dedupped1.json'
# Output files are named <basename_out><N>.txt with N starting at 1.
basename_out = 'folder1/comments_'
# Maximum number of comments written to each output file.
split_size = 250000


def _open_chunk(out_basename, index):
    """Open output file number *index* for writing UTF-8 text."""
    return open(out_basename + str(index) + '.txt', 'w', encoding='utf-8')


def split_comments(json_path, out_basename, chunk_size, separator='\n'):
    """Split a line-delimited JSON file into text files of comment bodies.

    Each non-blank line of *json_path* is parsed as a JSON object and its
    'body' value is appended to the current output file, followed by
    *separator*. A new output file is started once the current one holds
    *chunk_size* comments, so every file contains at most *chunk_size*
    comments. The running comment count is printed at every rollover.

    The first output file is always created, even for an empty input.
    The directory part of *out_basename* must already exist.

    Args:
        json_path: Path of the input file (UTF-8, one JSON object per line).
        out_basename: Prefix of the output paths; the file number and
            '.txt' are appended to it.
        chunk_size: Maximum number of comments per output file; must be > 0.
        separator: Text written after each body. Pass '' to concatenate
            bodies directly with nothing between them.

    Returns:
        The number of output files created.

    Raises:
        ValueError: If *chunk_size* is not positive.
        KeyError: If a JSON object has no 'body' key.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    file_index = 1
    written = 0
    outfile = _open_chunk(out_basename, file_index)
    try:
        with open(json_path, encoding='utf-8') as infile:
            for line in infile:
                # Blank lines (e.g. a trailing newline) are not JSON.
                if not line.strip():
                    continue
                # Parse first so a bad line does not leave an empty file.
                body = json.loads(line)['body']

                # Roll over BEFORE writing: this keeps every file at
                # exactly chunk_size comments and never creates an empty
                # trailing file.
                if written and written % chunk_size == 0:
                    outfile.close()
                    file_index += 1
                    outfile = _open_chunk(out_basename, file_index)
                    print(written)

                # The file is opened as UTF-8 text, so write str directly;
                # writing encoded bytes to a text file fails on Python 3.
                outfile.write(body)
                outfile.write(separator)
                written += 1
    finally:
        # Close the current chunk even if parsing or writing failed.
        outfile.close()

    return file_index


if __name__ == '__main__':
    split_comments(json_comments, basename_out, split_size)
    print('Done')
Add Comment
Please, Sign In to add comment