Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from nltk.tokenize import word_tokenize
- import pickle
- import pprint
- import json
- """
- (heads, descs, keywords) = ([headline], [description], )
- """
- DATA_FILE = 'data/signalmedia-1m.jsonl.test'
- PICKLE_FILE = 'data/tokens.pkl'
def write_to_pickle(filename, data):
    """Serialize *data* to *filename* with pickle (binary mode)."""
    with open(filename, 'wb') as out_file:
        pickle.dump(data, out_file)
def read_from_pickle(filename):
    """Load and return the pickled object stored in *filename*.

    Bug fix: the file must be opened in binary mode ('rb') — pickle data is
    bytes, and text mode raises UnicodeDecodeError/TypeError on load.

    NOTE(security): pickle.load executes arbitrary code from the file;
    only use on trusted, locally produced files (as this script does).
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)
def get_json_data(filename):
    """Read a JSONL file and collect article titles and bodies.

    Each line of *filename* is a standalone JSON object with at least
    "title" and "content" keys.

    Fix: the original parsed every line twice (one json.loads per field);
    parse each line once and pull both fields from the same dict.

    Returns:
        (heads, descs, None) — lists of titles and contents in file order;
        the third slot is a placeholder (keywords are not extracted).
    """
    HEADER = "title"
    DESCRIPTION = "content"
    heads = []
    descs = []
    with open(filename, 'r') as json_data:
        for json_object in json_data:
            record = json.loads(json_object)
            heads.append(record[HEADER])
            descs.append(record[DESCRIPTION])
    return heads, descs, None
# Script driver: extract (titles, contents, None) from the JSONL corpus,
# persist the tuple to PICKLE_FILE, then reload and pretty-print it as a
# round-trip sanity check.
data = get_json_data(DATA_FILE)
write_to_pickle(PICKLE_FILE, data)
data = read_from_pickle(PICKLE_FILE)
pprint.pprint(data)
Add Comment
Please, Sign In to add comment