Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # 1. fix for merged Alpaca-CoT
- import ijson, json
- from tqdm import tqdm
- import codecs, os
def add_comma_end_line_file(input_filename, output_filename):
    """Copy a text file, appending a comma to every line except the last.

    Each line but the final one is stripped of trailing whitespace and
    rewritten as ``<line>,`` followed by a newline. The final line is
    copied unchanged (including whether or not it ends with a newline).
    An empty input file produces an empty output file.

    Args:
        input_filename: Path of the file to read. It is decoded as
            ``utf-8-sig``, so a leading UTF-8 byte-order mark is dropped.
        output_filename: Path of the file to write, encoded as UTF-8.
    """
    with open(input_filename, 'r', encoding='utf-8-sig') as input_file, \
            open(output_filename, 'w', encoding='utf-8') as output_file:
        # Stay one line behind the reader so we know whether the line being
        # written is the last one (which must not receive a comma).
        prev_line = input_file.readline()
        for line in input_file:
            # Separator must be a comma plus a real newline; a bare ',n'
            # would glue every record onto a single corrupted line.
            output_file.write(prev_line.rstrip() + ',\n')
            prev_line = line
        # Write the last line without a comma at the end.
        output_file.write(prev_line)
# Step 1 entry point: read 'input.json' and write the comma-terminated copy.
add_comma_end_line_file(
    input_filename='input.json',
    output_filename='input_comma.json',
)
- # 2. fix for merged Alpaca-CoT
- import ijson, json
- from tqdm import tqdm
- import codecs
def format_json_file(input_filename, output_filename):
    """Stream a JSON array of records and rewrite it with one field per line.

    The input is parsed incrementally with ``ijson.items(..., 'item')``, so
    records are handled one at a time rather than loading the whole file.
    For every record only the ``instruction``, ``input`` and ``output``
    keys are written, in that order; non-ASCII characters are kept as-is
    (``ensure_ascii=False``).

    Args:
        input_filename: Path of the JSON file to read. It is decoded as
            ``utf-8-sig``, so a leading UTF-8 byte-order mark is dropped.
            NOTE(review): the ``'item'`` prefix presumably requires the
            top-level value to be a JSON array -- confirm against the data.
        output_filename: Path of the reformatted JSON file, encoded as UTF-8.

    Raises:
        KeyError: If a record lacks ``instruction``, ``input`` or ``output``.
    """
    with codecs.open(input_filename, 'r', encoding='utf-8-sig') as input_file, \
            open(output_filename, 'w', encoding='utf-8') as output_file:
        # Parse the JSON objects from the input file one by one.
        objects = ijson.items(input_file, 'item')
        output_file.write('[\n')
        first = True
        for obj in tqdm(objects, desc="Processing"):
            # The separator goes *before* every record but the first, so the
            # array never ends with a trailing comma.
            if not first:
                output_file.write(',\n')
            output_file.write(' {\n')
            output_file.write(' "instruction": ' + json.dumps(obj['instruction'], ensure_ascii=False) + ',\n')
            output_file.write(' "input": ' + json.dumps(obj['input'], ensure_ascii=False) + ',\n')
            output_file.write(' "output": ' + json.dumps(obj['output'], ensure_ascii=False) + '\n')
            output_file.write(' }')
            first = False
        output_file.write('\n]')
# Step 2 entry point: read 'input_comma.json' and write the reformatted array.
format_json_file(
    input_filename='input_comma.json',
    output_filename='input_format.json',
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement