Advertisement
Guest User

fix_liner_merged_Alpaca-CoT.py

a guest
Apr 20th, 2024
40
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.75 KB | Source Code | 0 0
  1. # Step 1 (merged Alpaca-CoT fix): append a comma to every line except the last
  2. import ijson, json
  3. from tqdm import tqdm
  4. import codecs, os
  5.  
  6. def add_comma_end_line_file(input_filename, output_filename):
  7.     with open(input_filename, 'r', encoding='utf-8-sig') as input_file, open(output_filename, 'w', encoding='utf-8') as output_file:
  8.         prev_line = input_file.readline()
  9.         for line in input_file: # Add a comma at the end of the previous line and write it to the output file
  10.             output_file.write(prev_line.rstrip() + ',n')
  11.             prev_line = line
  12.         # Write the last line without a comma at the end
  13.         output_file.write(prev_line)
  14.  
  15. add_comma_end_line_file('input.json', 'input_comma.json')
  16.  
  17.  
  18. # Step 2 (merged Alpaca-CoT fix): rewrite the file as a formatted JSON array of instruction/input/output records
  19. import ijson, json
  20. from tqdm import tqdm
  21. import codecs
  22.  
  23. def format_json_file(input_filename, output_filename):
  24.     with codecs.open(input_filename, 'r', encoding='utf-8-sig') as input_file, open(output_filename, 'w', encoding='utf-8') as output_file:
  25.        
  26.         objects = ijson.items(input_file, 'item') # Parse the JSON objects from the input file one by one
  27.         output_file.write('[n')
  28.         first = True
  29.         for obj in tqdm(objects, desc="Processing"):
  30.             if not first:
  31.                 output_file.write(',n')
  32.             output_file.write('  {n')
  33.             output_file.write('    "instruction": ' + json.dumps(obj['instruction'], ensure_ascii=False) + ',n')
  34.             output_file.write('    "input": ' + json.dumps(obj['input'], ensure_ascii=False) + ',n')
  35.             output_file.write('    "output": ' + json.dumps(obj['output'], ensure_ascii=False) + 'n')
  36.             output_file.write('  }')
  37.             first = False
  38.         output_file.write('n]')
  39.  
  40. format_json_file('input_comma.json', 'input_format.json')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement