Guest User

sharegpt2alcapa.py

a guest
Apr 23rd, 2024
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.27 KB | Source Code | 0 0
  1. import json
  2.  
  3. def combine_files(file1, file2, output_file):
  4.     combined = []
  5.    
  6.     for filename in [file1, file2]:
  7.         with open(filename, 'r', encoding='utf-8-sig') as f:
  8.             for i, line in enumerate(f, start=1):
  9.                 try:
  10.                     jsonl = json.loads(line)
  11.                     conversations = jsonl["conversations"]
  12.                     if len(conversations) < 2:
  13.                         print(f"Conversation with less than 2 elements in file {filename} on line {i}: {conversations}")
  14.                         continue
  15.                    
  16.                     history = []
  17.                     for j in range(0, len(conversations)-2, 2):
  18.                         if conversations[j+1]["value"] == "":
  19.                             print(f"Unpaired QA in file {filename} on line {i}: Question: {conversations[j]['value']}, Answer: {conversations[j+1]['value']}")
  20.                             continue
  21.                         history.append([conversations[j]["value"], conversations[j+1]["value"]])
  22.                    
  23.                     combined_data = {
  24.                         "instruction": conversations[-2]["value"],
  25.                         "input": "",
  26.                         "output": conversations[-1]["value"],
  27.                         "system": jsonl.get("system", ""),
  28.                         "history": history
  29.                     }
  30.                    
  31.                     combined.append(combined_data)
  32.                 except (json.JSONDecodeError, KeyError) as e:
  33.                     print(f"Error in file {filename} on line {i}: {line}")
  34.                     print(f"Conversation: {conversations}")
  35.                     raise e
  36.    
  37.     with open(output_file, 'w', encoding='utf-8') as out:
  38.         json.dump(combined, out, ensure_ascii=False, indent=2)
  39.  
  40.  
  41. combine_files('file1.jsonl', 'file2.jsonl', 'output-.json')
  42. import json
  43.  
  44. def combine_files(filenames, output_file):
  45.     combined = []
  46.     seen = set()
  47.    
  48.     for filename in filenames:
  49.         with open(filename, 'r', encoding='utf-8-sig') as f:
  50.             data = json.load(f)
  51.             for item in data:
  52.                 instruction = item.get('instruction', '').strip()
  53.                 input_ = item.get('input', '').strip()
  54.                 output = item.get('output', '').strip()
  55.                 system = item.get('system', '').strip()
  56.                 # default value for 'history'
  57.                 history = item.get('history', [])
  58.  
  59.                 if not output:
  60.                     continue
  61.  
  62.                 if not instruction and not input_:
  63.                     continue
  64.  
  65.                 if not all(key in item for key in ['instruction', 'input', 'output', 'system', 'history']):
  66.                     print(f"Missing key in item: {item}")
  67.                     continue
  68.                 item['instruction'] = instruction
  69.                 item['input'] = input_
  70.                 item['output'] = output
  71.                 item['system'] = system
  72.                 item['history'] = history
  73.  
  74.                 # string hashable
  75.                 item_str = json.dumps(item, sort_keys=True)
  76.                 if item_str not in seen:
  77.                     seen.add(item_str)
  78.                     combined.append(item)
  79.    
  80.     with open(output_file, 'w', encoding='utf-8') as out:
  81.         json.dump(combined, out, ensure_ascii=False, indent=2)
  82.  
  83. filenames = ['CapybaraPure-oasst2t1.json', 'Agent_no_robots.json', 'bagel-clean-v0.5.json', 'tulu-v2-openhermes2_5.json', 'wizard_vicuna_Persona.json']
  84. combine_files(filenames, 'combined.json')
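# NOTE(review): stray file name from the paste, presumably the intended final
# name of the combined output - confirm:
# CapybaraPure-oasst2t1-Agent_no_robots-bagel-clean-v0.5-tulu-v2-openhermes2_5-wizard_vicuna_Persona.json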
  86.  
  87. import json
  88.  
  89. def verify_json_structure(filename):
  90.     required_keys = ['instruction', 'input', 'output', 'system', 'history']
  91.  
  92.     with open(filename, 'r', encoding='utf-8-sig') as f:
  93.         try:
  94.             data = json.load(f)
  95.         except json.JSONDecodeError as e:
  96.             print(f"Invalid JSON file: {filename}")
  97.             print(f"Error: {e}")
  98.             return
  99.  
  100.         for i, item in enumerate(data, start=1):
  101.             missing_keys = [key for key in required_keys if key not in item]
  102.             if missing_keys:
  103.                 print(f"Missing keys in item {i}: {missing_keys}")
  104.  
  105. verify_json_structure('combined.json')
Advertisement
Add Comment
Please, Sign In to add comment