Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
def combine_files(file1, file2, output_file):
    """Convert two conversation-style JSONL files into one instruction-style JSON file.

    Every non-blank line of ``file1`` and ``file2`` is parsed as a JSON object
    holding a ``"conversations"`` list whose entries each carry a ``"value"``.
    The last two entries become ``instruction`` and ``output``; the entries
    before them are paired two at a time into ``history``. The optional
    ``"system"`` field is copied through (empty string when absent). All
    records are written to ``output_file`` as one indented JSON array.

    Args:
        file1: Path of the first JSONL input file.
        file2: Path of the second JSONL input file.
        output_file: Path of the JSON file to write.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"conversations"`` key or an entry has
            no ``"value"`` key.
    """
    combined = []
    for filename in [file1, file2]:
        with open(filename, 'r', encoding='utf-8-sig') as f:
            for i, line in enumerate(f, start=1):
                # Blank lines are not records; skipping them keeps a stray
                # empty line from aborting the whole conversion. Line numbers
                # in messages still refer to the physical line.
                if not line.strip():
                    continue
                # Reset per line so the error report below never reads an
                # unbound name or the previous line's conversation.
                conversations = None
                try:
                    jsonl = json.loads(line)
                    conversations = jsonl["conversations"]
                    if len(conversations) < 2:
                        print(f"Conversation with less than 2 elements in file {filename} on line {i}: {conversations}")
                        continue
                    history = []
                    # NOTE(review): pairing assumes entries alternate
                    # question/answer starting at index 0 — confirm for
                    # conversations with an odd number of entries.
                    for j in range(0, len(conversations)-2, 2):
                        if conversations[j+1]["value"] == "":
                            print(f"Unpaired QA in file {filename} on line {i}: Question: {conversations[j]['value']}, Answer: {conversations[j+1]['value']}")
                            continue
                        history.append([conversations[j]["value"], conversations[j+1]["value"]])
                    combined_data = {
                        "instruction": conversations[-2]["value"],
                        "input": "",
                        "output": conversations[-1]["value"],
                        "system": jsonl.get("system", ""),
                        "history": history
                    }
                    combined.append(combined_data)
                except (json.JSONDecodeError, KeyError):
                    print(f"Error in file {filename} on line {i}: {line}")
                    print(f"Conversation: {conversations}")
                    # Bare raise keeps the original exception and traceback.
                    raise
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(combined, out, ensure_ascii=False, indent=2)
# Convert the two JSONL inputs into a single JSON output file.
combine_files(
    file1='file1.jsonl',
    file2='file2.jsonl',
    output_file='output-.json',
)
- import json
def combine_files(filenames, output_file):
    """Merge instruction-style JSON files into one de-duplicated JSON file.

    Each input file must contain a JSON array of objects. A record is kept
    only if it has all of ``instruction``, ``input``, ``output``, ``system``
    and ``history``, its stripped ``output`` is non-empty, and at least one
    of its stripped ``instruction`` / ``input`` is non-empty. The four text
    fields are stored stripped. Exact duplicates (compared after stripping)
    are written once, in first-seen order.

    Args:
        filenames: Iterable of paths to the JSON files to merge.
        output_file: Path of the merged JSON file to write.

    Raises:
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    required_keys = ('instruction', 'input', 'output', 'system', 'history')
    text_keys = ('instruction', 'input', 'output', 'system')
    combined = []
    seen = set()
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
        for item in data:
            # Validate structure first: otherwise a record lacking 'output'
            # is dropped by the emptiness filter below without ever being
            # reported as malformed.
            if not all(key in item for key in required_keys):
                print(f"Missing key in item: {item}")
                continue
            for key in text_keys:
                item[key] = item[key].strip()
            if not item['output']:
                continue
            if not item['instruction'] and not item['input']:
                continue
            # Dicts are unhashable; a canonical JSON string serves as the
            # de-duplication key.
            item_str = json.dumps(item, sort_keys=True)
            if item_str not in seen:
                seen.add(item_str)
                combined.append(item)
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(combined, out, ensure_ascii=False, indent=2)
# JSON files to merge into combined.json, in processing order.
filenames = [
    'CapybaraPure-oasst2t1.json',
    'Agent_no_robots.json',
    'bagel-clean-v0.5.json',
    'tulu-v2-openhermes2_5.json',
    'wizard_vicuna_Persona.json',
]
combine_files(filenames, output_file='combined.json')
- CapybaraPure-oasst2t1-Agent_no_robots-bagel-clean-v0.5-tulu-v2-openhermes2_5-wizard_vicuna_Persona.json
- import json
def verify_json_structure(filename):
    """Report every record in a JSON file that lacks a required key.

    Loads ``filename`` as JSON and, for each element (numbered from 1),
    prints the required keys that are absent from it. If the file is not
    valid JSON, prints the file name and the decode error and returns
    without checking anything.

    Args:
        filename: Path of the JSON file to inspect.
    """
    expected = ('instruction', 'input', 'output', 'system', 'history')
    try:
        with open(filename, 'r', encoding='utf-8-sig') as handle:
            records = json.load(handle)
    except json.JSONDecodeError as e:
        print(f"Invalid JSON file: {filename}")
        print(f"Error: {e}")
        return

    for position, record in enumerate(records, 1):
        absent = [name for name in expected if name not in record]
        if not absent:
            continue
        print(f"Missing keys in item {position}: {absent}")
# Check combined.json for records with missing keys.
verify_json_structure(filename='combined.json')
Advertisement
Add Comment
Please, Sign In to add comment