Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- from collections import defaultdict
def deduplicate(file_path):
    """Merge records that share an 'output' value and rewrite the file in place.

    The file at *file_path* is read as UTF-8 JSON and iterated as a sequence
    of mappings; each one is accessed by its 'instruction', 'input' and
    'output' keys.  Records are grouped by their 'output' value:

    * A group with a single record is kept unchanged (including any extra
      keys that record carries).
    * A group with several records is collapsed into one new record holding
      only 'instruction', 'input' and 'output'.  The instruction is the
      longest one in the group (the first one wins on ties) and the input is
      the distinct non-empty inputs joined with ', ' in first-seen order.

    Groups appear in the result in the order their 'output' value was first
    encountered.  The result is written back to *file_path*, replacing the
    original content.

    Args:
        file_path: Path of the JSON file to de-duplicate in place.

    Raises:
        KeyError: If a record lacks one of the keys that is read.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Group the records by their 'output' value; dict ordering keeps the
    # groups in first-seen order.
    output_to_items = defaultdict(list)
    for item in data:
        output_to_items[item['output']].append(item)

    deduplicated_data = []
    for output, items in output_to_items.items():
        if len(items) == 1:
            # No duplicates for this output: keep the record untouched.
            deduplicated_data.append(items[0])
            continue

        # dict.fromkeys de-duplicates while preserving first-seen order, so
        # the combined string is identical on every run (a set would order
        # strings by their randomized hashes).  Empty inputs are skipped so
        # they do not produce dangling ', ' separators.
        unique_inputs = dict.fromkeys(
            item['input'] for item in items if item['input']
        )
        combined_input = ', '.join(unique_inputs)
        longest_instruction = max(
            items, key=lambda item: len(item['instruction'])
        )['instruction']
        deduplicated_data.append({
            'instruction': longest_instruction,
            'input': combined_input,
            'output': output,
        })

    # Overwrite the source file with the de-duplicated records.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(deduplicated_data, f, ensure_ascii=False, indent=4)
# Path of the JSON dataset that is de-duplicated when this file is run.
file_path = r"C:\SD\updated_updated_combined.json"

# Only rewrite the file when executed as a script, so that importing this
# module (e.g. to reuse deduplicate) does not modify anything on disk.
if __name__ == "__main__":
    deduplicate(file_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement