Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
def remove_duplicates(input_filename, output_filename):
    """Filter near-duplicate records out of a JSON dataset and save the result.

    The input file must hold a JSON array of objects, each carrying the keys
    ``'instruction'``, ``'input'`` and ``'output'``. Records are visited in
    file order and compared against the records kept so far:

    1. A record whose three fields all equal those of a kept record is
       dropped.
    2. Otherwise, if some kept record shares its instruction and output
       (and therefore differs in input), only the earliest such record is
       consulted: the new record replaces it when the new input is non-empty
       and the kept input is empty; in every other case the new record is
       dropped.
    3. Otherwise, if a kept record shares its instruction and input but its
       output has fewer whitespace-separated words, the new record replaces
       the earliest such record.
    4. Otherwise the record is kept.

    A replacement removes the old record and appends the new one, so the
    survivor moves to the end of the result.

    Args:
        input_filename: Path of the UTF-8 encoded JSON file to read.
        output_filename: Path of the UTF-8 encoded JSON file to write. The
            result is indented by four spaces and non-ASCII characters are
            written unescaped.

    Raises:
        KeyError: May be raised when a record taking part in a comparison
            lacks one of the three keys.
    """
    with open(input_filename, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)

    unique_data = []
    for item in data:
        # Rule 1: an exact duplicate is already kept -> drop the new record.
        # any() stops at the first hit instead of collecting every match.
        if any(
            kept['instruction'] == item['instruction']
            and kept['input'] == item['input']
            and kept['output'] == item['output']
            for kept in unique_data
        ):
            continue

        # Rule 2: same instruction and output, different input. Only the
        # earliest kept match is considered.
        same_answer = next(
            (
                kept
                for kept in unique_data
                if kept['instruction'] == item['instruction']
                and kept['output'] == item['output']
                and kept['input'] != item['input']
            ),
            None,
        )
        if same_answer is not None:
            # Prefer the record that actually carries an input.
            if item['input'] and not same_answer['input']:
                unique_data.remove(same_answer)
                unique_data.append(item)
            # NOTE(review): the record is skipped here even when no swap
            # happened; confirm that dropping it is the intended outcome.
            continue

        # Rule 3: same instruction and input, and the kept output is shorter
        # (by word count) -> the longer answer wins.
        shorter_answer = next(
            (
                kept
                for kept in unique_data
                if kept['instruction'] == item['instruction']
                and kept['input'] == item['input']
                and len(kept['output'].split()) < len(item['output'].split())
            ),
            None,
        )
        if shorter_answer is not None:
            unique_data.remove(shorter_answer)
            unique_data.append(item)
            continue

        # Rule 4: nothing matched, so this record is new.
        unique_data.append(item)

    with open(output_filename, 'w', encoding='utf-8') as out_file:
        json.dump(unique_data, out_file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Run the de-duplication only when executed as a script, so importing
    # this module does not touch the dataset files.
    remove_duplicates('SFT-Nectar.json', 'Unique_SFT-Nectar.json')
Advertisement
Add Comment
Please, Sign In to add comment