# Dedup_SFT-Nectar.py

import json

def _freeze(value):
    """Return a hashable stand-in for a decoded JSON value.

    Lists become tuples and dicts become frozensets of their items
    (recursively) so the result can be used inside dictionary keys. Strings,
    numbers, booleans and None are returned unchanged.
    """
    if isinstance(value, list):
        return tuple(_freeze(element) for element in value)
    if isinstance(value, dict):
        return frozenset((key, _freeze(element)) for key, element in value.items())
    return value


def _dedupe_records(records):
    """Return the de-duplicated records, in the order they end up being kept.

    Rules, applied to each record in input order:

    1. A record whose instruction, input and output all match a kept record
       is dropped.
    2. If a kept record has the same instruction and output but a different
       input, the new record is dropped -- unless the new input is non-empty
       and the kept one is empty, in which case the new record replaces the
       kept one (and moves to the end of the result).
    3. Otherwise, if a kept record has the same instruction and input and an
       output with fewer whitespace-separated words, the oldest such record
       is replaced by the new one (which is appended at the end).
    4. Otherwise the record is appended.

    A missing 'instruction', 'input' or 'output' field is treated as ''.

    NOTE(review): rule 3 only evicts a *shorter* kept output. A new record
    whose output is shorter than (or as long as) the kept one is still
    appended, so several outputs for one instruction/input pair can survive.
    Confirm whether that is intended.
    """
    # (instruction, output) -> (frozen input, record). Rules 1, 2 and 4
    # guarantee at most one kept record per key, and dicts preserve insertion
    # order (Python 3.7+), so the values are the result list in order.
    kept = {}
    # (instruction, input) -> {frozen output: raw output} for the kept records
    # sharing that prompt, oldest first.
    by_prompt = {}

    for record in records:
        raw_input = record.get('input', '')
        raw_output = record.get('output', '')
        instruction = _freeze(record.get('instruction', ''))
        input_key = _freeze(raw_input)
        output_key = _freeze(raw_output)
        io_key = (instruction, output_key)
        prompt_key = (instruction, input_key)

        if io_key in kept:
            old_input_key, old_record = kept[io_key]
            differs = old_input_key != input_key
            if differs and raw_input and not old_record.get('input', ''):
                # Prefer the variant that carries an input. Delete before
                # re-inserting so the record moves to the end of the result.
                del kept[io_key]
                del by_prompt[(instruction, old_input_key)][output_key]
                kept[io_key] = (input_key, record)
                by_prompt.setdefault(prompt_key, {})[output_key] = raw_output
            # Exact duplicate, or a same instruction/output variant we do not
            # prefer: either way the record is not appended.
            continue

        siblings = by_prompt.setdefault(prompt_key, {})
        if siblings:
            new_length = len(raw_output.split())
            for sibling_key, sibling_output in siblings.items():
                if len(sibling_output.split()) < new_length:
                    # Evict the oldest shorter answer. Leaving the loop right
                    # after the deletion keeps the dict iteration valid.
                    del siblings[sibling_key]
                    del kept[(instruction, sibling_key)]
                    break

        kept[io_key] = (input_key, record)
        siblings[output_key] = raw_output

    return [record for _, record in kept.values()]


def remove_duplicates(input_filename, output_filename):
    """De-duplicate a JSON list of instruction records and write the result.

    Args:
        input_filename: Path of a UTF-8 JSON file holding a list of objects
            with 'instruction', 'input' and 'output' fields.
        output_filename: Path the de-duplicated list is written to as UTF-8
            JSON (non-ASCII characters kept as-is, 4-space indentation).

    The kept records are written unmodified; see ``_dedupe_records`` for the
    exact rules deciding which records are kept and in which order.
    """
    with open(input_filename, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)

    unique_data = _dedupe_records(data)

    with open(output_filename, 'w', encoding='utf-8') as out_file:
        json.dump(unique_data, out_file, ensure_ascii=False, indent=4)

if __name__ == '__main__':
    # Run the de-duplication only when this file is executed as a script, so
    # importing it does not read or write any files as a side effect.
    remove_duplicates('SFT-Nectar.json', 'Unique_SFT-Nectar.json')