Guest User

Dedup_SFT-Nectar.py

a guest
Apr 19th, 2024
66
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.83 KB | None | 0 0
  1. import json
  2.  
  3. def remove_duplicates(input_filename, output_filename):
  4.     with open(input_filename, 'r', encoding='utf-8') as in_file:
  5.         data = json.load(in_file)
  6.  
  7.     unique_data = []
  8.  
  9.     for item in data:
  10.         # Check if there's an identical item in unique_data
  11.         identical_items = [d for d in unique_data if d['instruction'] == item['instruction'] and d['input'] == item['input'] and d['output'] == item['output']]
  12.         if identical_items:
  13.             continue
  14.  
  15.         # Check if there's an item with the same instruction and output but different input
  16.         same_io_items = [d for d in unique_data if d['instruction'] == item['instruction'] and d['output'] == item['output'] and d['input'] != item['input']]
  17.         if same_io_items:
  18.             # If the current item has a non-empty input and the existing item has an empty input, replace it
  19.             if item['input'] and not same_io_items[0]['input']:
  20.                 unique_data.remove(same_io_items[0])
  21.                 unique_data.append(item)
  22.             continue
  23.  
  24.         # Check if there's an item with the same instruction and input but shorter output
  25.         same_ii_items = [d for d in unique_data if d['instruction'] == item['instruction'] and d['input'] == item['input'] and len(d['output'].split()) < len(item['output'].split())]
  26.         if same_ii_items:
  27.             # If the current item has a longer output, replace it
  28.             unique_data.remove(same_ii_items[0])
  29.             unique_data.append(item)
  30.             continue
  31.  
  32.         # If none of the above conditions are met, add the item to unique_data
  33.         unique_data.append(item)
  34.  
  35.     with open(output_filename, 'w', encoding='utf-8') as out_file:
  36.         json.dump(unique_data, out_file, ensure_ascii=False, indent=4)
  37.  
  38. remove_duplicates('SFT-Nectar.json', 'Unique_SFT-Nectar.json')
  39.  
Advertisement
Add Comment
Please, Sign In to add comment