Advertisement
Guest User

dedup_promptSD.py

a guest
Dec 5th, 2023
25
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.24 KB | Source Code | 0 0
  1. import json
  2. from collections import defaultdict
  3.  
  def deduplicate(file_path):
      """Merge records that share an 'output' value and rewrite the file in place.

      The file at *file_path* must hold a JSON array of objects keyed by
      'instruction', 'input' and 'output'.  Records are grouped by their
      'output' value.  A group of one record is kept unchanged.  A larger
      group collapses into a single record whose fields are:

      * 'instruction' -- the longest instruction in the group (the earliest
        record wins a tie),
      * 'input' -- the group's distinct, non-empty inputs joined with ', '
        in the order they first appear,
      * 'output' -- the shared output value.

      Groups are written in the order their output first appears in the file,
      so running the function twice on the same data yields identical files.

      Args:
          file_path: Path of the JSON file to read and then overwrite.

      Raises:
          OSError: If the file cannot be opened for reading or writing.
          json.JSONDecodeError: If the file does not contain valid JSON.
          KeyError: If a record lacks 'output', or a duplicated record lacks
              'instruction' or 'input'.
      """
      with open(file_path, 'r', encoding='utf-8') as f:
          data = json.load(f)

      # Group records by their 'output' value; dict insertion order keeps the
      # groups in first-appearance order.
      output_to_items = defaultdict(list)
      for item in data:
          output_to_items[item['output']].append(item)

      deduplicated_data = []
      for output, items in output_to_items.items():
          if len(items) == 1:
              # No duplicates: keep the record exactly as it was loaded.
              deduplicated_data.append(items[0])
              continue

          # dict.fromkeys removes repeats while preserving first-seen order.
          # A set would order the inputs by string hash, which is randomized
          # per process and made the output differ from run to run.
          unique_inputs = dict.fromkeys(item['input'] for item in items)
          # Skip empty inputs so the result never starts or ends with a stray
          # ', ' separator.
          combined_input = ', '.join(text for text in unique_inputs if text)
          longest_instruction = max(
              items, key=lambda item: len(item['instruction'])
          )['instruction']
          deduplicated_data.append({
              'instruction': longest_instruction,
              'input': combined_input,
              'output': output,
          })

      # Write the deduplicated data back to the same file.
      with open(file_path, 'w', encoding='utf-8') as f:
          json.dump(deduplicated_data, f, ensure_ascii=False, indent=4)
  28.  
  # Default dataset location; pass a different path as the first command-line
  # argument to override it.
  file_path = r"C:\SD\updated_updated_combined.json"

  # Guarded so that importing this module does not rewrite the file on disk.
  if __name__ == "__main__":
      import sys

      deduplicate(sys.argv[1] if len(sys.argv) > 1 else file_path)
  31.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement