remove_Duplicated_Dataset_CoT_Alcapa.py

Apr 20th, 2024 · Python
import os
import ijson
import json
from tqdm import tqdm

def merge_dataset(directory, merged_file):
    # Collect every .json file in the directory and its subdirectories
    json_files = [os.path.join(root, dirs_file)
                  for root, dirs, files in os.walk(directory)
                  for dirs_file in files if dirs_file.endswith('.json')]
    with open(merged_file, 'w', encoding='utf-8') as out_file:
        for file_path in tqdm(json_files, desc="Merging files"):
            try:
                with open(file_path, 'rb') as in_file:
                    # Stream the top-level JSON array with ijson instead of loading it whole
                    objects = ijson.items(in_file, 'item')
                    for entry in objects:
                        # Keep only the "instruction", "input", and "output" fields, if present
                        new_entry = {
                            field: entry[field]
                            for field in ["instruction", "input", "output"]
                            if field in entry
                        }
                        new_entry_str = json.dumps(new_entry, ensure_ascii=False)
                        # Write one JSON object per line (JSON Lines)
                        out_file.write(new_entry_str + '\n')
            except (ijson.common.IncompleteJSONError, UnicodeDecodeError):
                print(f"Invalid JSON file or encoding issue: {file_path}")

merge_dataset('AlcapCOTplus', 'Merged.json')
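# The merged file is JSON Lines: one compact JSON object per line. That is what
# lets both deduplicators below compare entries line-by-line. A minimal loading
# sketch (my addition, not part of the original paste; assumes 'Merged.json'
# exists and every line is valid JSON):
def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

# entries = load_jsonl('Merged.json')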
# A. Remove duplicates by string comparison:
from tqdm import tqdm

def deduplicate_file(input_file, output_file):
    unique_entries = set()
    # Count the lines once so tqdm can show overall progress
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for line in f)
    with open(input_file, 'r', encoding='utf-8') as in_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            progress_bar = tqdm(total=total_lines, desc="Deduplicating")
            for line in in_file:
                # Write each line only the first time it is seen
                if line not in unique_entries:
                    unique_entries.add(line)
                    out_file.write(line)
                progress_bar.update(1)
    progress_bar.close()

# deduplicate_file(r"C:\path\to\Merged.json", 'Deduplicated.json')
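# Raw string comparison only catches byte-identical lines. A variant sketch
# (my addition, not in the original): keying on a canonical JSON dump also
# catches duplicates whose fields were serialized in a different order.
import json

def deduplicate_file_canonical(input_file, output_file):  # hypothetical name
    seen = set()
    with open(input_file, 'r', encoding='utf-8') as in_file, \
         open(output_file, 'w', encoding='utf-8') as out_file:
        for line in in_file:
            # sort_keys makes field order irrelevant to the comparison
            key = json.dumps(json.loads(line), sort_keys=True, ensure_ascii=False)
            if key not in seen:
                seen.add(key)
                out_file.write(line)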
# B. Remove duplicates via hashlib (low RAM).
# Note: this redefines deduplicate_file from section A; use one or the other.
import hashlib
import collections
from tqdm import tqdm

def deduplicate_file(input_file, output_file):
    # Bucket the 32-byte SHA-256 digests by their first 2 bytes; only digests
    # are kept in memory, never the full lines (sets give O(1) membership checks)
    d = collections.defaultdict(set)
    # Count the lines once so tqdm can show overall progress
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for line in f)
    with open(input_file, 'r', encoding='utf-8') as in_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            progress_bar = tqdm(total=total_lines, desc="Processing")
            for line in in_file:
                # Hash the line's bytes
                digest = hashlib.sha256(line.encode('utf-8')).digest()
                # Use the first 2 bytes of the hash as the bucket key...
                k = digest[:2]
                # ...and the remaining 30 bytes as the value
                v = digest[2:]
                # If the value is not in the bucket, the line is unique
                if v not in d[k]:
                    # Write the unique line to the output file
                    out_file.write(line)
                    # Remember its digest
                    d[k].add(v)
                progress_bar.update(1)
    progress_bar.close()

# deduplicate_file(r"C:\path\to\Merged.json", 'Deduplicated.json')
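# For comparison, a flat-set variant (my sketch, not part of the original):
# keeping whole digests in a single set is simpler, still avoids holding full
# lines in RAM, and uses about the same memory as the bucketed version.
def deduplicate_file_digest_set(input_file, output_file):  # hypothetical name
    seen = set()
    with open(input_file, 'r', encoding='utf-8') as in_file, \
         open(output_file, 'w', encoding='utf-8') as out_file:
        for line in in_file:
            digest = hashlib.sha256(line.encode('utf-8')).digest()
            if digest not in seen:
                seen.add(digest)
                out_file.write(line)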