import json
import os

import ijson
from tqdm import tqdm


def merge_dataset(directory, merged_file):
    # Collect all .json files in the directory and its subdirectories
    json_files = [os.path.join(root, file)
                  for root, dirs, files in os.walk(directory)
                  for file in files if file.endswith('.json')]
    with open(merged_file, 'w', encoding='utf-8') as out_file:
        for file_path in tqdm(json_files, desc="Merging files"):
            try:
                with open(file_path, 'rb') as in_file:
                    # Stream each top-level array item with ijson instead of loading the whole file
                    objects = ijson.items(in_file, 'item')
                    for entry in objects:
                        # Keep only the "instruction", "input", and "output" fields, if they exist
                        new_entry = {
                            field: entry[field]
                            for field in ["instruction", "input", "output"]
                            if field in entry
                        }
                        # Write one JSON object per line (JSON Lines format)
                        new_entry_str = json.dumps(new_entry, ensure_ascii=False)
                        out_file.write(new_entry_str + '\n')
            except (ijson.common.IncompleteJSONError, UnicodeDecodeError):
                print(f"Invalid JSON file or encoding issue: {file_path}")


merge_dataset('AlcapCOTplus', 'Merged.json')
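
# Optional sanity check (a minimal sketch, not part of the original paste):
# the merge step writes one JSON object per line (JSON Lines), so the output
# can be re-parsed line by line with the standard json module. The function
# name count_entries is a hypothetical helper.
import json

def count_entries(merged_file):
    # Count parseable entries and report any malformed line
    count = 0
    with open(merged_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            try:
                json.loads(line)
                count += 1
            except json.JSONDecodeError:
                print(f"Malformed line {line_no} in {merged_file}")
    return count

# print(count_entries('Merged.json'))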
# A. Remove duplicates by string comparison:
from tqdm import tqdm


def deduplicate_file(input_file, output_file):
    unique_entries = set()
    # Count the lines first so the progress bar has a total
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    with open(input_file, 'r', encoding='utf-8') as in_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            progress_bar = tqdm(total=total_lines, desc="Deduplicating")
            for line in in_file:
                # Write each line only the first time it is seen
                if line not in unique_entries:
                    unique_entries.add(line)
                    out_file.write(line)
                progress_bar.update(1)
            progress_bar.close()


# deduplicate_file(r"C:\path\to\Merged.json", 'Deduplicated.json')
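
# Note: variant A keeps every unique line in memory, so RAM grows with total
# text size; variant B below stores a fixed 32-byte hash per line instead.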
# B. Remove duplicates by hashlib (low RAM):
import collections
import hashlib

from tqdm import tqdm


def deduplicate_file(input_file, output_file):
    # Bucket digests by their first two bytes to keep each bucket's list short
    d = collections.defaultdict(list)
    # Count the lines first so the progress bar has a total
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    with open(input_file, 'r', encoding='utf-8') as in_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            progress_bar = tqdm(total=total_lines, desc="Processing")
            for line in in_file:
                # Hash the line instead of storing it, so memory per entry is fixed
                digest = hashlib.sha256(line.encode('utf-8')).digest()
                # Use the first 2 bytes of the hash as the bucket key
                k = digest[:2]
                # Use the remaining 30 bytes as the value
                v = digest[2:]
                # If the value is not yet in its bucket, the line is unique
                if v not in d[k]:
                    # Write the unique line to the output file
                    out_file.write(line)
                    d[k].append(v)
                progress_bar.update(1)
            progress_bar.close()


# deduplicate_file(r"C:\path\to\Merged.json", 'Deduplicated.json')
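
# Both variants compare raw lines, so two JSON objects with the same fields in
# a different key order count as distinct entries. A minimal sketch of a
# stricter check (an extension, not from the original paste): re-serialize
# each object with sorted keys before hashing, and use this digest in place
# of the sha256 call in variant B. canonical_digest is a hypothetical helper.
import hashlib
import json

def canonical_digest(line):
    # Key order no longer affects the hash after re-serializing with sort_keys
    obj = json.loads(line)
    canonical = json.dumps(obj, ensure_ascii=False, sort_keys=True)
    return hashlib.sha256(canonical.encode('utf-8')).digest()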