import json
import os

import ijson
from tqdm import tqdm


def merge_dataset(directory, merged_file):
    # Collect all .json files in the directory and its subdirectories
    json_files = [os.path.join(root, file)
                  for root, dirs, files in os.walk(directory)
                  for file in files if file.endswith('.json')]
    with open(merged_file, 'w', encoding='utf-8') as out_file:
        for file_path in tqdm(json_files, desc="Merging files"):
            try:
                with open(file_path, 'rb') as in_file:
                    # Stream each top-level array item with ijson instead of loading the whole file
                    objects = ijson.items(in_file, 'item')
                    for entry in objects:
                        # Keep only the "instruction", "input", and "output" fields, if they exist
                        new_entry = {
                            field: entry[field]
                            for field in ["instruction", "input", "output"]
                            if field in entry
                        }
                        # Write one JSON object per line (JSON Lines format)
                        new_entry_str = json.dumps(new_entry, ensure_ascii=False)
                        out_file.write(new_entry_str + '\n')
            except (ijson.common.IncompleteJSONError, UnicodeDecodeError):
                print(f"Invalid JSON file or encoding issue: {file_path}")


merge_dataset('AlcapCOTplus', 'Merged.json')
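
# Optional sanity check (a minimal sketch, not part of the original paste):
# the merge step writes one JSON object per line (JSON Lines), so the output
# can be re-parsed line by line with the standard json module. The function
# name count_entries is a hypothetical helper.
import json

def count_entries(merged_file):
    # Count parseable entries and report any malformed line
    count = 0
    with open(merged_file, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            try:
                json.loads(line)
                count += 1
            except json.JSONDecodeError:
                print(f"Malformed line {line_no} in {merged_file}")
    return count

# print(count_entries('Merged.json'))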
# A. Remove duplicates by string comparison:
from tqdm import tqdm


def deduplicate_file(input_file, output_file):
    unique_entries = set()
    # Count the lines first so the progress bar has a total
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    with open(input_file, 'r', encoding='utf-8') as in_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            progress_bar = tqdm(total=total_lines, desc="Deduplicating")
            for line in in_file:
                # Write each line only the first time it is seen
                if line not in unique_entries:
                    unique_entries.add(line)
                    out_file.write(line)
                progress_bar.update(1)
            progress_bar.close()


# deduplicate_file(r"C:\path\to\Merged.json", 'Deduplicated.json')
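
# Note: variant A keeps every unique line in memory, so RAM grows with total
# text size; variant B below stores a fixed 32-byte hash per line instead.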
# B. Remove duplicates by hashlib (low RAM):
import collections
import hashlib

from tqdm import tqdm


def deduplicate_file(input_file, output_file):
    # Bucket digests by their first two bytes to keep each bucket's list short
    d = collections.defaultdict(list)
    # Count the lines first so the progress bar has a total
    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
    with open(input_file, 'r', encoding='utf-8') as in_file:
        with open(output_file, 'w', encoding='utf-8') as out_file:
            progress_bar = tqdm(total=total_lines, desc="Processing")
            for line in in_file:
                # Hash the line instead of storing it, so memory per entry is fixed
                digest = hashlib.sha256(line.encode('utf-8')).digest()
                # Use the first 2 bytes of the hash as the bucket key
                k = digest[:2]
                # Use the remaining 30 bytes as the value
                v = digest[2:]
                # If the value is not yet in its bucket, the line is unique
                if v not in d[k]:
                    # Write the unique line to the output file
                    out_file.write(line)
                    d[k].append(v)
                progress_bar.update(1)
            progress_bar.close()


# deduplicate_file(r"C:\path\to\Merged.json", 'Deduplicated.json')
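
# Both variants compare raw lines, so two JSON objects with the same fields in
# a different key order count as distinct entries. A minimal sketch of a
# stricter check (an extension, not from the original paste): re-serialize
# each object with sorted keys before hashing, and use this digest in place
# of the sha256 call in variant B. canonical_digest is a hypothetical helper.
import hashlib
import json

def canonical_digest(line):
    # Key order no longer affects the hash after re-serializing with sort_keys
    obj = json.loads(line)
    canonical = json.dumps(obj, ensure_ascii=False, sort_keys=True)
    return hashlib.sha256(canonical.encode('utf-8')).digest()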