import pandas as pd
import json
import numpy as np
import hashlib

def convert_parquet_to_json(input_file_path, output_file_path):
    df = pd.read_parquet(input_file_path)
    print(df.head())
    # Convert to a list of row dicts before serializing
    data_dict = df.to_dict(orient='records')
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=4)
    except TypeError:
        # Convert any numpy arrays to lists so they are JSON-serializable, then retry
        for record in data_dict:
            for key, value in record.items():
                if isinstance(value, np.ndarray):
                    record[key] = value.tolist()
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=4)
    # Print a SHA1 checksum of the written file for later verification
    with open(output_file_path, 'rb') as f:
        file_bytes = f.read()
        readable_hash = hashlib.sha1(file_bytes).hexdigest()
        print(f'SHA1: {readable_hash}')

convert_parquet_to_json('input-00000-of-00001.parquet', 'output.json')
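# Optional sanity check (a minimal sketch, assuming the 'output.json' written above):
# reload the file and report how many records it holds.
with open('output.json', 'r', encoding='utf-8') as f:
    records = json.load(f)
print(f'output.json contains {len(records)} records')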
# DPO:
# combine alcapa-input_dpo
import json

def add_history_field(data):
    # Ensure every record has a 'history' list, which the training format expects
    for item in data:
        if 'history' not in item:
            item['history'] = []
    return data

def combine_json_files(file_paths):
    combined_data = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        data = add_history_field(data)
        combined_data.extend(data)
    return combined_data

def write_to_json_file(data, output_file_path):
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    json_files = [
        'distilabel-capybara-dpo-7k.json',
        'Neural-DPO.json',
        'Roleplay.json',
        'Snorkel-Mistral-PairRM-DPO-Dataset.json',
        'jondurbin_gutenberg-truthy-dpo.json',
        'bagel-dpo-v0.5.json'
    ]
    combined_data = combine_json_files(json_files)
    write_to_json_file(combined_data, 'output.json')

if __name__ == "__main__":
    main()
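# The filtering scripts below index item["output"][0] and item["output"][1], so the
# combined file is assumed to hold Alpaca-style DPO records where "output" is a
# [chosen, rejected] pair. A sketch of that assumed layout (placeholder values only):
example_record = {
    "instruction": "<prompt text>",
    "input": "",                 # optional extra context
    "output": [
        "<chosen answer>",       # output[0]: preferred response
        "<rejected answer>",     # output[1]: dispreferred response
    ],
    "history": [],               # added by add_history_field above
}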
# Skip the item if the chosen and rejected answers are the same
import json

def remove_duplicates(input_filename, output_filename):
    with open(input_filename, 'r', encoding='utf-8') as input_file:
        input_data = json.load(input_file)
    output_data = []
    seen = set()
    for item in input_data:
        # Drop pairs where chosen and rejected are identical (no preference signal)
        if item["output"][0] == item["output"][1]:
            continue
        # Use a canonical JSON string of the record to detect exact duplicates
        item_str = json.dumps(item, sort_keys=True)
        if item_str not in seen:
            output_data.append(item)
            seen.add(item_str)
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

remove_duplicates('alcapa-input_dpo.json', 'output.json')
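# Why json.dumps(item, sort_keys=True) works as a dedup key: two records with the
# same content but different key order serialize to the same string (hypothetical
# records, shown only to illustrate the canonicalization):
a = {"instruction": "q", "output": ["x", "y"]}
b = {"output": ["x", "y"], "instruction": "q"}
print(json.dumps(a, sort_keys=True) == json.dumps(b, sort_keys=True))  # True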
# Skip the item if the first (chosen) answer, output[0], contains any unwanted phrases
import json
import re

def remove_unwanted_output_objects(input_filename, output_filename, unwanted_words):
    # Pre-compile one case-insensitive literal pattern per phrase
    refusals = [re.compile(re.escape(word), re.IGNORECASE) for word in unwanted_words]

    def contains_unwanted_words(text):
        return any(refusal.search(text) for refusal in refusals)

    with open(input_filename, 'r', encoding='utf-8') as input_file:
        input_data = json.load(input_file)
    output_data = []
    for item in input_data:
        if contains_unwanted_words(item["output"][0]):
            continue
        output_data.append(item)
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

unwanted_words = [
    "text-based AI language model",
    # ... (remaining phrases elided in the original paste) ...
    "do not have the necessary information",
]
remove_unwanted_output_objects('alcapa-input_dpo.json', 'output.json', unwanted_words)
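# Minimal check of the matching behaviour (hypothetical strings, not dataset content):
# re.escape treats each phrase as a literal, and IGNORECASE makes the match
# case-insensitive, so capitalization differences still count as a hit.
_demo_patterns = [re.compile(re.escape("do not have the necessary information"), re.IGNORECASE)]
print(any(p.search("I Do Not Have The Necessary Information for that.") for p in _demo_patterns))  # True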
# sft:
# webglm-qa
import json

def convert_format(input_data):
    # Map WebGLM-QA fields (question, references, answer) to Alpaca-style records
    output_data = []
    for item in input_data:
        new_item = {}
        new_item["instruction"] = item["question"]
        new_item["input"] = "references:\n" + "\n".join(
            [f"{i+1}. {ref}" for i, ref in enumerate(item["references"])]
        )
        new_item["output"] = item["answer"]
        output_data.append(new_item)
    return output_data

def convert_jsonl_file(input_filename, output_filename):
    output_data = []
    with open(input_filename, 'r', encoding='utf-8-sig') as input_file:
        for line in input_file:
            try:
                item = json.loads(line)
                output_data.extend(convert_format([item]))
            except json.JSONDecodeError:
                print(f"Skipping line due to JSONDecodeError: {line}")
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

convert_jsonl_file('webglm-qa.jsonl', 'alcapa-output.json')
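# For reference, a hypothetical input line and the record convert_format above would
# produce from it (field values are placeholders for illustration, not real data):
_source_item = {
    "question": "<question text>",
    "references": ["<reference one>", "<reference two>"],
    "answer": "<answer text>",
}
_converted = convert_format([_source_item])[0]
# _converted == {
#     "instruction": "<question text>",
#     "input": "references:\n1. <reference one>\n2. <reference two>",
#     "output": "<answer text>",
# }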
# convert dolly
import json

def convert_format(input_data):
    # Map Dolly fields (instruction, context, response) to Alpaca-style records
    output_data = []
    for item in input_data:
        new_item = {}
        new_item["instruction"] = item["instruction"]
        new_item["input"] = item["context"]
        new_item["output"] = item["response"]
        output_data.append(new_item)
    return output_data

def convert_jsonl_file(input_filename, output_filename):
    output_data = []
    with open(input_filename, 'r', encoding='utf-8-sig') as input_file:
        for line in input_file:
            if not line.strip():  # Skip empty lines
                continue
            try:
                item = json.loads(line)
                output_data.extend(convert_format([item]))
            except json.JSONDecodeError:
                print(f"Skipping line due to JSONDecodeError: {line}")
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

convert_jsonl_file('databricks-dolly-15k.jsonl', 'alcapa-output.json')
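# Both converters above write to the same 'alcapa-output.json', so the second run
# overwrites the first. A minimal merge sketch, assuming the two outputs are first
# saved under distinct (hypothetical) names, mirroring combine_json_files above:
def merge_sft_files(file_paths, output_file_path):
    merged = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            merged.extend(json.load(f))
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

merge_sft_files(['webglm-qa-alpaca.json', 'dolly-alpaca.json'], 'sft-combined.json')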