# parquet2json_dpo2alcapa-DPO.py

import pandas as pd
import json
import numpy as np
import hashlib

def convert_parquet_to_json(input_file_path, output_file_path):
    df = pd.read_parquet(input_file_path)
    print(df.head())

    data_dict = df.to_dict('records')
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=4)
    except TypeError:
        # json can't serialize numpy arrays; convert them to lists and retry
        for record in data_dict:
            for key, value in record.items():
                if isinstance(value, np.ndarray):
                    record[key] = value.tolist()
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(data_dict, f, ensure_ascii=False, indent=4)

    # Print a checksum of the written file so runs can be compared
    with open(output_file_path, 'rb') as f:
        contents = f.read()
        readable_hash = hashlib.sha1(contents).hexdigest()
        print(f'SHA1: {readable_hash}')

convert_parquet_to_json('input-00000-of-00001.parquet', 'output.json')
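
# To verify the dump beyond the SHA1, a minimal sanity check (a sketch,
# assuming the same filenames used above): reload the JSON and compare
# record counts with the source DataFrame.
src_df = pd.read_parquet('input-00000-of-00001.parquet')
with open('output.json', 'r', encoding='utf-8') as f:
    records = json.load(f)
assert len(records) == len(src_df), f'{len(records)} records vs {len(src_df)} rows'
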
# DPO:
# combine alcapa-input_dpo
import json

def add_history_field(data):
    # Ensure every record carries a 'history' key, since the Alpaca-style
    # DPO format expects one even when there is no prior conversation
    for item in data:
        if 'history' not in item:
            item['history'] = []
    return data

def combine_json_files(file_paths):
    combined_data = []
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            data = add_history_field(data)
            combined_data.extend(data)
    return combined_data

def write_to_json_file(data, output_file_path):
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main():
    json_files = [
        'distilabel-capybara-dpo-7k.json',
        'Neural-DPO.json',
        'Roleplay.json',
        'Snorkel-Mistral-PairRM-DPO-Dataset.json',
        'jondurbin_gutenberg-truthy-dpo.json',
        'bagel-dpo-v0.5.json'
    ]
    combined_data = combine_json_files(json_files)
    write_to_json_file(combined_data, 'output.json')

if __name__ == "__main__":
    main()
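
# The file list could also be collected with glob instead of hard-coded;
# a sketch, assuming every *.json in the working directory is a dataset
# to merge (and that output.json is not already present, or it would be
# merged into itself). sorted() keeps the merge order deterministic.
import glob

json_files = sorted(glob.glob('*.json'))
combined_data = combine_json_files(json_files)
write_to_json_file(combined_data, 'output.json')
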
# Skip the item if the chosen and rejected answers are the same
import json

def remove_duplicates(input_filename, output_filename):
    with open(input_filename, 'r', encoding='utf-8') as input_file:
        input_data = json.load(input_file)

    output_data = []
    seen = set()
    for item in input_data:
        # A pair whose chosen and rejected answers are identical carries
        # no preference signal, so drop it
        if item["output"][0] == item["output"][1]:
            continue
        # Serialize with sorted keys so equal records dedupe reliably
        item_str = json.dumps(item, sort_keys=True)
        if item_str not in seen:
            output_data.append(item)
            seen.add(item_str)
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

remove_duplicates('alcapa-input_dpo.json', 'output.json')
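
# For a large dataset the 'seen' set above holds every serialized record;
# a leaner variant (a sketch, not in the original) keeps only a SHA1 of
# the canonical JSON, trading exactness for a negligible collision risk.
import hashlib

def record_key(item):
    blob = json.dumps(item, sort_keys=True).encode('utf-8')
    return hashlib.sha1(blob).hexdigest()
# Use record_key(item) in place of item_str in both places above.
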
# Skip the item if the chosen answer (output[0] of the [chosen, rejected]
# pair) contains any unwanted refusal phrases
import json
import re

def remove_unwanted_output_objects(input_filename, output_filename, unwanted_words):
    refusals = [re.compile(re.escape(word), re.IGNORECASE) for word in unwanted_words]

    def contains_unwanted_words(text):
        return any(refusal.search(text) for refusal in refusals)

    with open(input_filename, 'r', encoding='utf-8') as input_file:
        input_data = json.load(input_file)

    output_data = []
    for item in input_data:
        if contains_unwanted_words(item["output"][0]):
            continue
        output_data.append(item)

    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

unwanted_words = [
    "text-based AI language model",
    # ... more refusal phrases elided here
    "do not have the necessary information",
]
remove_unwanted_output_objects('alcapa-input_dpo.json', 'output.json', unwanted_words)
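
# The filter above scans each text once per phrase; an equivalent sketch
# compiles the phrases into a single alternation so each text is scanned
# once (unwanted_words is the same list as above).
compiled_refusals = re.compile(
    '|'.join(re.escape(word) for word in unwanted_words), re.IGNORECASE)

def contains_unwanted_words(text):
    return compiled_refusals.search(text) is not None
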
# sft:
# webglm-qa
import json

def convert_format(input_data):
    output_data = []
    for item in input_data:
        new_item = {}
        new_item["instruction"] = item["question"]
        # Number the references and place them in the Alpaca 'input' field
        new_item["input"] = "references:\n" + "\n".join(
            f"{i+1}. {ref}" for i, ref in enumerate(item["references"]))
        new_item["output"] = item["answer"]
        output_data.append(new_item)
    return output_data

def convert_jsonl_file(input_filename, output_filename):
    output_data = []
    with open(input_filename, 'r', encoding='utf-8-sig') as input_file:
        for line in input_file:
            try:
                item = json.loads(line)
                output_data.extend(convert_format([item]))
            except json.JSONDecodeError:
                print(f"Skipping line due to JSONDecodeError: {line}")
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

convert_jsonl_file('webglm-qa.jsonl', 'alcapa-output.json')
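
# To see the mapping concretely, a made-up record run through
# convert_format (the content here is illustrative, not from the dataset):
sample = {
    "question": "Why is the sky blue?",
    "references": ["Rayleigh scattering favors shorter wavelengths."],
    "answer": "Because shorter wavelengths scatter more strongly [1].",
}
print(json.dumps(convert_format([sample]), ensure_ascii=False, indent=2))
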
# convert dolly
import json

def convert_format(input_data):
    output_data = []
    for item in input_data:
        new_item = {}
        new_item["instruction"] = item["instruction"]
        new_item["input"] = item["context"]
        new_item["output"] = item["response"]
        output_data.append(new_item)
    return output_data

def convert_jsonl_file(input_filename, output_filename):
    output_data = []
    with open(input_filename, 'r', encoding='utf-8-sig') as input_file:
        for line in input_file:
            if not line.strip():  # Skip empty lines
                continue
            try:
                item = json.loads(line)
                output_data.extend(convert_format([item]))
            except json.JSONDecodeError:
                print(f"Skipping line due to JSONDecodeError: {line}")
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

convert_jsonl_file('databricks-dolly-15k.jsonl', 'alcapa-output.json')
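
# The dolly converter is a pure field rename, so it could be driven by a
# mapping table instead of a bespoke convert_format; a sketch
# (convert_with_map is a name introduced here, not from the original):
def convert_with_map(input_filename, output_filename, field_map):
    output_data = []
    with open(input_filename, 'r', encoding='utf-8-sig') as input_file:
        for line in input_file:
            if not line.strip():
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping line due to JSONDecodeError: {line}")
                continue
            # field_map maps Alpaca field -> source field
            output_data.append({dst: item[src] for dst, src in field_map.items()})
    with open(output_filename, 'w', encoding='utf-8') as output_file:
        json.dump(output_data, output_file, ensure_ascii=False, indent=2)

# e.g. the dolly mapping shown above:
# convert_with_map('databricks-dolly-15k.jsonl', 'alcapa-output.json',
#                  {'instruction': 'instruction', 'input': 'context', 'output': 'response'})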