# Guest User
#
# vicuna_no-robot_persona2sharegpt.py
#
# a guest
# Apr 19th, 2024
# 69
# 0
# Never
# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
# Python 2.88 KB | Source Code | 0 0
  1. # convert_google_dataset_New-Persona-New-Conversations_to_sharegpty.py
  2. import csv
  3. import json
  4.  
  5. with open('New-Persona-New-Conversations.csv', 'r', encoding='utf-8') as f:
  6.     reader = csv.reader(f)
  7.     next(reader)  # Skip the header
  8.     new_data = []
  9.  
  10.     for row in reader:
  11.         user_1_personas = row[0].replace('n', 'n').replace('""', '"')
  12.         user_2_personas = row[1].replace('n', 'n').replace('""', '"')
  13.         system_message = f"Generate the next reply in this role-play chat based on the following personas. User personas: {user_1_personas}.nnYou have personas: {user_2_personas}."
  14.  
  15.         conversations = []
  16.         conversation_parts = row[2].split('n')
  17.         for part in conversation_parts:
  18.             if part.startswith("User 1:"):
  19.                 role = "human"
  20.                 content = part[len("User 1:"):].strip()
  21.             else:  # part.startswith("User 2:")
  22.                 role = "gpt"
  23.                 content = part[len("User 2:"):].strip()
  24.             conversations.append({"from": role, "value": content})
  25.  
  26.         new_data.append({"conversations": conversations, "system": system_message})
  27.  
  28. with open('New-Persona-New-Conversations.jsonl', 'w', encoding='utf-8') as f:
  29.     for entry in new_data:
  30.         f.write(json.dumps(entry, ensure_ascii=False) + 'n')
  31.  
  32.  
  33.  
  34.  
  35.  
  36.  
  37.  
  38.  
  39. # no-robotDataset2sharegpt.py
  40. import json
  41.  
  42. with open('train_sft.json', 'r', encoding='utf-8') as f:
  43.     data = json.load(f)
  44. new_data = []
  45. for entry in data:
  46.     system_message = ""
  47.     conversations = []
  48.    
  49.     for msg in entry["messages"]:
  50.         if msg["role"] == "system":
  51.             system_message = msg["content"]
  52.             continue
  53.        
  54.         role_map = {"user": "human", "assistant": "gpt"} # Map the original roles to the new roles
  55.         role = role_map.get(msg["role"], msg["role"])
  56.        
  57.         conversations.append({"from": role, "value": msg["content"]}) # Add the message to the conversations
  58.    
  59.     if not system_message: # If there's no 'system' message, use an empty string
  60.         system_message = ""
  61.    
  62.     new_data.append({"conversations": conversations, "system": system_message})
  63.  
  64. # Save .jsonl format
  65. with open('train_sft.jsonl', 'w', encoding='utf-8') as f:
  66.     for entry in new_data:
  67.         f.write(json.dumps(entry, ensure_ascii=False) + 'n')
  68.  
  69.  
  70.  
  71.  
  72.  
  73.  
  74.  
  75.  
  76. # wizard_vicuna_jsonl.py
  77. import json
  78.  
  79. def convert_json_to_jsonl(input_filename, output_filename):
  80.     with open(input_filename, 'r', encoding='utf-8') as in_file:
  81.         data = json.load(in_file)
  82.  
  83.     with open(output_filename, 'w', encoding='utf-8') as out_file:
  84.         for item in data:
  85.             item.pop('id', None) # Exclude the 'id' field
  86.             json.dump(item, out_file, ensure_ascii=False)
  87.             out_file.write('n')
  88.  
  89. convert_json_to_jsonl('wizard_vicuna_dataset_unfiltered.json', 'wizard_vicuna_dataset_unfiltered.jsonl') # sharegpt
# Advertisement
# Add Comment
# Please, Sign In to add comment