# NOTE: Pastebin page chrome removed; this file contains three standalone
# dataset-to-sharegpt conversion scripts.
# convert_google_dataset_New-Persona-New-Conversations_to_sharegpty.py
#
# Convert the Google "New-Persona-New-Conversations" CSV dataset into
# sharegpt-style JSON Lines (one {"conversations": [...], "system": ...}
# object per line).
import csv
import json


def unescape_csv_field(text):
    """Undo the dataset's field escaping.

    A literal backslash-n sequence becomes a real newline, and doubled
    quotes collapse to a single quote.
    """
    return text.replace('\\n', '\n').replace('""', '"')


def parse_conversation(raw):
    """Split a raw transcript into sharegpt-style turns.

    Lines beginning with "User 1:" map to the "human" role and lines
    beginning with "User 2:" map to "gpt".  Anything else (e.g. blank
    lines) is skipped instead of being blindly attributed to User 2,
    which would chop seven characters off an unrelated line.
    """
    turns = []
    for part in raw.split('\n'):
        if part.startswith("User 1:"):
            turns.append({"from": "human", "value": part[len("User 1:"):].strip()})
        elif part.startswith("User 2:"):
            turns.append({"from": "gpt", "value": part[len("User 2:"):].strip()})
    return turns


def convert_personas_csv(in_path='New-Persona-New-Conversations.csv',
                         out_path='New-Persona-New-Conversations.jsonl'):
    """Read the dataset CSV and write one sharegpt record per row as JSONL.

    Columns: row[0] = User 1 personas, row[1] = User 2 personas,
    row[2] = raw conversation transcript.
    """
    records = []
    with open(in_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            user_1_personas = unescape_csv_field(row[0])
            user_2_personas = unescape_csv_field(row[1])
            system_message = (
                "Generate the next reply in this role-play chat based on the "
                f"following personas. User personas: {user_1_personas}.\n\n"
                f"You have personas: {user_2_personas}."
            )
            records.append({
                "conversations": parse_conversation(row[2]),
                "system": system_message,
            })
    with open(out_path, 'w', encoding='utf-8') as f:
        for entry in records:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    convert_personas_csv()
# no-robotDataset2sharegpt.py
#
# Convert the no-robots SFT dataset (train_sft.json, a JSON array of
# {"messages": [{"role", "content"}, ...]} entries) to sharegpt JSONL.
import json

# Map the dataset's OpenAI-style roles to sharegpt roles; unknown roles
# pass through unchanged.
NO_ROBOTS_ROLE_MAP = {"user": "human", "assistant": "gpt"}


def convert_no_robots_entry(entry):
    """Convert one dataset entry to a sharegpt record.

    A "system" message (if any) is lifted into the record's "system"
    field; every other message becomes a conversation turn.  When no
    system message exists, "system" is the empty string.
    """
    system_message = ""
    conversations = []
    for msg in entry["messages"]:
        if msg["role"] == "system":
            system_message = msg["content"]
            continue
        conversations.append({
            "from": NO_ROBOTS_ROLE_MAP.get(msg["role"], msg["role"]),
            "value": msg["content"],
        })
    return {"conversations": conversations, "system": system_message}


def convert_no_robots(in_path='train_sft.json', out_path='train_sft.jsonl'):
    """Read the JSON array at *in_path* and write sharegpt JSONL to *out_path*."""
    with open(in_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with open(out_path, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(convert_no_robots_entry(entry), ensure_ascii=False) + '\n')


if __name__ == "__main__":
    convert_no_robots()
# wizard_vicuna_jsonl.py
#
# Convert a JSON array file (sharegpt format) to JSON Lines.
import json


def convert_json_to_jsonl(input_filename, output_filename):
    """Rewrite the JSON array at *input_filename* as JSONL.

    Each array element becomes one line; the 'id' field, which is not
    part of the sharegpt schema, is dropped when present.
    """
    with open(input_filename, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    with open(output_filename, 'w', encoding='utf-8') as out_file:
        for item in data:
            item.pop('id', None)  # exclude the 'id' field
            json.dump(item, out_file, ensure_ascii=False)
            out_file.write('\n')


if __name__ == "__main__":
    # Source file is already in sharegpt format.
    convert_json_to_jsonl('wizard_vicuna_dataset_unfiltered.json',
                          'wizard_vicuna_dataset_unfiltered.jsonl')
# (Pastebin footer removed.)