# NOTE: Pastebin page chrome removed; this file contains three standalone
# dataset-to-sharegpt conversion scripts.
# convert_google_dataset_New-Persona-New-Conversations_to_sharegpty.py
#
# Convert the Google "New-Persona-New-Conversations" CSV dataset into
# sharegpt-style JSON Lines (one {"conversations": [...], "system": ...}
# object per line).
import csv
import json


def unescape_csv_field(text):
    """Undo the dataset's field escaping.

    A literal backslash-n sequence becomes a real newline, and doubled
    quotes collapse to a single quote.
    """
    return text.replace('\\n', '\n').replace('""', '"')


def parse_conversation(raw):
    """Split a raw transcript into sharegpt-style turns.

    Lines beginning with "User 1:" map to the "human" role and lines
    beginning with "User 2:" map to "gpt".  Anything else (e.g. blank
    lines) is skipped instead of being blindly attributed to User 2,
    which would chop seven characters off an unrelated line.
    """
    turns = []
    for part in raw.split('\n'):
        if part.startswith("User 1:"):
            turns.append({"from": "human", "value": part[len("User 1:"):].strip()})
        elif part.startswith("User 2:"):
            turns.append({"from": "gpt", "value": part[len("User 2:"):].strip()})
    return turns


def convert_personas_csv(in_path='New-Persona-New-Conversations.csv',
                         out_path='New-Persona-New-Conversations.jsonl'):
    """Read the dataset CSV and write one sharegpt record per row as JSONL.

    Columns: row[0] = User 1 personas, row[1] = User 2 personas,
    row[2] = raw conversation transcript.
    """
    records = []
    with open(in_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            user_1_personas = unescape_csv_field(row[0])
            user_2_personas = unescape_csv_field(row[1])
            system_message = (
                "Generate the next reply in this role-play chat based on the "
                f"following personas. User personas: {user_1_personas}.\n\n"
                f"You have personas: {user_2_personas}."
            )
            records.append({
                "conversations": parse_conversation(row[2]),
                "system": system_message,
            })
    with open(out_path, 'w', encoding='utf-8') as f:
        for entry in records:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    convert_personas_csv()
# no-robotDataset2sharegpt.py
#
# Convert the no-robots SFT dataset (train_sft.json, a JSON array of
# {"messages": [{"role", "content"}, ...]} entries) to sharegpt JSONL.
import json

# Map the dataset's OpenAI-style roles to sharegpt roles; unknown roles
# pass through unchanged.
NO_ROBOTS_ROLE_MAP = {"user": "human", "assistant": "gpt"}


def convert_no_robots_entry(entry):
    """Convert one dataset entry to a sharegpt record.

    A "system" message (if any) is lifted into the record's "system"
    field; every other message becomes a conversation turn.  When no
    system message exists, "system" is the empty string.
    """
    system_message = ""
    conversations = []
    for msg in entry["messages"]:
        if msg["role"] == "system":
            system_message = msg["content"]
            continue
        conversations.append({
            "from": NO_ROBOTS_ROLE_MAP.get(msg["role"], msg["role"]),
            "value": msg["content"],
        })
    return {"conversations": conversations, "system": system_message}


def convert_no_robots(in_path='train_sft.json', out_path='train_sft.jsonl'):
    """Read the JSON array at *in_path* and write sharegpt JSONL to *out_path*."""
    with open(in_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with open(out_path, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(convert_no_robots_entry(entry), ensure_ascii=False) + '\n')


if __name__ == "__main__":
    convert_no_robots()
# wizard_vicuna_jsonl.py
#
# Convert a JSON array file (sharegpt format) to JSON Lines.
import json


def convert_json_to_jsonl(input_filename, output_filename):
    """Rewrite the JSON array at *input_filename* as JSONL.

    Each array element becomes one line; the 'id' field, which is not
    part of the sharegpt schema, is dropped when present.
    """
    with open(input_filename, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)
    with open(output_filename, 'w', encoding='utf-8') as out_file:
        for item in data:
            item.pop('id', None)  # exclude the 'id' field
            json.dump(item, out_file, ensure_ascii=False)
            out_file.write('\n')


if __name__ == "__main__":
    # Source file is already in sharegpt format.
    convert_json_to_jsonl('wizard_vicuna_dataset_unfiltered.json',
                          'wizard_vicuna_dataset_unfiltered.jsonl')
# (Pastebin footer removed.)