Untitled

"""
LlamaThink-8b-Instruct Finetuning Process

I recently created LlamaThink-8b-Instruct Full Instruct model: https://huggingface.co/DavidBrowne17/LlamaThink-8B-instruct

GGUF: LlamaThink-8b-Instruct-GGUF: https://huggingface.co/DavidBrowne17/LlamaThink-8B-instruct-GGUF

and a few of you were curious as to how I made it, here is the process to finetune a model with GRPO reinforcement learning.

So our goal is to make a thinker model, its super easy, first we need a dataset. Here is a script for llama cpp python to create a dataset.
"""


import json
import gc
import random
import re
from llama_cpp import Llama
import textwrap

MODEL_PATHS = [
    "YOUR MODEL GGUF HERE"
]

OUTPUT_FILE = "./enhanced_simple_dataset.jsonl"

NUM_CONVERSATIONS = 5000
TURNS_PER_CONVO = 1
MAX_TOKENS = 100

STOP_TOKENS = [
    "</s>", "<|endoftext|>", "<<USR>>", "<</USR>>", "<</SYS>>", "<</USER>>",
    "<</ASSISTANT>>", "<|eot_id|>", "<|im_end|>", "user:", "User:", "user :",
    "User :", "[assistant]", "[[assistant]]", "[user]", "[[user]]",
    "[/assistant]", "[/user]", "[\\assistant]"
]

USER_INSTRUCTION = (
    "You are engaging in a conversation with an AI designed for deep reasoning and structured thinking. "
    "Ask questions naturally while expecting insightful, multi-layered responses. "
    "Ask a unique, relevant question. "
    "Keep messages clear and concise. Respond only with the Question, nothing else."
)

INSTRUCTIONS = {
    "system_prompt": textwrap.dedent(""" Generate a system prompt for an AI to follow. This is a prompt for how the AI should behave, e.g., You are a chatbot, assistant, maths teacher, etc. It should not be instructions for a specific task. Do not add any explanations, headers, or formatting. Only output the system prompt text. """).strip(),

    "thinking": (
        "You are an AI designed to think deeply about the conversation topic. "
        "This is your internal thought process which is not visible to the user. "
        "Explain to yourself how you figure out the answer. "
        "Consider the user's question carefully, analyze the context, and formulate a coherent response strategy. "
        "Ensure your thought process is logical and well-structured. Do not generate any headers."
    ),

    "final": (
        "You are the final reviewer ensuring the response meets high standards of quality and insight. "
        "Your goal is to:\n"
        "1. Maximize logical depth and engagement.\n"
        "2. Ensure the response is precise, well-reasoned, and helpful.\n"
        "3. Strengthen structured argumentation and clarity.\n"
        "4. Maintain a professional and well-organized tone.\n"
        "In your final response, reference the user-provided system prompt to ensure consistency and relevance. "
        "Be concise and give the final answer."
    )
}

def load_model(path):
    """Loads a single model."""
    try:
        return Llama(model_path=path, n_ctx=16000, n_gpu_layers=-1, chat_format="llama-3")
    except Exception as e:
        print(f"Failed to load model {path}: {e}")
        return None

def call_model(llm, messages):
    """Calls the model using chat completion API and retries on failure."""
    attempt = 0
    while True:
        attempt += 1
        try:
            result = llm.create_chat_completion(
                messages=messages,
                max_tokens=MAX_TOKENS,
                temperature=random.uniform(1.4, 1.7),
                top_k=random.choice([250, 350]),
                top_p=random.uniform(0.85, 0.95),
                seed=random.randint(1, 900000000),
                stop=STOP_TOKENS
            )
            response_text = result["choices"][0]["message"]["content"].strip()
            if response_text:
                return response_text
            else:
                print(f"Attempt {attempt}: Empty response. Retrying...")
        except ValueError as e:
            print(f"Attempt {attempt}: Model call error: {e}. Retrying...")
        except KeyboardInterrupt:
            print("\nManual interruption detected. Exiting retry loop.")
            return "Error: Retry loop interrupted by user."
        except Exception as e:
            print(f"Unexpected error on attempt {attempt}: {e}. Retrying...")

def generate_system_prompt(llm):
    messages = [{"role": "system", "content": INSTRUCTIONS["system_prompt"]}]
    return call_model(llm, messages)

def generate_user_message(llm, system_prompt):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": USER_INSTRUCTION}
    ]
    return call_model(llm, messages)

def trim_to_last_complete_sentence(text):
    """Trims text to the last complete sentence."""
    matches = list(re.finditer(r'[.!?]', text))
    return text[:matches[-1].end()] if matches else text

def generate_response(llm, conversation_history, system_prompt):
    thinking = call_model(llm, [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": INSTRUCTIONS["thinking"]}
    ])

    final_response = call_model(llm, [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": INSTRUCTIONS["final"]}
    ])

    return f"<thinking>{trim_to_last_complete_sentence(thinking)}</thinking>\n\n<answer>{trim_to_last_complete_sentence(final_response)}</answer>"

def format_conversation(conversation):
    return "\n".join(f"{entry['role']}: {entry['content']}" for entry in conversation)

def generate_conversation(llm):
    conversation = []
    system_prompt = generate_system_prompt(llm)

    for _ in range(TURNS_PER_CONVO):
        user_message_text = generate_user_message(llm, system_prompt)
        conversation.append({"role": "user", "content": user_message_text})

        conv_history_str = format_conversation(conversation)
        assistant_message_text = generate_response(llm, conv_history_str, system_prompt)
        conversation.append({"role": "assistant", "content": assistant_message_text})

    return system_prompt, conversation

def validate_json(data):
    """Ensures JSON is valid before writing."""
    try:
        json.loads(json.dumps(data))
        return True
    except json.JSONDecodeError as e:
        print(f"Invalid JSON detected: {e}")
        return False

def main():
    llm = load_model(MODEL_PATHS[0])
    if not llm:
        print("Failed to load the model. Exiting.")
        return

    with open(OUTPUT_FILE, "a", encoding="utf-8") as out_f:
        for convo_idx in range(NUM_CONVERSATIONS):
            system_prompt, conversation = generate_conversation(llm)

            json_output = {
                "instruction": system_prompt.strip(),
                "conversation": conversation
            }

            if validate_json(json_output):
                json_string = json.dumps(json_output, ensure_ascii=False)
                out_f.write(json_string + "\n")
            else:
                print(f"Skipping malformed JSON for conversation {convo_idx}")

            if convo_idx % 100 == 0:
                print(f"Wrote conversation {convo_idx}/{NUM_CONVERSATIONS}")

    del llm
    gc.collect()

    print(f"Dataset complete: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

##########################################################################################################################
# grpo.py

"""
I set the limit to 5000 but we really only need about 300 results to finetune our model. I highly recommend changing the prompts slightly as you get more useful data, to get a more diverse dataset, This will improve your final results. Tell it to be a mathematician, historian etc. and to ask complex advanced questions.

Once the dataset is ready, install unsloth. Once your install is done you can create a new file called grpo.py which contains the following code, once the dataset is ready, place it in the same directory as the grpo.py file in the unsloth folder.
"""

import sys
import os
import re
import torch
from typing import List
from sentence_transformers import SentenceTransformer
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

if sys.platform == "win32":
    import types
    resource = types.ModuleType("resource")
    resource.getrlimit = lambda resource_id: (0, 0)
    resource.setrlimit = lambda resource_id, limits: None
    sys.modules["resource"] = resource

from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
PatchFastRL("GRPO", FastLanguageModel)
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, PeftModel

# Configuration
MAX_SEQ_LENGTH = 256
LORA_RANK = 16
BASE_MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-instruct"
DATASET_PATH = "enhanced_simple_dataset.jsonl"
ADAPTER_SAVE_PATH = "grpo_adapter"
MERGED_MODEL_PATH = "merged_grpo_full"
SYSTEM_PROMPT = """ Respond in the following format: <thinking> ... </thinking> <answer> ... </answer> The thinking and answer portions should be no more than 100 tokens each. """

def format_dataset_entry(example):
    """Format dataset entries for GRPO training."""
    system_prompt = example.get("instruction", "")
    conversation = example.get("conversation", [])

    messages = [{"role": "system", "content": system_prompt + SYSTEM_PROMPT}]

    if conversation and conversation[-1].get("role") == "assistant":
        for turn in conversation[:-1]:
            messages.append(turn)
        answer = conversation[-1].get("content", "")
    else:
        for turn in conversation:
            messages.append(turn)
        answer = ""

    return {"prompt": messages, "answer": answer}

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]

    print('-' * 20,
          f"Question:\n{q}",
          f"\nAnswer:\n{answer[0]}",
          f"\nResponse:\n{responses[0]}",
          f"\nExtracted:\n{extracted_responses[0]}")

    # Compute embeddings and cosine similarity
    answer_embedding = embedder.encode(answer, convert_to_numpy=True)
    response_embeddings = embedder.encode(extracted_responses, convert_to_numpy=True)

    similarities = [np.dot(r, answer_embedding) / (np.linalg.norm(r) * np.linalg.norm(answer_embedding))
                    for r in response_embeddings]

    # Convert similarity to reward (scaled 0-2 range)
    return [max(0.0, min(2.0, s * 2)) for s in similarities]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, *kwargs) -> list[float]:
    pattern = r"<thinking>\n.?\n</thinking>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, *kwargs) -> list[float]:
    pattern = r"<thinking>.?</thinking>\s<answer>.?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<thinking>\n") == 1:
        count += 0.125
    if text.count("\n</thinking>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
    count -= len(text.split("\n</answer>\n")[-1]) * 0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
    count -= (len(text.split("\n</answer>")[-1]) - 1) * 0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

def main():
    print("Loading model and tokenizer...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=BASE_MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        fast_inference=False,
        max_lora_rank=LORA_RANK,
        gpu_memory_utilization=0.9,
        device_map={"": torch.cuda.current_device()}
    )

    print("Applying GRPO adapter...")

    lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False
    )

    print("Applying QLoRA to the base model.")
    model = get_peft_model(model, lora_config)
    print("Loading and processing dataset...")
    raw_dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    formatted_dataset = raw_dataset.map(format_dataset_entry)

    print("Configuring training...")
    training_args = GRPOConfig(
        use_vllm = False,
        learning_rate = 5e-6,
        adam_beta1 = 0.9,
        adam_beta2 = 0.99,
        weight_decay = 0.1,
        warmup_ratio = 0.1,
        lr_scheduler_type = "cosine",
        optim = "paged_adamw_8bit",
        logging_steps = 1,
        bf16 = is_bfloat16_supported(),
        fp16 = not is_bfloat16_supported(),
        per_device_train_batch_size = 1
        gradient_accumulation_steps = 1,
        num_generations = 6, # Decrease if out of memory
        max_prompt_length = 256,
        max_completion_length = 250,
        max_steps = 250,
        save_steps = 10,
        max_grad_norm = 0.1,
        report_to = "none",
        output_dir = "outputs",
    )

    print("Initializing trainer...")
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[
            xmlcount_reward_func,
            soft_format_reward_func,
            strict_format_reward_func,
            int_reward_func,
            correctness_reward_func,
        ],
        args=training_args,
        train_dataset=formatted_dataset,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving GRPO adapter to {ADAPTER_SAVE_PATH}")
    model.save_pretrained(ADAPTER_SAVE_PATH)
    tokenizer.save_pretrained(ADAPTER_SAVE_PATH)

    print("Loading base model for merging...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map={"": torch.cuda.current_device()}
    )
    base_model.config.pad_token_id = tokenizer.pad_token_id

    print("Merging GRPO adapter...")
    grpo_model = PeftModel.from_pretrained(base_model, ADAPTER_SAVE_PATH)
    merged_model = grpo_model.merge_and_unload()

    print(f"Saving merged model to {MERGED_MODEL_PATH}")
    merged_model.save_pretrained(MERGED_MODEL_PATH)
    tokenizer.save_pretrained(MERGED_MODEL_PATH)

    print("Process completed successfully!")

if __name__ == "main":
    main()

 ##########################################################################################################################
 #
"""
We are loading and finetuning the model in 4 bit, but saving the adapter in the full model, this will significantly speed up the training time. For the most part your dataset doesnt need advanced coding info, we just need it to be simple and fit the format well so the model can learn to think. When this is finished you should have a completed finetuned thinking model. This code can be used for smaller models like Llama-3b. Have fun machine learning!

If you crash mid training you can load your latest checkpoint
"""

import sys
import os
import re
import torch
from typing import List

if sys.platform == "win32":
    import types
    resource = types.ModuleType("resource")
    resource.getrlimit = lambda resource_id: (0, 0)
    resource.setrlimit = lambda resource_id, limits: None
    sys.modules["resource"] = resource

from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
PatchFastRL("GRPO", FastLanguageModel)
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, PeftModel
from sentence_transformers import SentenceTransformer
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")
MAX_SEQ_LENGTH = 512
LORA_RANK = 32
BASE_MODEL_NAME = "unsloth/meta-Llama-3.1-8B-instruct"
DATASET_PATH = "enhanced_dataset.jsonl"
ADAPTER_SAVE_PATH = "grpo_adapter"
MERGED_MODEL_PATH = "merged_grpo_full"
CHECKPOINT_PATH = "YOUR_LATEST_CHECKPOINT"
SYSTEM_PROMPT = """
Respond in the following format:
<thinking>
...
</thinking>
<answer>
...
</answer>
"""

def format_dataset_entry(example):
    """Format dataset entries for GRPO training."""
    system_prompt = example.get("instruction", "")
    conversation = example.get("conversation", [])

    messages = [{"role": "system", "content": system_prompt + SYSTEM_PROMPT}]

    if conversation and conversation[-1].get("role") == "assistant":
        for turn in conversation[:-1]:
            messages.append(turn)
        answer = conversation[-1].get("content", "")
    else:
        for turn in conversation:
            messages.append(turn)
        answer = ""

    return {"prompt": messages, "answer": answer}

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]

    print('-' * 20,
          f"Question:\n{q}",
          f"\nAnswer:\n{answer[0]}",
          f"\nResponse:\n{responses[0]}",
          f"\nExtracted:\n{extracted_responses[0]}")

    # Compute embeddings and cosine similarity
    answer_embedding = embedder.encode(answer, convert_to_numpy=True)
    response_embeddings = embedder.encode(extracted_responses, convert_to_numpy=True)

    similarities = [np.dot(r, answer_embedding) / (np.linalg.norm(r) * np.linalg.norm(answer_embedding))
                    for r in response_embeddings]

    # Convert similarity to reward (scaled 0-2 range)
    return [max(0.0, min(2.0, s * 2)) for s in similarities]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    pattern = r"^<thinking>\n.*?\n</thinking>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    pattern = r"<thinking>.*?</thinking>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<thinking>\n") == 1:
        count += 0.125
    if text.count("\n</thinking>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

def main():
    print("Loading model and tokenizer...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=BASE_MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        fast_inference=False,
        max_lora_rank=LORA_RANK,
        gpu_memory_utilization=0.9,
        device_map={"": torch.cuda.current_device()}
    )

    print("Applying GRPO adapter...")
    lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"
        ],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False
    )

    print("Applying QLoRA to the base model.")
    model = get_peft_model(model, lora_config)

    print("Loading and processing dataset...")
    raw_dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
    formatted_dataset = raw_dataset.map(format_dataset_entry)

    print("Configuring training...")
    training_args = GRPOConfig(
        use_vllm = False,
        learning_rate = 5e-6,
        adam_beta1 = 0.9,
        adam_beta2 = 0.99,
        weight_decay = 0.1,
        warmup_ratio = 0.1,
        lr_scheduler_type = "cosine",
        optim = "paged_adamw_8bit",
        logging_steps = 1,
        bf16 = is_bfloat16_supported(),
        fp16 = not is_bfloat16_supported(),
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        num_generations = 6,
        max_prompt_length = 256,
        max_completion_length = 250,
        num_train_epochs = 1,
        max_steps = 250,
        save_steps = 10,
        max_grad_norm = 0.1,
        report_to = "none",
        output_dir = "outputs",
    )

    print("Initializing trainer...")
    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=[
            xmlcount_reward_func,
            soft_format_reward_func,
            strict_format_reward_func,
            int_reward_func,
            correctness_reward_func,
        ],
        args=training_args,
        train_dataset=formatted_dataset,
    )

    print("Starting training...")
    try:
        if os.path.exists(CHECKPOINT_PATH):
            print(f"Resuming training from checkpoint: {CHECKPOINT_PATH}")
            trainer.train(resume_from_checkpoint=CHECKPOINT_PATH)
        else:
            print("No checkpoint found; starting training from scratch...")
            trainer.train()

        # Save the adapter
        print(f"Saving GRPO adapter to {ADAPTER_SAVE_PATH}")
        if not os.path.exists(ADAPTER_SAVE_PATH):
            os.makedirs(ADAPTER_SAVE_PATH)
        model.save_pretrained(ADAPTER_SAVE_PATH)
        tokenizer.save_pretrained(ADAPTER_SAVE_PATH)

    except Exception as e:
        print(f"Error during training or saving: {str(e)}")
        raise

    try:
        print("Loading base model in full precision...")
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_NAME,
            torch_dtype=torch.float16,
            device_map={"": torch.cuda.current_device()}
        )

        base_model.config.pad_token_id = tokenizer.pad_token_id

        print("Loading and merging GRPO adapter...")
        grpo_model = PeftModel.from_pretrained(base_model, ADAPTER_SAVE_PATH)
        merged_model = grpo_model.merge_and_unload()

        if not os.path.exists(MERGED_MODEL_PATH):
            os.makedirs(MERGED_MODEL_PATH)

        print(f"Saving merged model to {MERGED_MODEL_PATH}")
        merged_model.save_pretrained(MERGED_MODEL_PATH)
        tokenizer.save_pretrained(MERGED_MODEL_PATH)

        print("Process completed successfully!")

    except Exception as e:
        print(f"Error during model merging: {str(e)}")
        raise

if __name__ == "__main__":
    main()