import os
import random

import pandas as pd
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Let the CUDA caching allocator grow segments instead of fragmenting memory
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# General parameters
# model_name = "EleutherAI/pythia-14m"  # The model that you want to train from the Hugging Face hub
model_name = "gemma-3-12b-it"
dataset_name = "criteria_extraction_dataset_bootstrapped_allowed.csv"  # The instruction dataset to use

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
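
# Optional check (not strictly required): the chat-template formatting path further below
# relies on tokenizer.apply_chat_template, so fail fast if the tokenizer ships no template.
assert tokenizer.chat_template is not None, "tokenizer has no chat_template"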
# LoRA parameters
lora_r = 128
lora_alpha = lora_r * 2
lora_dropout = 0.1
target_modules = "all-linear"
# QLoRA parameters
bnb_bits_count = 4
if bnb_bits_count == 4:
    bnb_params = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": "float16",
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": False,
    }
else:
    # 8-bit loading exposes far fewer options than 4-bit: compute dtype, quant type
    # and nested/double quantization are 4-bit-only arguments of BitsAndBytesConfig.
    bnb_params = {
        "load_in_8bit": True,
        "llm_int8_threshold": 6.0,  # outlier threshold (library default)
    }
# TrainingArguments parameters
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
learning_rate = 0.00015
weight_decay = 0.01
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0.2  # a float < 1 is interpreted as a fraction of total training steps
logging_steps = 5
eval_on_start = True
eval_strategy = 'steps'
eval_steps = 10
max_grad_norm = 0.5
eval_sample_size = 64
new_model = f"gemma3-12b-it_bootstrapped_allowed_{num_train_epochs}epochs_{per_device_train_batch_size * gradient_accumulation_steps}bs_6k_dataset"  # The name for the fine-tuned LoRA adapter

# SFT parameters
max_seq_length = None
packing = True
# device_map = {"": 0}
# Dataset parameters
# for saiga-criteria-maker
# The prompt stays in Russian because the model is fine-tuned on Russian call transcripts.
# English translation: "Task: you will receive a transcript (part of a phone conversation)
# of a potential buyer. Analyse it and identify the criteria the potential buyer considers
# important for the purchase decision. Use only terms from the following list: [Price,
# Quality, Brand, Reviews, Functionality, Warranty, Availability, Convenience,
# Eco-friendliness, Discounts and promotions, Service and support, Compatibility, Logistics,
# Payment terms, Safety, Appearance, Technical specifications, Reliability, Durability].
# Present your answer as a list enclosed in square brackets whose elements are the matching
# criteria.\nTranscript:\n"
PROMPT = 'Задача: тебе придет расшифровка текста (часть телефонного разговора) потенциального покупателя. Нужно проанализировать её и выявить критерии, которые потенциальный покупатель считает важными для принятия решения о покупке. Используй только термины из следующего списка: ["Цена", "Качество", "Бренд", "Отзывы", "Функциональность", "Гарантия", "Доступность", "Удобство", "Экологичность", "Скидки и акции", "Сервис и поддержка", "Совместимость", "Логистика", "Условия оплаты", "Безопасность", "Внешний вид", "Технические качества ТТХ", "Надежность", "Долговечность"]. Представь свой ответ в виде списка, заключенного в квадратные скобки, где элементы списка - соответствующие критерии.\nРасшифровка:\n'
use_special_template = False
response_template = '### Answer:'
instruction_prompt_template = '### Human: '
use_llama_like_model = False
# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")
df = pd.read_csv(dataset_name, sep=';', encoding='utf-8').sample(frac=1, random_state=42)
other_columns = [col for col in df.columns if col not in ['customer_speech', 'corrected_allowed_criteria']]
df = df.drop(columns=other_columns)
# df['prompt'] = df['customer_speech'].apply(lambda x: PROMPT + x)
# Reset the shuffled index so Dataset.from_pandas does not carry it over as an extra column
df = df.reset_index(drop=True)
dataset = Dataset.from_pandas(df)
percent_of_train_dataset = 0.9
split_dataset = dataset.train_test_split(train_size=int(dataset.num_rows * percent_of_train_dataset), seed=19, shuffle=False)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Size of the train set: {len(train_dataset)}. Size of the validation set: {len(eval_dataset)}")
# Load LoRA configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

bnb_config = BitsAndBytesConfig(**bnb_params)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # device_map=device_map,
)
model.config.use_cache = False  # the KV cache is incompatible with gradient checkpointing during training
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    gradient_checkpointing=gradient_checkpointing,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    eval_on_start=eval_on_start,
    logging_first_step=True,
    eval_strategy=eval_strategy,
    eval_steps=eval_steps,
    max_grad_norm=max_grad_norm,
)
def special_formatting_prompts(example):
    """Format batched examples with the plain '### Human / ### Answer' template."""
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"{instruction_prompt_template}{example['instruction'][i]}\n{response_template} {example['output'][i]}"
        output_texts.append(text)
    return output_texts


def normal_formatting_prompts(example):
    """Format batched examples with the tokenizer's built-in chat template."""
    output_texts = []
    for i in range(len(example['customer_speech'])):
        chat_temp = [{"role": "user", "content": PROMPT + example['customer_speech'][i]},
                     {"role": "assistant", "content": example['corrected_allowed_criteria'][i]}]
        text = tokenizer.apply_chat_template(chat_temp, tokenize=False)
        output_texts.append(text)
    return output_texts
if use_special_template:
    formatting_func = special_formatting_prompts
    if use_llama_like_model:
        # Skip the first tokens of the encoded template, which differ when the template
        # is tokenised out of context (a known quirk of Llama-style tokenizers)
        response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template_ids, tokenizer=tokenizer)
    else:
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
else:
    formatting_func = normal_formatting_prompts
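
# Optional sanity check (a sketch; assumes the dataset columns match whichever formatting
# function was selected above): render one example to verify the template looks right.
print(formatting_func(train_dataset[:1])[0])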
class SFTTrainerEvalSampling(SFTTrainer):
    def __init__(self, *args, eval_sample_size=16, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_sample_size = eval_sample_size

    def get_eval_dataloader(self, eval_dataset=None):
        '''
        Samples the evaluation dataset and returns a subset
        of size self.eval_sample_size.
        '''
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        idxs = random.sample(range(len(eval_dataset)), self.eval_sample_size)
        eval_subset = eval_dataset.select(idxs)
        return super().get_eval_dataloader(eval_subset)
trainer = SFTTrainerEvalSampling(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    formatting_func=formatting_func,
    # data_collator=collator,
    # max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    eval_sample_size=eval_sample_size,
)

# Train model
trainer.train()

# Save the fine-tuned LoRA adapter
trainer.model.save_pretrained(new_model)
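
# Inference sketch (an illustration, not part of the training run; it reuses the variables
# defined above). PeftModel is imported but otherwise unused here; this is the usual way to
# attach the saved adapter back onto the quantized base model for generation.
del trainer, model
torch.cuda.empty_cache()

base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
inference_model = PeftModel.from_pretrained(base_model, new_model)
inference_model.eval()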