import random
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# General parameters
# model_name = "EleutherAI/pythia-14m"  # The model that you want to train from the Hugging Face hub
model_name = "gemma-3-12b-it"  # presumably a local checkpoint; the Hub id for this model is "google/gemma-3-12b-it"
dataset_name = "criteria_extraction_dataset_bootstrapped_allowed.csv"  # The instruction dataset to use

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# LoRA parameters
lora_r = 128
lora_alpha = lora_r * 2
lora_dropout = 0.1
target_modules = "all-linear"

# QLoRA parameters
bnb_bits_count = 4

if bnb_bits_count == 4:
    bnb_params = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": 'float16',
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": False,
    }
else:
    # 8-bit loading via BitsAndBytesConfig has no compute-dtype / quant-type / nested-quant
    # switches; LLM.int8() behaviour is tuned through the llm_int8_* arguments instead.
    bnb_params = {
        "load_in_8bit": True,
    }

# TrainingArguments parameters
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
learning_rate = 0.00015
weight_decay = 0.01
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0.2  # fraction of total training steps between checkpoints
logging_steps = 5
eval_on_start = True
eval_strategy = 'steps'
eval_steps = 10
max_grad_norm = 0.5
eval_sample_size = 64

new_model = f"gemma3-12b-it_bootstrapped_allowed_{num_train_epochs}epochs_{per_device_train_batch_size * gradient_accumulation_steps}bs_6k_dataset"  # The name for the fine-tuned LoRA adapter

# SFT parameters
max_seq_length = None
packing = True
# device_map = {"": 0}

# Dataset parameters
# for saiga-criteria-maker
# The (Russian) prompt below asks the model to read a customer-call transcript and return,
# as a bracketed list, the purchase-decision criteria the customer mentions, restricted to a
# fixed set of allowed criterion labels. It is kept in Russian because the dataset labels are Russian.
PROMPT = 'Задача: тебе придет расшифровка текста (часть телефонного разговора) потенциального покупателя. Нужно проанализировать её и выявить критерии, которые потенциальный покупатель считает важными для принятия решения о покупке. Используй только термины из следующего списка: ["Цена", "Качество", "Бренд", "Отзывы", "Функциональность", "Гарантия", "Доступность", "Удобство", "Экологичность", "Скидки и акции", "Сервис и поддержка", "Совместимость", "Логистика", "Условия оплаты", "Безопасность", "Внешний вид", "Технические качества ТТХ", "Надежность", "Долговечность"]. Представь свой ответ в виде списка, заключенного в квадратные скобки, где элементы списка - соответствующие критерии.\nРасшифровка:\n'
use_special_template = False
response_template = '### Answer:'
instruction_prompt_template = '### Human: '
use_llama_like_model = False
# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")
df = pd.read_csv(dataset_name, sep=';', encoding='utf-8').sample(frac=1, random_state=42)
other_columns = [col for col in df.columns if col not in ['customer_speech', 'corrected_allowed_criteria']]
df = df.drop(columns=other_columns)
# df['prompt'] = df['customer_speech'].apply(lambda x: PROMPT + x)

dataset = Dataset.from_pandas(df)

percent_of_train_dataset = 0.9
split_dataset = dataset.train_test_split(train_size=int(dataset.num_rows * percent_of_train_dataset), seed=19, shuffle=False)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]
print(f"Size of the train set: {len(train_dataset)}. Size of the validation set: {len(eval_dataset)}")
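
# Quick sanity check (not in the original script): confirm which columns actually survived
# the filtering above before formatting/training.
print(train_dataset[0])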


# Load LoRA configuration
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

bnb_config = BitsAndBytesConfig(**bnb_params)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # device_map=device_map,
)
model.config.use_cache = False
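
# Optional (sketch, not in the original script): when a peft_config is passed to SFTTrainer,
# TRL normally prepares the quantized model for training itself. If you bypass that path,
# the explicit PEFT helper below does the k-bit preparation (fp32 norm layers, input grads).
# from peft import prepare_model_for_kbit_training
# model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=gradient_checkpointing)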


# Set training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    gradient_checkpointing=gradient_checkpointing,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    eval_on_start=eval_on_start,
    logging_first_step=True,
    eval_strategy=eval_strategy,
    eval_steps=eval_steps,
    max_grad_norm=max_grad_norm,
)


def special_formatting_prompts(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"{instruction_prompt_template}{example['instruction'][i]}\n{response_template} {example['output'][i]}"
        output_texts.append(text)
    return output_texts


def normal_formatting_prompts(example):
    output_texts = []
    for i in range(len(example['customer_speech'])):
        chat_temp = [
            {"role": "user", "content": PROMPT + example['customer_speech'][i]},
            {"role": "assistant", "content": example['corrected_allowed_criteria'][i]},
        ]
        text = tokenizer.apply_chat_template(chat_temp, tokenize=False)
        output_texts.append(text)
    return output_texts


if use_special_template:
    formatting_func = special_formatting_prompts
    if use_llama_like_model:
        response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template_ids, tokenizer=tokenizer)
    else:
        collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
else:
    formatting_func = normal_formatting_prompts
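
# Sanity check (sketch, not in the original script): print one formatted training example so
# the chat-template output can be inspected. Slicing the Dataset returns a dict of lists,
# which is the batched layout the formatting functions above expect.
print(formatting_func(train_dataset[:1])[0])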


class SFTTrainerEvalSampling(SFTTrainer):
    def __init__(self, *args, eval_sample_size=16, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_sample_size = eval_sample_size

    def get_eval_dataloader(self, eval_dataset=None):
        '''
        Samples the evaluation dataset and returns a subset
        of size self.eval_sample_size.
        '''
        if eval_dataset is None:
            eval_dataset = self.eval_dataset
        idxs = random.sample(range(len(eval_dataset)), self.eval_sample_size)
        eval_subset = eval_dataset.select(idxs)
        return super().get_eval_dataloader(eval_subset)


trainer = SFTTrainerEvalSampling(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    formatting_func=formatting_func,
    # data_collator=collator,
    # max_seq_length=max_seq_length,
    tokenizer=tokenizer,  # recent TRL releases rename this argument to `processing_class`
    args=training_arguments,
    eval_sample_size=eval_sample_size,
)

# Train model
trainer.train()

# Save the fine-tuned LoRA adapter
trainer.model.save_pretrained(new_model)
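
# Post-training follow-up (sketch, not part of the original script): PeftModel is imported
# above but never used; a common next step is to merge the saved adapter into a full-precision
# copy of the base model for deployment. Shown commented out because it reloads the 12B base
# model and is usually run as a separate step.
# base_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
# merged_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()
# merged_model.save_pretrained(f"{new_model}_merged")
# tokenizer.save_pretrained(f"{new_model}_merged")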