#!/usr/bin/env python3
import os
import re
import torch
# IMPORTANT: unsloth must be imported before transformers/peft
import unsloth
from unsloth import FastLanguageModel
from datasets import Dataset
from transformers import Trainer, TrainingArguments, AutoTokenizer
from typing import List, Dict
from dataclasses import dataclass
# ─── CONFIG ────────────────────────────────────────────────────────────────
BASE_MODEL_DIR = "./Vikhr-Nemo-12B-Instruct-R-21-09-24"
DATA_FILE = "./dataset.txt"
OUTPUT_DIR = "./lora_out"
# Hard-cap the max context at 2048 tokens
MAX_LEN = 2048

def bf16_ok():
    return torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8

dtype = torch.bfloat16 if bf16_ok() else torch.float16
print("Using dtype =", dtype)
# ─── 1. LOAD THE DATA ───────────────────────────────────────────────────────
with open(DATA_FILE, encoding="utf-8") as f:
    raw = f.read().strip().split("\n\n\n")
dataset = Dataset.from_list([{"text": block} for block in raw])
print(f"Loaded {len(dataset)} dialogues")
# ─── 2. MODEL INITIALIZATION ────────────────────────────────────────────────
model, tokenizer = FastLanguageModel.from_pretrained(
    BASE_MODEL_DIR,
    max_seq_length=MAX_LEN,   # pass the context cap to unsloth explicitly
    dtype=dtype,
    load_in_4bit=False
)
# Force the tokenizer's model_max_length to 2048 as well
tokenizer.model_max_length = MAX_LEN
# Some Mistral-family tokenizers ship without a pad token; fall back to EOS so
# padding in the collator below does not crash
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
print("Forced model max_length =", MAX_LEN)
# ─── 3. ATTACH LoRA ─────────────────────────────────────────────────────────
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 32,
    target_modules = [
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj"
    ],
    lora_dropout = 0.05,
    bias = "none"
)
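# Optional sanity check: PEFT-wrapped models expose print_trainable_parameters(),
# which reports how few weights the LoRA adapters actually train.
# model.print_trainable_parameters()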
# ─── 4. TOKENIZATION + LOSS MASKING ─────────────────────────────────────────
speaker_re = re.compile(r"(USER:|ASSISTANT:)", flags=re.IGNORECASE)

def tok_fn(batch: Dict[str, List[str]]) -> Dict[str, List[List[int]]]:
    all_input_ids = []
    all_attn_masks = []
    all_labels = []
    for text in batch["text"]:
        # Split into segments on USER:/ASSISTANT:
        parts = speaker_re.split(text)
        utterances = []
        for i in range(1, len(parts), 2):
            speaker = parts[i][:-1].upper()     # "USER" or "ASSISTANT"
            content = parts[i] + parts[i + 1]   # keep the speaker tag in the text
            utterances.append((speaker, content))
        ids: List[int] = []
        masks: List[int] = []
        labels: List[int] = []
        for speaker, utt in utterances:
            toks = tokenizer(utt, add_special_tokens=False)["input_ids"]
            for tid in toks:
                ids.append(tid)
                masks.append(1)
                labels.append(tid if speaker == "ASSISTANT" else -100)
        # EOS token (ignored in the labels)
        if tokenizer.eos_token_id is not None:
            ids.append(tokenizer.eos_token_id)
            masks.append(1)
            labels.append(-100)
        # Hard-truncate everything to MAX_LEN
        if len(ids) > MAX_LEN:
            ids = ids[:MAX_LEN]
            masks = masks[:MAX_LEN]
            labels = labels[:MAX_LEN]
        all_input_ids.append(ids)
        all_attn_masks.append(masks)
        all_labels.append(labels)
    return {
        "input_ids": all_input_ids,
        "attention_mask": all_attn_masks,
        "labels": all_labels,
    }
tokenized = dataset.map(
    tok_fn,
    batched=True,
    remove_columns=["text"]
)
print("Tokenized; sample lengths:",
      len(tokenized[0]["input_ids"]), len(tokenized[0]["labels"]))
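# How the masking behaves (illustration only): in a block like
#   "USER: 2+2?\nASSISTANT: 4"
# every token of the USER turn gets label -100 (excluded from the loss), while
# the ASSISTANT turn keeps its own token ids as labels, so gradients come only
# from assistant replies.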
# ─── 5. COLLATOR WITH TRUNCATION ────────────────────────────────────────────
@dataclass
class CollatorWithTruncation:
    tokenizer: AutoTokenizer
    max_length: int

    def __call__(self, features: List[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
        # tokenizer.pad() does not pad the "labels" key, so pull labels out and
        # pad them by hand; otherwise batches with mixed lengths fail to tensorize
        label_lists = [list(f.pop("labels")) for f in features]
        # Pad input_ids / attention_mask to the longest sequence in the batch
        batch = self.tokenizer.pad(
            features,
            padding="longest",
            return_tensors="pt"
        )
        longest = max(len(l) for l in label_lists)
        batch["labels"] = torch.tensor(
            [l + [-100] * (longest - len(l)) for l in label_lists],
            dtype=torch.long
        )
        bsz = batch["input_ids"].shape[0]
        # Force every tensor to exactly max_length: truncate if longer, pad if shorter
        for key, pad_value in [
            ("input_ids", self.tokenizer.pad_token_id),
            ("attention_mask", 0),
            ("labels", -100)
        ]:
            data = batch[key]
            if data.shape[1] > self.max_length:
                data = data[:, :self.max_length]
            elif data.shape[1] < self.max_length:
                pad_shape = (bsz, self.max_length - data.shape[1])
                pad_tensor = torch.full(pad_shape, pad_value, dtype=data.dtype)
                data = torch.cat([data, pad_tensor], dim=1)
            batch[key] = data
        return batch

collator = CollatorWithTruncation(tokenizer, MAX_LEN)
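# Quick shape check (illustrative, safe to delete): whatever the raw dialogue
# lengths, the collator should emit tensors of exactly (batch_size, MAX_LEN).
# sample = collator([tokenized[0]])
# assert sample["input_ids"].shape == (1, MAX_LEN)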
# ─── 6. TRAINING ARGS ───────────────────────────────────────────────────────
args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    num_train_epochs = 1,
    warmup_ratio = 0.03,
    lr_scheduler_type = "cosine",
    fp16 = not bf16_ok(),
    bf16 = bf16_ok(),
    logging_steps = 10,
    save_strategy = "epoch",
)
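# Effective batch size per optimizer step = per_device_train_batch_size (1)
# * gradient_accumulation_steps (4) = 4 sequences per GPU.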
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized,
    data_collator = collator,
    tokenizer = tokenizer   # recent transformers versions prefer processing_class=
)
# ─── 7. TRAIN! ───────────────────────────────────────────────────────────────
trainer.train()

# ─── 8. SAVE ────────────────────────────────────────────────────────────────
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✓ LoRA saved to", os.path.join(OUTPUT_DIR, "adapter_model.safetensors"))