tastypear

try to fix llama converter

Nov 18th, 2023 (edited)
  1. #!/usr/bin/env python3
  2. from __future__ import annotations
  3.  
  4. import argparse
  5. import concurrent.futures
  6. import enum
  7. import faulthandler
  8. import functools
  9. import itertools
  10. import json
  11. import math
  12. import mmap
  13. import pickle
  14. import re
  15. import signal
  16. import struct
  17. import sys
  18. import time
  19. import zipfile
  20. from abc import ABCMeta, abstractmethod
  21. from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
  22. from dataclasses import dataclass
  23. from pathlib import Path
  24. from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
  25.  
  26. import numpy as np
  27. from sentencepiece import SentencePieceProcessor
  28.  
  29. import os
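# Prefer the repo-local gguf-py package over any system-installed gguf unless
# the user opts out via the NO_LOCAL_GGUF environment variable.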
  30. if 'NO_LOCAL_GGUF' not in os.environ:
  31.     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
  32. import gguf
  33.  
  34. if TYPE_CHECKING:
  35.     from typing import TypeAlias
  36.  
  37. if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
  38.     faulthandler.register(signal.SIGUSR1)
  39.  
  40. NDArray: TypeAlias = 'np.ndarray[Any, Any]'
  41.  
  42. ARCH = gguf.MODEL_ARCH.LLAMA
  43.  
  44. DEFAULT_CONCURRENCY = 8
  45. #
  46. # data types
  47. #
  48.  
  49. @dataclass(frozen=True)
  50. class DataType:
  51.     name: str
  52.     dtype: np.dtype[Any]
  53.     valid_conversions: list[str]
  54.  
  55.     def elements_to_bytes(self, n_elements: int) -> int:
  56.         return n_elements * self.dtype.itemsize
  57.  
  58. @dataclass(frozen=True)
  59. class UnquantizedDataType(DataType):
  60.     pass
  61.  
  62. DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
  63. DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
  64. DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int32), valid_conversions = [])
  65. DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
  66.  
  67. @dataclass(frozen=True)
  68. class QuantizedDataType(DataType):
  69.     block_size: int
  70.     quantized_dtype: np.dtype[Any]
  71.     ggml_type: gguf.GGMLQuantizationType
  72.  
  73.     def quantize(self, arr: NDArray) -> NDArray:
  74.         raise NotImplementedError(f'Quantization for {self.name} not implemented')
  75.  
  76.     def elements_to_bytes(self, n_elements: int) -> int:
  77.         assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
  78.         return self.quantized_dtype.itemsize * (n_elements // self.block_size)
  79.  
  80. @dataclass(frozen=True)
  81. class Q8_0QuantizedDataType(QuantizedDataType):
  82.     # Mini Q8_0 quantization in Python!
  83.     def quantize(self, arr: NDArray) -> NDArray:
  84.         assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
  85.         assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
  86.         n_blocks = arr.size // self.block_size
  87.         blocks = arr.reshape((n_blocks, self.block_size))
  88.         # Much faster implementation of block quantization contributed by @Cebtenzzre
  89.         def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
  90.             d = abs(blocks).max(axis = 1) / np.float32(127)
  91.             with np.errstate(divide = 'ignore'):
  92.                 qs = (blocks / d[:, None]).round()
  93.             qs[d == 0] = 0
  94.             yield from zip(d, qs)
  95.         return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
  96.  
  97. DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
  98.     dtype = np.dtype(np.float32), valid_conversions = [],
  99.     ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
  100.     quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
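
# Illustration of the Q8_0 layout above (not used by the converter itself):
# each block of 32 weights stores one fp16 scale d = max(|x|) / 127 plus 32
# int8 values round(x / d). For a block whose largest magnitude is 0.254,
# d = 0.002 and a weight of 0.10 is stored as round(0.10 / 0.002) = 50.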
  101.  
  102. # Quantized types skipped here because they may also map to np.float32
  103. NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
  104. for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
  105.     if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
  106.         raise ValueError(f'Invalid duplicate data type {dt}')
  107.     NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
  108.  
  109. SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
  110.     'BF16': DT_BF16,
  111.     'F16': DT_F16,
  112.     'F32': DT_F32,
  113.     'I32': DT_I32,
  114. }
  115.  
  116. # TODO: match this with `llama_ftype`
  117. # TODO: rename to LLAMAFileType
  118. # TODO: move to `gguf.py`
  119. class GGMLFileType(enum.IntEnum):
  120.     AllF32     = 0
  121.     MostlyF16  = 1  # except 1d tensors
  122.     MostlyQ8_0 = 7  # except 1d tensors
  123.  
  124.     def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
  125.         dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
  126.         if dt is None:
  127.             raise ValueError(self)
  128.         # 1D tensors are always F32.
  129.         return dt if len(tensor.shape) > 1 else DT_F32
  130.  
  131. GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
  132.     GGMLFileType.AllF32    : DT_F32,
  133.     GGMLFileType.MostlyF16 : DT_F16,
  134.     GGMLFileType.MostlyQ8_0: DT_Q8_0,
  135. }
  136.  
  137. #
  138. # hparams loading
  139. #
  140.  
  141. @dataclass
  142. class Params:
  143.     n_vocab:    int
  144.     n_embd:     int
  145.     n_layer:    int
  146.     n_ctx:      int
  147.     n_ff:       int
  148.     n_head:     int
  149.     n_head_kv:  int
  150.     f_norm_eps: float
  151.  
  152.     rope_scaling_type: gguf.RopeScalingType | None = None
  153.     f_rope_freq_base: float | None = None
  154.     f_rope_scale: float | None = None
  155.     n_orig_ctx: int | None = None
  156.     rope_finetuned: bool | None = None
  157.  
  158.     ftype: GGMLFileType | None = None
  159.  
  160.     # path to the directory containing the model files
  161.     path_model: Path | None = None
  162.  
  163.     @staticmethod
  164.     def guessed(model: LazyModel) -> Params:
  165.         # try transformer naming first
  166.         n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
  167.  
  168.         # try transformer naming first to find the layer count
  169.         if "model.layers.0.self_attn.q_proj.weight" in model:
  170.             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
  171.         elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
  172.             n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
  173.         else:
  174.             n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
  175.  
  176.         if n_layer < 1:
  177.             raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
  178.                             "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
  179.  
  180.         n_head = n_embd // 128 # guessed
  181.         n_mult = 256           # guessed
  182.  
  183.         # TODO: verify this
  184.         n_ff = int(2 * (4 * n_embd) / 3)
  185.         n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
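        # Worked example: LLaMA-7B has n_embd = 4096, so n_head is guessed as
        # 4096 // 128 = 32 and n_ff as int(2 * 16384 / 3) = 10922, rounded up
        # to a multiple of 256 -> 11008, matching the released model.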
  186.  
  187.         return Params(
  188.             n_vocab    = n_vocab,
  189.             n_embd     = n_embd,
  190.             n_layer    = n_layer,
  191.             n_ctx      = -1,
  192.             n_ff       = n_ff,
  193.             n_head     = n_head,
  194.             n_head_kv  = n_head,
  195.             f_norm_eps = 1e-5,
  196.         )
  197.  
  198.     @staticmethod
  199.     def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
  200.         config = json.load(open(config_path))
  201.  
  202.         rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
  203.         rope_scaling = config.get("rope_scaling")
  204.  
  205.         if rope_scaling is not None and (typ := rope_scaling.get("type")):
  206.             rope_factor = rope_scaling.get("factor")
  207.             f_rope_scale = rope_factor
  208.             if typ == "linear":
  209.                 rope_scaling_type = gguf.RopeScalingType.LINEAR
  210.             elif typ == "yarn":
  211.                 rope_scaling_type = gguf.RopeScalingType.YARN
  212.                 n_orig_ctx = rope_scaling['original_max_position_embeddings']
  213.                 rope_finetuned = rope_scaling['finetuned']
  214.             else:
  215.                 raise NotImplementedError(f'Unknown rope scaling type: {typ}')
  216.  
  217.         if "max_sequence_length" in config:
  218.             n_ctx = config["max_sequence_length"]
  219.         elif "max_position_embeddings" in config:
  220.             n_ctx = config["max_position_embeddings"]
  221.         else:
  222.             raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
  223.                             "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
  224.  
  225.         return Params(
  226.             n_vocab           = config["vocab_size"],
  227.             n_embd            = config["hidden_size"],
  228.             n_layer           = config["num_hidden_layers"],
  229.             n_ctx             = n_ctx,
  230.             n_ff              = config["intermediate_size"],
  231.             n_head            = (n_head := config["num_attention_heads"]),
  232.             n_head_kv         = config.get("num_key_value_heads", n_head),
  233.             f_norm_eps        = config["rms_norm_eps"],
  234.             f_rope_freq_base  = config.get("rope_theta"),
  235.             rope_scaling_type = rope_scaling_type,
  236.             f_rope_scale      = f_rope_scale,
  237.             n_orig_ctx        = n_orig_ctx,
  238.             rope_finetuned    = rope_finetuned,
  239.         )
  240.  
  241.     # LLaMA v2 70B params.json
  242.     # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
  243.     @staticmethod
  244.     def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
  245.         config = json.load(open(config_path))
  246.  
  247.         # hack to determine LLaMA v1 vs v2 vs CodeLlama
  248.         if config.get("rope_theta") == 1000000:
  249.             # CodeLlama
  250.             n_ctx = 16384
  251.         elif config["norm_eps"] == 1e-05:
  252.             # LLaMA v2
  253.             n_ctx = 4096
  254.         else:
  255.             # LLaMA v1
  256.             n_ctx = 2048
  257.  
  258.         return Params(
  259.             n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
  260.             n_embd           = config["dim"],
  261.             n_layer          = config["n_layers"],
  262.             n_ctx            = n_ctx,
  263.             n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
  264.             n_head           = (n_head := config["n_heads"]),
  265.             n_head_kv        = config.get("n_kv_heads", n_head),
  266.             f_norm_eps       = config["norm_eps"],
  267.             f_rope_freq_base = config.get("rope_theta"),
  268.         )
  269.  
  270.     @staticmethod
  271.     def load(model_plus: ModelPlus) -> Params:
  272.         hf_config_path   = model_plus.paths[0].parent / "config.json"
  273.         orig_config_path = model_plus.paths[0].parent / "params.json"
  274.  
  275.         if hf_config_path.exists():
  276.             params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
  277.         elif orig_config_path.exists():
  278.             params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
  279.         elif model_plus.format != 'none':
  280.             params = Params.guessed(model_plus.model)
  281.         else:
  282.             raise ValueError('Cannot guess params when model format is none')
  283.  
  284.         params.path_model = model_plus.paths[0].parent
  285.  
  286.         return params
  287.  
  288.  
  289. #
  290. # vocab
  291. #
  292.  
  293. class BpeVocab:
  294.     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
  295.         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
  296.         added_tokens: dict[str, int]
  297.         if fname_added_tokens is not None:
  298.             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
  299.             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
  300.         else:
  301.             # Fall back to trying to find the added tokens in tokenizer.json
  302.             tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
  303.             if not tokenizer_json_file.is_file():
  304.                 added_tokens = {}
  305.             else:
  306.                 tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
  307.                 added_tokens = dict(
  308.                     (item['content'], item['id'])
  309.                     for item in tokenizer_json.get('added_tokens', [])
  310.                     # Added tokens here can be duplicates of the main vocabulary.
  311.                     if item['content'] not in self.bpe_tokenizer )
  312.  
  313.         vocab_size: int = len(self.bpe_tokenizer)
  314.         expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
  315.         actual_ids      = sorted(added_tokens.values())
  316.         if expected_ids != actual_ids:
  317.             expected_end_id = vocab_size + len(actual_ids) - 1
  318.             raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
  319.  
  320.         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
  321.         self.added_tokens_list    = [text for (text, idx) in items]
  322.         self.vocab_size_base: int = vocab_size
  323.         self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
  324.         self.fname_tokenizer      = fname_tokenizer
  325.         self.fname_added_tokens   = fname_added_tokens
  326.  
  327.     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  328.         tokenizer = self.bpe_tokenizer
  329.         from transformers.models.gpt2 import tokenization_gpt2
  330.         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
  331.  
  332.         for i, _ in enumerate(tokenizer):
  333.             yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
  334.  
  335.     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  336.         for text in self.added_tokens_list:
  337.             score = -1000.0
  338.             yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
  339.  
  340.     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  341.         yield from self.bpe_tokens()
  342.         yield from self.added_tokens()
  343.  
  344.     def __repr__(self) -> str:
  345.         return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
  346.  
  347.  
  348. class SentencePieceVocab:
  349.     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
  350.         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
  351.         added_tokens: dict[str, int]
  352.         if fname_added_tokens is not None:
  353.             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
  354.         else:
  355.             added_tokens = {}
  356.  
  357.         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
  358.  
  359.         new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
  360.         expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
  361.         actual_new_ids   = sorted(new_tokens.keys())
  362.  
  363.         if expected_new_ids != actual_new_ids:
  364.             raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
  365.  
  366.         # Token pieces that were added to the base vocabulary.
  367.         self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
  368.         self.vocab_size_base    = vocab_size
  369.         self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
  370.         self.fname_tokenizer    = fname_tokenizer
  371.         self.fname_added_tokens = fname_added_tokens
  372.  
  373.     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  374.         tokenizer = self.sentencepiece_tokenizer
  375.         for i in range(tokenizer.vocab_size()):
  376.             piece = tokenizer.id_to_piece(i)
  377.             text: bytes = piece.encode("utf-8")
  378.             score: float = tokenizer.get_score(i)
  379.  
  380.             toktype = gguf.TokenType.NORMAL
  381.             if tokenizer.is_unknown(i):
  382.                 toktype = gguf.TokenType.UNKNOWN
  383.             if tokenizer.is_control(i):
  384.                 toktype = gguf.TokenType.CONTROL
  385.  
  386.             # NOTE: I think added_tokens are user defined.
  387.             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
  388.             # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
  389.  
  390.             if tokenizer.is_unused(i):
  391.                 toktype = gguf.TokenType.UNUSED
  392.             if tokenizer.is_byte(i):
  393.                 toktype = gguf.TokenType.BYTE
  394.  
  395.             yield text, score, toktype
  396.  
  397.     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  398.         for text in self.added_tokens_list:
  399.             score = -1000.0
  400.             yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
  401.  
  402.     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
  403.         yield from self.sentencepiece_tokens()
  404.         yield from self.added_tokens()
  405.  
  406.     def __repr__(self) -> str:
  407.         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
  408.  
  409. Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
  410.  
  411. #
  412. # data loading
  413. # TODO: reuse (probably move to gguf.py?)
  414. #
  415.  
  416. def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
  417.     #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_head_kv) )
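    # HF checkpoints store each head's rotary dimensions as two contiguous
    # halves, while llama.cpp expects the original interleaved (pairwise)
    # order; reshaping to (n_head, 2, head_dim // 2, ...) and swapping the
    # two middle axes converts between the layouts.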
  418.     if n_head_kv is not None and n_head != n_head_kv:
  419.         n_head = n_head_kv
  420.     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
  421.                 .swapaxes(1, 2)
  422.                 .reshape(weights.shape))
  423.  
  424.  
  425. class Tensor(metaclass=ABCMeta):
  426.     data_type: DataType
  427.  
  428.     @abstractmethod
  429.     def astype(self, data_type: DataType) -> Tensor: ...
  430.     @abstractmethod
  431.     def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
  432.     @abstractmethod
  433.     def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
  434.     @abstractmethod
  435.     def part(self, n_part: int) -> UnquantizedTensor: ...
  436.     @abstractmethod
  437.     def to_ggml(self) -> GGMLCompatibleTensor: ...
  438.  
  439.  
  440. def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
  441.     assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
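    # bf16 is just the high 16 bits of an IEEE-754 float32, so widening to
    # uint32, shifting left by 16 and reinterpreting the bits recovers the
    # exact value (e.g. 0x3F80 -> 0x3F800000 -> 1.0).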
  442.     fp32_arr = bf16_arr.astype(np.uint32) << 16
  443.     return fp32_arr.view(np.float32)
  444.  
  445.  
  446. class UnquantizedTensor(Tensor):
  447.     def __init__(self, ndarray: NDArray) -> None:
  448.         assert isinstance(ndarray, np.ndarray)
  449.         self.ndarray = ndarray
  450.         self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
  451.  
  452.     def astype(self, data_type: DataType) -> Tensor:
  453.         dtype = data_type.dtype
  454.         if self.data_type == DT_BF16:
  455.             self.ndarray = bf16_to_fp32(self.ndarray)
  456.         return UnquantizedTensor(self.ndarray.astype(dtype))
  457.  
  458.     def to_ggml(self) -> UnquantizedTensor:
  459.         return self
  460.  
  461.     def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
  462.         r = self.ndarray.shape[0] // 3
  463.         return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
  464.  
  465.     def part(self, n_part: int) -> UnquantizedTensor:
  466.         r = self.ndarray.shape[0] // 3
  467.         return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
  468.  
  469.     def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
  470.         return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
  471.  
  472.  
  473. def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
  474.     tensor = lazy_tensor.load()
  475.     assert isinstance(tensor, UnquantizedTensor)
  476.  
  477.     # double-check:
  478.     actual_shape = list(tensor.ndarray.shape)
  479.     assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
  480.     if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
  481.         if convert:
  482.             tensor.ndarray = tensor.ndarray.astype(expected_dtype)
  483.         else:
  484.             raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
  485.  
  486.     return tensor.ndarray
  487.  
  488.  
  489. GGMLCompatibleTensor = UnquantizedTensor
  490.  
  491.  
  492. @dataclass
  493. class LazyTensor:
  494.     _load: Callable[[], Tensor]
  495.     shape: list[int]
  496.     data_type: DataType
  497.     description: str
  498.  
  499.     def load(self) -> Tensor:
  500.         ret = self._load()
  501.         # Should be okay if it maps to the same numpy type?
  502.         assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
  503.                 (self.data_type, ret.data_type, self.description)
  504.         return ret
  505.  
  506.     def astype(self, data_type: DataType) -> LazyTensor:
  507.         self.validate_conversion_to(data_type)
  508.  
  509.         def load() -> Tensor:
  510.             return self.load().astype(data_type)
  511.         return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
  512.  
  513.     def validate_conversion_to(self, data_type: DataType) -> None:
  514.         if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
  515.             raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
  516.  
  517.  
  518. LazyModel: TypeAlias = 'dict[str, LazyTensor]'
  519.  
  520.  
  521. @dataclass
  522. class ModelPlus:
  523.     model: LazyModel
  524.     paths: list[Path]  # Where this was read from.
  525.     format: Literal['ggml', 'torch', 'safetensors', 'none']
  526.     vocab: Vocab | None  # For GGML models (which have vocab built in), the vocab.
  527.  
  528.  
  529. def merge_sharded(models: list[LazyModel]) -> LazyModel:
  530.     # Original LLaMA models have each file contain one part of each tensor.
  531.     # Use a dict instead of a set to preserve order.
  532.     names = {name: None for model in models for name in model}
  533.  
  534.     def convert(name: str) -> LazyTensor:
  535.         lazy_tensors: list[LazyTensor] = [model[name] for model in models]
  536.         if len(lazy_tensors) == 1:
  537.             # only one file; don't go through this procedure since there might
  538.             # be quantized tensors
  539.             return lazy_tensors[0]
  540.         if len(lazy_tensors[0].shape) == 1:
  541.             # the tensor is just duplicated in every file
  542.             return lazy_tensors[0]
  543.         if name.startswith('tok_embeddings.') or \
  544.            name.endswith('.attention.wo.weight') or \
  545.            name.endswith('.feed_forward.w2.weight'):
  546.             # split by columns
  547.             axis = 1
  548.         else:
  549.             # split by rows
  550.             axis = 0
  551.         concatenated_shape = list(lazy_tensors[0].shape)
  552.         concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
  553.  
  554.         def load() -> UnquantizedTensor:
  555.             ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
  556.             concatenated: NDArray = np.concatenate(ndarrays, axis=axis)
  557.             return UnquantizedTensor(concatenated)
  558.         description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
  559.         return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
  560.     return {name: convert(name) for name in names}
  561.  
  562.  
  563. def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
  564.     formats = set(mp.format for mp in models_plus)
  565.     assert len(formats) == 1, "different formats?"
  566.     format = formats.pop()
  567.     paths = [path for mp in models_plus for path in mp.paths]
  568.     # Use the first non-None vocab, if any.
  569.     try:
  570.         vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
  571.     except StopIteration:
  572.         vocab = None
  573.  
  574.     if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
  575.         # Transformers models put different tensors in different files, but
  576.         # don't split individual tensors between files.
  577.         model: LazyModel = {}
  578.         for mp in models_plus:
  579.             model.update(mp.model)
  580.     else:
  581.         model = merge_sharded([mp.model for mp in models_plus])
  582.  
  583.     return ModelPlus(model, paths, format, vocab)
  584.  
  585.  
  586. def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
  587.     def load() -> Tensor:
  588.         return lazy_tensor.load().permute(n_head, n_head_kv)
  589.     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
  590.  
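# The two "part" helpers below slice Baichuan-style W_pack tensors, which
# stack the Q, K and V projections along dim 0 (hence the shape[0] // 3).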
  591. def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
  592.     def load() -> Tensor:
  593.         return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
  594.     s = lazy_tensor.shape.copy()
  595.     s[0] = s[0] // 3
  596.     return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
  597.  
  598. def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
  599.     def load() -> Tensor:
  600.         return lazy_tensor.load().part(n_part)
  601.     s = lazy_tensor.shape.copy()
  602.     s[0] = s[0] // 3
  603.     return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
  604.  
  605.  
  606. # Functionality that simulates `torch.load` but where individual tensors are
  607. # only loaded into memory on demand, not all at once.
  608. # PyTorch can't do this natively as of time of writing:
  609. # - https://github.com/pytorch/pytorch/issues/64327
  610. # This allows us to de-shard without multiplying RAM usage, and also
  611. # conveniently drops the PyTorch dependency (though we still need numpy).
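# A .pth/.bin checkpoint is a zip archive containing <name>/data.pkl plus one
# raw file per tensor storage under <name>/data/<key>; the unpickler below
# resolves each pickled storage reference to a lazy reader over that entry.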
  612.  
  613.  
  614. @dataclass
  615. class LazyStorageKind:
  616.     data_type: DataType
  617.  
  618.  
  619. @dataclass
  620. class LazyStorage:
  621.     load: Callable[[int, int], NDArray]
  622.     kind: LazyStorageKind
  623.     description: str
  624.  
  625.  
  626. class LazyUnpickler(pickle.Unpickler):
  627.     def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
  628.         super().__init__(fp)
  629.         self.data_base_path = data_base_path
  630.         self.zip_file = zip_file
  631.  
  632.     def persistent_load(self, pid: Any) -> Any:
  633.         assert pid[0] == 'storage'
  634.         assert isinstance(pid[1], LazyStorageKind)
  635.         data_type = pid[1].data_type
  636.         filename_stem = pid[2]
  637.         filename = f'{self.data_base_path}/{filename_stem}'
  638.         info = self.zip_file.getinfo(filename)
  639.  
  640.         def load(offset: int, elm_count: int) -> NDArray:
  641.             dtype = data_type.dtype
  642.             fp = self.zip_file.open(info)
  643.             fp.seek(offset * dtype.itemsize)
  644.             size = elm_count * dtype.itemsize
  645.             data = fp.read(size)
  646.             assert len(data) == size
  647.             return np.frombuffer(data, dtype)
  648.         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
  649.         return LazyStorage(load=load, kind=pid[1], description=description)
  650.  
  651.     @staticmethod
  652.     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
  653.                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
  654.         assert isinstance(storage, LazyStorage)
  655.  
  656.         def load() -> UnquantizedTensor:
  657.             elm_count = stride[0] * size[0]
  658.             return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
  659.         description = f'pickled storage_offset={storage_offset} in {storage.description}'
  660.         return LazyTensor(load, list(size), storage.kind.data_type, description)
  661.  
  662.     @staticmethod
  663.     def rebuild_from_type_v2(func, new_type, args, state):
  664.         return func(*args)
  665.  
  666.     CLASSES: dict[tuple[str, str], Any] = {
  667.         # getattr used here as a workaround for mypy not being smart enough to determine that
  668.         # the staticmethods have a __func__ attribute.
  669.         ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
  670.         ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
  671.         ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
  672.         ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
  673.         ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
  674.         ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
  675.         ('torch', 'Tensor'): LazyTensor,
  676.     }
  677.  
  678.     def find_class(self, module: str, name: str) -> Any:
  679.         if not module.startswith('torch'):
  680.             return super().find_class(module, name)
  681.         return self.CLASSES[(module, name)]
  682.  
  683.  
  684. def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
  685.     zf = zipfile.ZipFile(outer_fp)
  686.     pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
  687.     assert len(pickle_paths) == 1, pickle_paths
  688.     pickle_fp = zf.open(pickle_paths[0], 'r')
  689.     unpickler = LazyUnpickler(pickle_fp,
  690.                               data_base_path=pickle_paths[0][:-4],
  691.                               zip_file=zf)
  692.     model = unpickler.load()
  693.     if 'model' in model: model = model['model']
  694.     as_dict = dict(model.items())
  695.     return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
  696.  
  697.  
  698. def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
  699.     header_size, = struct.unpack('<Q', fp.read(8))
  700.     header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
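    # safetensors layout: an 8-byte little-endian header length, then a JSON
    # header mapping each tensor name to {"dtype", "shape", "data_offsets"},
    # with offsets relative to the end of the header.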
  701.     # Use mmap for the actual data to avoid race conditions with the file offset.
  702.     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
  703.     byte_buf = mapped[8 + header_size:]
  704.  
  705.     def convert(info: dict[str, Any]) -> LazyTensor:
  706.         data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
  707.         numpy_dtype = data_type.dtype
  708.         shape: list[int] = info['shape']
  709.         begin, end = info['data_offsets']
  710.         assert 0 <= begin <= end <= len(byte_buf)
  711.         assert end - begin == math.prod(shape) * numpy_dtype.itemsize
  712.         buf = byte_buf[begin:end]
  713.  
  714.         def load() -> UnquantizedTensor:
  715.             return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
  716.         description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
  717.         return LazyTensor(load, shape, data_type, description)
  718.     model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
  719.     return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
  720.  
  721.  
  722. def must_read(fp: IO[bytes], length: int) -> bytes:
  723.     ret = fp.read(length)
  724.     if len(ret) < length:
  725.         raise Exception("unexpectedly reached end of file")
  726.     return ret
  727.  
  728.  
  729. @functools.lru_cache(maxsize=None)
  730. def lazy_load_file(path: Path) -> ModelPlus:
  731.     fp = open(path, 'rb')
  732.     first8 = fp.read(8)
  733.     fp.seek(0)
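    # Heuristic: PyTorch checkpoints are zip archives ('PK' magic), while a
    # safetensors file starts with an 8-byte little-endian header length that
    # is far below 16 MiB for any real model.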
  734.     if first8[:2] == b'PK':
  735.         # A zip file, i.e. PyTorch format
  736.         return lazy_load_torch_file(fp, path)
  737.     elif struct.unpack('<Q', first8)[0] < 16 * 1024 * 1024:
  738.         # Probably safetensors
  739.         return lazy_load_safetensors_file(fp, path)
  740.     else:
  741.         raise ValueError(f"unknown format: {path}")
  742.  
  743.  
  744. In = TypeVar('In')
  745. Out = TypeVar('Out')
  746.  
  747. def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
  748.     '''Parallel map, but with backpressure.  If the caller doesn't call `next`
  749.    fast enough, this will stop calling `func` at some point rather than
  750.    letting results pile up in memory.  Specifically, there is a max of one
  751.    output value buffered per thread.'''
  752.     if concurrency < 2:
  753.         yield from map(func, iterable)
  754.         return
  755.     iterable = iter(iterable)
  756.     executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
  757.     if use_processpool_executor:
  758.         executor_class = ProcessPoolExecutor
  759.     else:
  760.         executor_class = ThreadPoolExecutor
  761.     with executor_class(max_workers = max_workers) as executor:
  762.         futures: list[concurrent.futures.Future[Out]] = []
  763.         done = False
  764.         for _ in range(concurrency):
  765.             try:
  766.                 futures.append(executor.submit(func, next(iterable)))
  767.             except StopIteration:
  768.                 done = True
  769.                 break
  770.  
  771.         while futures:
  772.             result = futures.pop(0).result()
  773.             while not done and len(futures) < concurrency:
  774.                 try:
  775.                     futures.append(executor.submit(func, next(iterable)))
  776.                 except StopIteration:
  777.                     done = True
  778.                     break
  779.             yield result
  780.  
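# Illustrative use of bounded_parallel_map (not called anywhere in this
# script): results come back in input order while at most `concurrency`
# inputs are in flight, e.g.
#   for y in bounded_parallel_map(lambda x: x * x, range(100), concurrency=8):
#       print(y)
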
  781. def check_vocab_size(params: Params, vocab: Vocab) -> None:
  782.     if params.n_vocab != vocab.vocab_size:
  783.         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
  784.         if params.n_vocab == vocab.vocab_size_base:
  785.             print("Ignoring added_tokens.json since model matches vocab size without it.")
  786.             vocab.added_tokens_list = []
  787.             vocab.vocab_size = vocab.vocab_size_base
  788.             return
  789.         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
  790.         if vocab.fname_added_tokens is not None:
  791.             msg += f" combined with {vocab.fname_added_tokens}"
  792.         msg += f" has {vocab.vocab_size})."
  793.         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
  794.             msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
  795.         raise Exception(msg)
  796.  
  797.  
  798. class OutputFile:
  799.     def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
  800.         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
  801.  
  802.     def add_meta_arch(self, params: Params) -> None:
  803.         name = "LLaMA"
  804.  
  805.         # TODO: better logic to determine model name
  806.         if params.n_ctx == 4096:
  807.             name = "LLaMA v2"
  808.         elif params.path_model is not None:
  809.             name = str(params.path_model.parent).split('/')[-1]
  810.  
  811.         self.gguf.add_name                (name)
  812.         self.gguf.add_context_length      (params.n_ctx)
  813.         self.gguf.add_embedding_length    (params.n_embd)
  814.         self.gguf.add_block_count         (params.n_layer)
  815.         self.gguf.add_feed_forward_length (params.n_ff)
  816.         self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
  817.         self.gguf.add_head_count          (params.n_head)
  818.         self.gguf.add_head_count_kv       (params.n_head_kv)
  819.         self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)
  820.  
  821.         if params.f_rope_freq_base is not None:
  822.             self.gguf.add_rope_freq_base(params.f_rope_freq_base)
  823.  
  824.         if params.rope_scaling_type:
  825.             assert params.f_rope_scale is not None
  826.             self.gguf.add_rope_scaling_type(params.rope_scaling_type)
  827.             self.gguf.add_rope_scaling_factor(params.f_rope_scale)
  828.  
  829.         if params.n_orig_ctx is not None:
  830.             self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
  831.  
  832.         if params.rope_finetuned is not None:
  833.             self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
  834.  
  835.         if params.ftype is not None:
  836.             self.gguf.add_file_type(params.ftype)
  837.  
  838.     def add_meta_vocab(self, vocab: Vocab) -> None:
  839.         tokens = []
  840.         scores = []
  841.         toktypes = []
  842.         # NOTE: `all_tokens` returns the base vocabulary and added tokens
  843.         for text, score, toktype in vocab.all_tokens():
  844.             tokens.append(text)
  845.             scores.append(score)
  846.             toktypes.append(toktype)
  847.  
  848.         if isinstance(vocab, SentencePieceVocab):
  849.             self.gguf.add_tokenizer_model("llama")
  850.         elif isinstance(vocab, BpeVocab):
  851.             self.gguf.add_tokenizer_model("gpt2")
  852.         else:
  853.             raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
  854.         self.gguf.add_token_list(tokens)
  855.         self.gguf.add_token_scores(scores)
  856.         self.gguf.add_token_types(toktypes)
  857.  
  858.     def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
  859.         svocab.add_to_gguf(self.gguf)
  860.  
  861.     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
  862.         n_elements = int(np.prod(tensor.shape))
  863.         raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
  864.         data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
  865.         data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
  866.         self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype = raw_dtype)
  867.  
  868.     def write_meta(self) -> None:
  869.         self.gguf.write_header_to_file()
  870.         self.gguf.write_kv_data_to_file()
  871.  
  872.     def write_tensor_info(self) -> None:
  873.         self.gguf.write_ti_data_to_file()
  874.  
  875.     def close(self) -> None:
  876.         self.gguf.close()
  877.  
  878.     @staticmethod
  879.     def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
  880.         check_vocab_size(params, vocab)
  881.  
  882.         of = OutputFile(fname_out, endianess=endianess)
  883.  
  884.         # meta data
  885.         of.add_meta_arch(params)
  886.         of.add_meta_vocab(vocab)
  887.         of.add_meta_special_vocab(svocab)
  888.  
  889.         of.write_meta()
  890.  
  891.         of.close()
  892.  
  893.     @staticmethod
  894.     def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
  895.         name, lazy_tensor = item
  896.         tensor = lazy_tensor.load().to_ggml()
  897.         return (lazy_tensor.data_type, tensor.ndarray)
  898.  
  899.     @staticmethod
  900.     def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
  901.         dt, arr = item
  902.         if not isinstance(dt, QuantizedDataType):
  903.             return arr
  904.         return dt.quantize(arr)
  905.  
  906.     @staticmethod
  907.     def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
  908.         check_vocab_size(params, vocab)
  909.  
  910.         of = OutputFile(fname_out, endianess=endianess)
  911.  
  912.         # meta data
  913.         of.add_meta_arch(params)
  914.         of.add_meta_vocab(vocab)
  915.         of.add_meta_special_vocab(svocab)
  916.  
  917.         # tensor info
  918.         for name, lazy_tensor in model.items():
  919.             of.add_tensor_info(name, lazy_tensor)
  920.  
  921.         of.write_meta()
  922.         of.write_tensor_info()
  923.  
  924.         # tensor data
  925.         ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
  926.         if ftype == GGMLFileType.MostlyQ8_0:
  927.             ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
  928.         else:
  929.             ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
  930.  
  931.         start = time.time()
  932.         for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
  933.             elapsed = time.time() - start
  934.             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
  935.             padi = len(str(len(model)))
  936.             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
  937.             of.gguf.write_tensor_data(ndarray)
  938.  
  939.         of.close()
  940.  
  941. def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
  942.     wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
  943.  
  944.     if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
  945.         return GGMLFileType.AllF32
  946.     if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
  947.         return GGMLFileType.MostlyF16
  948.     if output_type_str == "q8_0":
  949.         return GGMLFileType.MostlyQ8_0
  950.  
  951.     name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
  952.  
  953.     raise Exception(f"Unexpected combination of types: {name_to_type}")
  954.  
  955. def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
  956.     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
  957.             for (name, tensor) in model.items()}
  958.  
  959. def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
  960.     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
  961.     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
  962.  
  963.     tmp = model
  964.  
  965.     # HF models permute or pack some of the tensors, so we need to undo that
  966.     for i in itertools.count():
  967.         if f"model.layers.{i}.self_attn.q_proj.weight" in model:
  968.             print(f"Permuting layer {i}")
  969.             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
  970.             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
  971.            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
  972.         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
  973.             print(f"Unpacking and permuting layer {i}")
  974.             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
  975.             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
  976.             tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
  977.             del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
  978.         else:
  979.             break
  980.  
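    # Map checkpoint tensor names to GGUF names, e.g.
    # "model.embed_tokens.weight" -> "token_embd.weight" and
    # "model.layers.0.self_attn.q_proj.weight" -> "blk.0.attn_q.weight".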
  981.     out: LazyModel = {}
  982.     for name, lazy_tensor in model.items():
  983.         tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
  984.         if name_new is None:
  985.             # raise Exception(f"Unexpected tensor name: {name}")
  986.             continue
  987.  
  988.         if tensor_type in should_skip:
  989.             print(f"skipping tensor {name_new}")
  990.             continue
  991.  
  992.         print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
  993.         out[name_new] = lazy_tensor
  994.  
  995.     return out
  996.  
  997. def nth_multifile_path(path: Path, n: int) -> Path | None:
  998.     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
  999.    the nth path in the model.
  1000.    '''
  1001.     # Support the following patterns:
  1002.     patterns: list[tuple[str, str]] = [
  1003.         # - x.00.pth, x.01.pth, etc.
  1004.         (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
  1005.         # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
  1006.         (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
  1007.         # x.bin, x.bin.1, etc.
  1008.         (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
  1009.     ]
  1010.     for regex, replacement in patterns:
  1011.         if re.search(regex, path.name):
  1012.             new_path = path.with_name(re.sub(regex, replacement, path.name))
  1013.             if new_path.exists():
  1014.                 return new_path
  1015.     return None
  1016.  
  1017.  
  1018. def find_multifile_paths(path: Path) -> list[Path]:
  1019.     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
  1020.    the whole list of paths in the model.
  1021.    '''
  1022.     ret: list[Path] = []
  1023.     for i in itertools.count():
  1024.         nth_path = nth_multifile_path(path, i)
  1025.         if nth_path is None:
  1026.             break
  1027.         ret.append(nth_path)
  1028.     if not ret:
  1029.         # No matches.  This should only happen if the file was named, e.g.,
  1030.         # foo.0, and there was no file named foo.  Oh well, try to process it
  1031.         # as a single file.
  1032.         return [path]
  1033.     return ret
  1034.  
  1035.  
  1036. def load_some_model(path: Path) -> ModelPlus:
  1037.     '''Load a model of any supported format.'''
  1038.     # Be extra-friendly and accept either a file or a directory:
  1039.     if path.is_dir():
  1040.         # Check if it's a set of safetensors files first
  1041.         globs = ["model-00001-of-*.safetensors", "model.safetensors"]
  1042.         files = [file for glob in globs for file in path.glob(glob)]
  1043.         if not files:
  1044.             # Try the PyTorch patterns too, with lower priority
  1045.             globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
  1046.             files = [file for glob in globs for file in path.glob(glob)]
  1047.         if not files:
  1048.             raise Exception(f"Can't find model in directory {path}")
  1049.         if len(files) > 1:
  1050.             raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}")
  1051.         path = files[0]
  1052.  
  1053.     paths = find_multifile_paths(path)
  1054.     models_plus: list[ModelPlus] = []
  1055.     for path in paths:
  1056.         print(f"Loading model file {path}")
  1057.         models_plus.append(lazy_load_file(path))
  1058.  
  1059.     model_plus = merge_multifile_models(models_plus)
  1060.     return model_plus
  1061.  
  1062.  
  1063. def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
  1064.     # Be extra-friendly and accept either a file or a directory.  Also, if it's
  1065.     # a directory, it might be the model directory, and tokenizer.model might
  1066.     # be in the parent of that.
  1067.     if path.is_dir():
  1068.         vocab_file = "tokenizer.model"
  1069.         if vocabtype == 'bpe':
  1070.             vocab_file = "vocab.json"
  1071.         path2 = path / vocab_file
  1072.         # Use `.parent` instead of /.. to handle the symlink case better.
  1073.         path3 = path.parent / vocab_file
  1074.         if path2.exists():
  1075.             path = path2
  1076.         elif path3.exists():
  1077.             path = path3
  1078.         else:
  1079.             raise FileNotFoundError(
  1080.                 f"Could not find {vocab_file} in {path} or its parent; "
  1081.                 "if it's in another directory, pass the directory as --vocab-dir")
  1082.  
  1083.     print(f"Loading vocab file '{path}', type '{vocabtype}'")
  1084.  
  1085.     added_tokens_path = path.parent / "added_tokens.json"
  1086.     if vocabtype == "bpe":
  1087.         return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
  1088.     elif vocabtype == "spm":
  1089.         return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
  1090.     else:
  1091.         raise ValueError(f"Unsupported vocabulary type {vocabtype}")
  1092.  
  1093.  
  1094. def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
  1095.     namestr = {
  1096.         GGMLFileType.AllF32:    "f32",
  1097.         GGMLFileType.MostlyF16: "f16",
  1098.         GGMLFileType.MostlyQ8_0:"q8_0",
  1099.     }[file_type]
  1100.     ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
  1101.     if ret in model_paths:
  1102.         sys.stderr.write(
  1103.             f"Error: Default output path ({ret}) would overwrite the input. "
  1104.             "Please explicitly specify a path using --outfile.\n")
  1105.         sys.exit(1)
  1106.     return ret
  1107.  
  1108.  
  1109. def do_dump_model(model_plus: ModelPlus) -> None:
  1110.     print(f"model_plus.paths = {model_plus.paths!r}")
  1111.     print(f"model_plus.format = {model_plus.format!r}")
  1112.     print(f"model_plus.vocab = {model_plus.vocab!r}")
  1113.     for name, lazy_tensor in model_plus.model.items():
  1114.         print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
  1115.  
  1116.  
  1117. def main(args_in: list[str] | None = None) -> None:
  1118.     output_choices = ["f32", "f16"]
  1119.     if np.uint32(1) == np.uint32(1).newbyteorder("<"):
  1120.         # We currently only support Q8_0 output on little endian systems.
  1121.         output_choices.append("q8_0")
  1122.     parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGUF compatible file")
  1123.     parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
  1124.     parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
  1125.     parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
  1126.     parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
  1127.     parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
  1128.     parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
  1129.     parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
  1130.     parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
  1131.     parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
  1132.     parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
  1133.     parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
  1134.  
  1135.     args = parser.parse_args(args_in)
  1136.     if args.dump_single:
  1137.         model_plus = lazy_load_file(args.model)
  1138.         do_dump_model(model_plus)
  1139.         return
  1140.  
  1141.     if not args.vocab_only:
  1142.         model_plus = load_some_model(args.model)
  1143.     else:
  1144.         model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
  1145.  
  1146.     if args.dump:
  1147.         do_dump_model(model_plus)
  1148.         return
  1149.     endianess = gguf.GGUFEndian.LITTLE
  1150.     if args.bigendian:
  1151.         endianess = gguf.GGUFEndian.BIG
  1152.  
  1153.     params = Params.load(model_plus)
  1154.     if params.n_ctx == -1:
  1155.         if args.ctx is None:
  1156.             raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
  1157.                             "Please specify one with --ctx:\n"
  1158.                             " - LLaMA v1: --ctx 2048\n"
  1159.                             " - LLaMA v2: --ctx 4096\n")
  1160.         params.n_ctx = args.ctx
  1161.  
  1162.     if args.outtype:
  1163.         params.ftype = {
  1164.             "f32": GGMLFileType.AllF32,
  1165.             "f16": GGMLFileType.MostlyF16,
  1166.             "q8_0": GGMLFileType.MostlyQ8_0,
  1167.         }[args.outtype]
  1168.  
  1169.     print(f"params = {params}")
  1170.  
  1171.     vocab: Vocab
  1172.     if args.vocab_only:
  1173.         if not args.outfile:
  1174.             raise ValueError("need --outfile if using --vocab-only")
  1175.         # FIXME: Try to respect vocab_dir somehow?
  1176.         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
  1177.         special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
  1178.             load_merges = args.vocabtype == 'bpe',
  1179.             n_vocab = vocab.vocab_size)
  1180.         outfile = args.outfile
  1181.         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
  1182.         print(f"Wrote {outfile}")
  1183.         return
  1184.  
  1185.     if model_plus.vocab is not None and args.vocab_dir is None:
  1186.         vocab = model_plus.vocab
  1187.     else:
  1188.         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
  1189.         vocab = load_vocab(vocab_dir, args.vocabtype)
  1190.     # FIXME: Try to respect vocab_dir somehow?
  1191.     special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
  1192.         load_merges = args.vocabtype == 'bpe',
  1193.         n_vocab = vocab.vocab_size)
  1194.  
  1195.     model   = model_plus.model
  1196.     model   = convert_model_names(model, params)
  1197.     ftype   = pick_output_type(model, args.outtype)
  1198.     model   = convert_to_output_type(model, ftype)
  1199.     outfile = args.outfile or default_outfile(model_plus.paths, ftype)
  1200.  
  1201.     params.ftype = ftype
  1202.     print(f"Writing {outfile}, format {ftype}")
  1203.  
  1204.     OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
  1205.     print(f"Wrote {outfile}")
  1206.  
  1207.  
  1208. if __name__ == '__main__':
  1209.     main()