taskset -c 0-63 /home/lissanro/pkgs/ik_llama.cpp/build/bin/llama-server \
    --model /home/lissanro/neuro/DeepSeek-V3-0324-GGUF-UD-Q4_K_R4-163840seq/DeepSeek-V3-0324-GGUF-UD-Q4_K_R4.gguf \
    --ctx-size 81920 --n-gpu-layers 62 --tensor-split 25,25,25,25 \
    -mla 2 -fa -ctk q8_0 -amb 2048 -fmoe \
    --override-tensor "ffn_down_exps=CPU, ffn_up_exps=CPU, gate_exps=CPU" \
    --threads 64 --host 0.0.0.0 --port 5000
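Once the server is up it exposes the standard llama.cpp HTTP API on port 5000 (the requests logged near the end of this paste hit POST /completion). A minimal client sketch, stdlib only; the prompt and n_predict values are placeholders, not taken from this session:

    import json, urllib.request

    # Illustrative request against the server started above. /completion and
    # the "prompt"/"n_predict" fields are the stock llama.cpp server API,
    # which ik_llama.cpp inherits.
    payload = {"prompt": "Hello", "n_predict": 64}
    req = urllib.request.Request(
        "http://127.0.0.1:5000/completion",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["content"])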
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 4 CUDA devices:
- Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
- Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
- Device 2: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
- Device 3: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
- INFO [ main] build info | tid="140708181868544" timestamp=1744369384 build=3630 commit="5f44f4b3"
- INFO [ main] system info | tid="140708181868544" timestamp=1744369384 n_threads=64 n_threads_batch=-1 total_threads=128 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | "
- llama_model_loader: loaded meta data with 46 key-value pairs and 1025 tensors from /home/lissanro/neuro/DeepSeek-V3-0324-GGUF-UD-Q4_K_R4-163840seq/DeepSeek-V3-0324-GGUF-UD-Q4_K_R4.gguf (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = deepseek2
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = DeepSeek V3 0324 BF16
- llama_model_loader: - kv 3: general.quantized_by str = Unsloth
- llama_model_loader: - kv 4: general.size_label str = 256x20B
- llama_model_loader: - kv 5: general.license str = mit
- llama_model_loader: - kv 6: general.repo_url str = https://huggingface.co/unsloth
- llama_model_loader: - kv 7: deepseek2.block_count u32 = 61
- llama_model_loader: - kv 8: deepseek2.context_length u32 = 163840
- llama_model_loader: - kv 9: deepseek2.embedding_length u32 = 7168
- llama_model_loader: - kv 10: deepseek2.feed_forward_length u32 = 18432
- llama_model_loader: - kv 11: deepseek2.attention.head_count u32 = 128
- llama_model_loader: - kv 12: deepseek2.attention.head_count_kv u32 = 128
- llama_model_loader: - kv 13: deepseek2.rope.freq_base f32 = 10000.000000
- llama_model_loader: - kv 14: deepseek2.attention.layer_norm_rms_epsilon f32 = 0.000001
- llama_model_loader: - kv 15: deepseek2.expert_used_count u32 = 8
- llama_model_loader: - kv 16: deepseek2.leading_dense_block_count u32 = 3
- llama_model_loader: - kv 17: deepseek2.vocab_size u32 = 129280
- llama_model_loader: - kv 18: deepseek2.attention.q_lora_rank u32 = 1536
- llama_model_loader: - kv 19: deepseek2.attention.kv_lora_rank u32 = 512
- llama_model_loader: - kv 20: deepseek2.attention.key_length u32 = 192
- llama_model_loader: - kv 21: deepseek2.attention.value_length u32 = 128
- llama_model_loader: - kv 22: deepseek2.expert_feed_forward_length u32 = 2048
- llama_model_loader: - kv 23: deepseek2.expert_count u32 = 256
- llama_model_loader: - kv 24: deepseek2.expert_shared_count u32 = 1
- llama_model_loader: - kv 25: deepseek2.expert_weights_scale f32 = 2.500000
- llama_model_loader: - kv 26: deepseek2.expert_weights_norm bool = true
- llama_model_loader: - kv 27: deepseek2.expert_gating_func u32 = 2
- llama_model_loader: - kv 28: deepseek2.rope.dimension_count u32 = 64
- llama_model_loader: - kv 29: deepseek2.rope.scaling.type str = yarn
- llama_model_loader: - kv 30: deepseek2.rope.scaling.factor f32 = 40.000000
- llama_model_loader: - kv 31: deepseek2.rope.scaling.original_context_length u32 = 4096
- llama_model_loader: - kv 32: deepseek2.rope.scaling.yarn_log_multiplier f32 = 0.100000
- llama_model_loader: - kv 33: tokenizer.ggml.model str = gpt2
- llama_model_loader: - kv 34: tokenizer.ggml.pre str = deepseek-v3
- llama_model_loader: - kv 35: tokenizer.ggml.tokens arr[str,129280] = ["<|begin▁of▁sentence|>", "<�...
- llama_model_loader: - kv 36: tokenizer.ggml.token_type arr[i32,129280] = [3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
- llama_model_loader: - kv 37: tokenizer.ggml.merges arr[str,127741] = ["Ġ t", "Ġ a", "i n", "Ġ Ġ", "h e...
- llama_model_loader: - kv 38: tokenizer.ggml.bos_token_id u32 = 0
- llama_model_loader: - kv 39: tokenizer.ggml.eos_token_id u32 = 1
- llama_model_loader: - kv 40: tokenizer.ggml.padding_token_id u32 = 1
- llama_model_loader: - kv 41: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 42: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 43: tokenizer.chat_template str = {% if not add_generation_prompt is de...
- llama_model_loader: - kv 44: general.quantization_version u32 = 2
- llama_model_loader: - kv 45: general.file_type u32 = 214
- llama_model_loader: - type f32: 361 tensors
- llama_model_loader: - type q4_K: 306 tensors
- llama_model_loader: - type q6_K: 184 tensors
- llama_model_loader: - type q4_k_r4: 147 tensors
- llama_model_loader: - type q6_k_r4: 27 tensors
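As a quick sanity check, the per-type tensor counts above add up to the 1025 tensors the loader reported reading from the GGUF:

    # 361 f32 + 306 q4_K + 184 q6_K + 147 q4_k_r4 + 27 q6_k_r4
    assert 361 + 306 + 184 + 147 + 27 == 1025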
- llm_load_vocab: special tokens cache size = 818
- llm_load_vocab: token to piece cache size = 0.8223 MB
- llm_load_print_meta: format = GGUF V3 (latest)
- llm_load_print_meta: arch = deepseek2
- llm_load_print_meta: vocab type = BPE
- llm_load_print_meta: n_vocab = 129280
- llm_load_print_meta: n_merges = 127741
- llm_load_print_meta: vocab_only = 0
- llm_load_print_meta: n_ctx_train = 163840
- llm_load_print_meta: n_embd = 7168
- llm_load_print_meta: n_layer = 61
- llm_load_print_meta: n_head = 128
- llm_load_print_meta: n_head_kv = 128
- llm_load_print_meta: n_rot = 64
- llm_load_print_meta: n_swa = 0
- llm_load_print_meta: n_embd_head_k = 192
- llm_load_print_meta: n_embd_head_v = 128
- llm_load_print_meta: n_gqa = 1
- llm_load_print_meta: n_embd_k_gqa = 24576
- llm_load_print_meta: n_embd_v_gqa = 16384
- llm_load_print_meta: f_norm_eps = 0.0e+00
- llm_load_print_meta: f_norm_rms_eps = 1.0e-06
- llm_load_print_meta: f_clamp_kqv = 0.0e+00
- llm_load_print_meta: f_max_alibi_bias = 0.0e+00
- llm_load_print_meta: f_logit_scale = 0.0e+00
- llm_load_print_meta: n_ff = 18432
- llm_load_print_meta: n_expert = 256
- llm_load_print_meta: n_expert_used = 8
- llm_load_print_meta: causal attn = 1
- llm_load_print_meta: pooling type = 0
- llm_load_print_meta: rope type = 0
- llm_load_print_meta: rope scaling = yarn
- llm_load_print_meta: freq_base_train = 10000.0
- llm_load_print_meta: freq_scale_train = 0.025
- llm_load_print_meta: n_ctx_orig_yarn = 4096
- llm_load_print_meta: rope_finetuned = unknown
- llm_load_print_meta: ssm_d_conv = 0
- llm_load_print_meta: ssm_d_inner = 0
- llm_load_print_meta: ssm_d_state = 0
- llm_load_print_meta: ssm_dt_rank = 0
- llm_load_print_meta: model type = 671B
- llm_load_print_meta: model ftype = Q4_K_R4
- llm_load_print_meta: model params = 671.026 B
- llm_load_print_meta: model size = 377.065 GiB (4.827 BPW)
- llm_load_print_meta: repeating layers = 375.872 GiB (4.825 BPW, 669.173 B parameters)
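The reported bits-per-weight figure is just file size over parameter count; recomputing it from the two numbers printed above:

    # bits-per-weight = size_in_bytes * 8 / n_params
    size_gib, n_params = 377.065, 671.026e9
    bpw = size_gib * 2**30 * 8 / n_params
    print(f"{bpw:.3f}")  # -> 4.827, matching the reported BPW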
- llm_load_print_meta: general.name = DeepSeek V3 0324 BF16
- llm_load_print_meta: BOS token = 0 '<|begin▁of▁sentence|>'
- llm_load_print_meta: EOS token = 1 '<|end▁of▁sentence|>'
- llm_load_print_meta: PAD token = 1 '<|end▁of▁sentence|>'
- llm_load_print_meta: LF token = 131 'Ä'
- llm_load_print_meta: max token length = 256
- llm_load_print_meta: n_layer_dense_lead = 3
- llm_load_print_meta: n_lora_q = 1536
- llm_load_print_meta: n_lora_kv = 512
- llm_load_print_meta: n_ff_exp = 2048
- llm_load_print_meta: n_expert_shared = 1
- llm_load_print_meta: expert_weights_scale = 2.5
- llm_load_print_meta: expert_weights_norm = 1
- llm_load_print_meta: expert_gating_func = sigmoid
- llm_load_print_meta: rope_yarn_log_mul = 0.1000
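The RoPE/YaRN values above are internally consistent: the original 4096-token training context times the YaRN scaling factor of 40 gives the 163840-token n_ctx_train, and freq_scale_train is simply the reciprocal of that factor:

    assert 4096 * 40 == 163840           # n_ctx_orig_yarn * scaling.factor
    assert abs(1 / 40 - 0.025) < 1e-12   # freq_scale_train = 1 / scaling.factor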
- llm_load_tensors: ggml ctx size = 2.12 MiB
- Tensor blk.3.ffn_gate_exps.weight buffer type overriden to CPU
- Tensor blk.3.ffn_down_exps.weight buffer type overriden to CPU
- Tensor blk.3.ffn_up_exps.weight buffer type overriden to CPU
- [identical "buffer type overriden to CPU" lines repeat for the gate/down/up expert tensors of blk.4 through blk.60 -- 174 expert tensors overridden to CPU in total]
- llm_load_tensors: offloading 61 repeating layers to GPU
- llm_load_tensors: offloading non-repeating layers to GPU
- llm_load_tensors: offloaded 62/62 layers to GPU
- llm_load_tensors: CPU buffer size = 383727.50 MiB
- llm_load_tensors: CPU buffer size = 497.11 MiB
- llm_load_tensors: CUDA0 buffer size = 2869.57 MiB
- llm_load_tensors: CUDA1 buffer size = 2097.14 MiB
- llm_load_tensors: CUDA2 buffer size = 2236.95 MiB
- llm_load_tensors: CUDA3 buffer size = 2682.31 MiB
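Summing the weight buffers shows where the model actually lives: roughly 375 GiB of expert weights in host RAM (the --override-tensor patterns above) and only ~2-3 GiB of attention, dense-FFN and shared tensors per GPU. The total comes out slightly above the 377 GiB file size, presumably because the buffers also cover derived tensors (such as the wk_b/v_b tensors computed below) and allocation padding:

    mib = [383727.50, 497.11, 2869.57, 2097.14, 2236.95, 2682.31]
    print(f"{sum(mib) / 1024:.1f} GiB")  # -> ~384.9 GiB of resident weight buffers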
- ....................................................................................................
- ============ llm_load_tensors: need to compute 61 wk_b tensors
- Computed blk.0.attn_v_b.weight as 128 x 512 x 128 and stored in buffer CUDA0
- [identical lines repeat for all 61 layers: blk.0-15 stored in CUDA0, blk.16-30 in CUDA1, blk.31-46 in CUDA2, blk.47-60 in CUDA3]
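Each computed attn_v_b tensor is 128 x 512 x 128, about 8.4M elements per layer. A rough size estimate (the log does not state the storage type; 2 bytes per element, i.e. f16, is an assumption here) puts the 61 tensors at just under 1 GiB spread across the four cards:

    n = 61 * 128 * 512 * 128               # total elements across all layers
    print(n, f"{n * 2 / 2**30:.2f} GiB")   # ~512M elements; ~0.95 GiB if f16 (assumed)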
- llama_new_context_with_model: n_ctx = 81920
- llama_new_context_with_model: n_batch = 2048
- llama_new_context_with_model: n_ubatch = 512
- llama_new_context_with_model: flash_attn = 1
- llama_new_context_with_model: mla_attn = 2
- llama_new_context_with_model: attn_max_b = 2048
- llama_new_context_with_model: fused_moe = 1
- llama_new_context_with_model: ser = -1, 0
- llama_new_context_with_model: freq_base = 10000.0
- llama_new_context_with_model: freq_scale = 0.025
- llama_kv_cache_init: layer 0: n_embd_head_qk_rope = 64, kv_lora_rank = 512
- [the same line repeats with identical values for layers 1 through 60]
- llama_kv_cache_init: CUDA0 KV buffer size = 765.01 MiB
- llama_kv_cache_init: CUDA1 KV buffer size = 717.19 MiB
- llama_kv_cache_init: CUDA2 KV buffer size = 765.01 MiB
- llama_kv_cache_init: CUDA3 KV buffer size = 669.38 MiB
- llama_new_context_with_model: KV self size = 2916.56 MiB, c^KV (q8_0): 2916.56 MiB, kv^T: not used
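The 2916.56 MiB KV size follows directly from the MLA cache layout: with mla_attn = 2 only the compressed c^KV is stored, i.e. kv_lora_rank + n_embd_head_qk_rope = 512 + 64 = 576 values per token per layer, quantized to q8_0 (34 bytes per block of 32 values):

    ctx, layers, per_tok = 81920, 61, 512 + 64
    bytes_total = ctx * layers * per_tok * (34 / 32)  # q8_0: 34 bytes / 32 values
    print(f"{bytes_total / 2**20:.2f} MiB")           # -> 2916.56 MiB, as reported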
- llama_new_context_with_model: CUDA_Host output buffer size = 0.99 MiB
- llama_new_context_with_model: pipeline parallelism enabled (n_copies=4)
- llama_new_context_with_model: CUDA0 compute buffer size = 14443.01 MiB
- llama_new_context_with_model: CUDA1 compute buffer size = 14930.04 MiB
- llama_new_context_with_model: CUDA2 compute buffer size = 15378.04 MiB
- llama_new_context_with_model: CUDA3 compute buffer size = 14482.05 MiB
- llama_new_context_with_model: CUDA_Host compute buffer size = 4147.80 MiB
- llama_new_context_with_model: graph nodes = 8245
- llama_new_context_with_model: graph splits = 121
- INFO [ init] initializing slots | tid="140708181868544" timestamp=1744369674 n_slots=1
- INFO [ init] new slot | tid="140708181868544" timestamp=1744369674 id_slot=0 n_ctx_slot=81920
- INFO [ main] model loaded | tid="140708181868544" timestamp=1744369674
- INFO [ main] chat template | tid="140708181868544" timestamp=1744369674 chat_example="You are a helpful assistant\n\n<|User|>Hello<|Assistant|>Hi there<|end▁of▁sentence|><|User|>How are you?<|Assistant|>" built_in=true
- INFO [ main] HTTP server listening | tid="140708181868544" timestamp=1744369674 n_threads_http="127" port="5000" hostname="0.0.0.0"
- INFO [ update_slots] all slots are idle | tid="140708181868544" timestamp=1744369674
- INFO [ launch_slot_with_task] slot is processing task | tid="140708181868544" timestamp=1744369784 id_slot=0 id_task=0
- INFO [ update_slots] kv cache rm [p0, end) | tid="140708181868544" timestamp=1744369784 id_slot=0 id_task=0 p0=0
- INFO [ print_timings] prompt eval time = 9465.24 ms / 572 tokens ( 16.55 ms per token, 60.43 tokens per second) | tid="140708181868544" timestamp=1744369869 id_slot=0 id_task=0 t_prompt_processing=9465.241 n_prompt_tokens_processed=572 t_token=16.547624125874126 n_tokens_second=60.43163613055389
- INFO [ print_timings] generation eval time = 75071.14 ms / 270 runs ( 278.04 ms per token, 3.60 tokens per second) | tid="140708181868544" timestamp=1744369869 id_slot=0 id_task=0 t_token_generation=75071.144 n_decoded=270 t_token=278.04127407407407 n_tokens_second=3.5965883242701087
- INFO [ print_timings] total time = 84536.38 ms | tid="140708181868544" timestamp=1744369869 id_slot=0 id_task=0 t_prompt_processing=9465.241 t_token_generation=75071.144 t_total=84536.385
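The reported rates are simply token counts over elapsed time; recomputing them from the raw timing fields:

    print(f"{572 / 9.465241:.2f} tok/s prompt")       # -> 60.43
    print(f"{270 / 75.071144:.2f} tok/s generation")  # -> 3.60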
- INFO [ update_slots] slot released | tid="140708181868544" timestamp=1744369869 id_slot=0 id_task=0 n_ctx=81920 n_past=841 n_system_tokens=0 n_cache_tokens=841 truncated=false
- INFO [ update_slots] all slots are idle | tid="140708181868544" timestamp=1744369869
- INFO [ log_server_request] request | tid="140302102822912" timestamp=1744369869 remote_addr="127.0.0.1" remote_port=42546 status=200 method="POST" path="/completion" params={}
- INFO [ update_slots] all slots are idle | tid="140708181868544" timestamp=1744369869
- INFO [ launch_slot_with_task] slot is processing task | tid="140708181868544" timestamp=1744369909 id_slot=0 id_task=272
- INFO [ update_slots] we have to evaluate at least 1 token to generate logits | tid="140708181868544" timestamp=1744369909 id_slot=0 id_task=272
- INFO [ update_slots] kv cache rm [p0, end) | tid="140708181868544" timestamp=1744369909 id_slot=0 id_task=272 p0=571
- INFO [ print_timings] prompt eval time = 525.53 ms / 1 tokens ( 525.53 ms per token, 1.90 tokens per second) | tid="140708181868544" timestamp=1744369983 id_slot=0 id_task=272 t_prompt_processing=525.532 n_prompt_tokens_processed=1 t_token=525.532 n_tokens_second=1.9028336999459594
- INFO [ print_timings] generation eval time = 73892.48 ms / 268 runs ( 275.72 ms per token, 3.63 tokens per second) | tid="140708181868544" timestamp=1744369983 id_slot=0 id_task=272 t_token_generation=73892.479 n_decoded=268 t_token=275.7182052238806 n_tokens_second=3.626891445880439
- INFO [ print_timings] total time = 74418.01 ms | tid="140708181868544" timestamp=1744369983 id_slot=0 id_task=272 t_prompt_processing=525.532 t_token_generation=73892.479 t_total=74418.01100000001
- INFO [ update_slots] slot released | tid="140708181868544" timestamp=1744369983 id_slot=0 id_task=272 n_ctx=81920 n_past=839 n_system_tokens=0 n_cache_tokens=839 truncated=false
- INFO [ update_slots] all slots are idle | tid="140708181868544" timestamp=1744369983
- INFO [ log_server_request] request | tid="140302050394112" timestamp=1744369983 remote_addr="127.0.0.1" remote_port=42550 status=200 method="POST" path="/completion" params={}
- INFO [ update_slots] all slots are idle | tid="140708181868544" timestamp=1744369983
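Note the second request: p0=571 in the "kv cache rm" line means 571 prompt tokens were reused from the KV cache and only one token had to be evaluated, which is why prompt processing dropped from ~9.5 s to ~0.5 s. The usual way a client opts into this reuse in the llama.cpp server API (which this fork inherits) is the cache_prompt flag on /completion; a sketch, sent exactly like the earlier request example:

    # Same payload as the earlier sketch, plus the stock llama.cpp
    # "cache_prompt" option, which asks the server to keep and reuse the
    # longest matching prompt prefix across requests.
    payload = {"prompt": "Hello", "n_predict": 64, "cache_prompt": True}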
- ^CINFO [ update_slots] all slots are idle | tid="140708181868544" timestamp=1744370065