- time=2025-07-19T17:01:40.559+02:00 level=INFO source=routes.go:1235 msg="server config" env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:INFO OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://0.0.0.0:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:C:\\Users\\Haldi\\.ollama\\models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES:]"
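Annotation: this first "server config" line is the one to check when diagnosing the behaviour below; note OLLAMA_HOST:http://0.0.0.0:11434 (the API is bound to all interfaces, which explains the LAN and public-IP clients later) and OLLAMA_CONTEXT_LENGTH:4096 (the source of the context warnings during model load). A minimal sketch for extracting the key=value pairs from such a line, as an illustration only (parse_env_map is a hypothetical helper, not part of Ollama):

    import re

    def parse_env_map(log_line: str) -> dict:
        """Pull the key=value pairs out of the env="map[...]" blob in an
        Ollama "server config" log line. Values may be empty."""
        m = re.search(r'env="map\[(.*)\]"', log_line)
        if not m:
            return {}
        # A key is an upper-case identifier followed by ':'; the value runs
        # until the next key or the end of the blob.
        pairs = re.findall(r'([A-Z_]+):(.*?)(?=\s[A-Z_]+:|$)', m.group(1))
        return {k: v.strip() for k, v in pairs}

    # e.g. parse_env_map(line)["OLLAMA_CONTEXT_LENGTH"] == '4096'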
- time=2025-07-19T17:01:40.560+02:00 level=INFO source=images.go:476 msg="total blobs: 0"
- time=2025-07-19T17:01:40.560+02:00 level=INFO source=images.go:483 msg="total unused blobs removed: 0"
- time=2025-07-19T17:01:40.561+02:00 level=INFO source=routes.go:1288 msg="Listening on [::]:11434 (version 0.9.6)"
- time=2025-07-19T17:01:40.561+02:00 level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
- time=2025-07-19T17:01:40.561+02:00 level=INFO source=gpu_windows.go:167 msg=packages count=1
- time=2025-07-19T17:01:40.561+02:00 level=INFO source=gpu_windows.go:214 msg="" package=0 cores=16 efficiency=0 threads=32
- time=2025-07-19T17:01:41.144+02:00 level=INFO source=amd_windows.go:127 msg="unsupported Radeon iGPU detected skipping" id=0 total="18.0 GiB"
- time=2025-07-19T17:01:41.146+02:00 level=INFO source=types.go:130 msg="inference compute" id=GPU-ebed2943-db15-05b0-424c-c3b47e1679a0 library=cuda variant=v12 compute=8.9 driver=12.8 name="NVIDIA GeForce RTX 4090" total="24.0 GiB" available="22.5 GiB"
- [GIN] 2025/07/19 - 17:02:25 | 200 | 0s | 127.0.0.1 | GET "/"
- [GIN] 2025/07/19 - 17:02:25 | 200 | 0s | 127.0.0.1 | GET "/"
- [GIN] 2025/07/19 - 17:02:25 | 404 | 0s | 127.0.0.1 | GET "/favicon.ico"
- [GIN] 2025/07/19 - 17:18:53 | 200 | 0s | 192.168.1.2 | GET "/api/version"
- [GIN] 2025/07/19 - 17:18:58 | 200 | 529.5µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:18:58 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:18:59 | 200 | 0s | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:18:59 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:19:04 | 200 | 0s | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:19:11 | 400 | 0s | 192.168.1.2 | POST "/api/pull"
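Annotation: the 400 above means the first pull request was rejected before any download started (typically a malformed JSON body or an invalid model name); the retry at 17:24 goes through and fetches the five blobs below. A well-formed pull looks roughly like the following sketch; the host and the model tag are assumptions, since the log only records blob digests (the metadata later identifies the model as Mixtral 8x7B Instruct v0.1, Q4_0):

    import json
    import urllib.request

    # POST /api/pull streams newline-delimited JSON progress objects.
    body = json.dumps({"model": "mixtral:8x7b-instruct-v0.1-q4_0"}).encode()
    req = urllib.request.Request(
        "http://localhost:11434/api/pull",      # assumed server address
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        for line in resp:
            print(json.loads(line).get("status"))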
- time=2025-07-19T17:24:16.826+02:00 level=INFO source=download.go:177 msg="downloading f2dc41fa964b in 27 1 GB part(s)"
- time=2025-07-19T17:28:41.677+02:00 level=INFO source=download.go:177 msg="downloading 53d74de0d84c in 1 84 B part(s)"
- time=2025-07-19T17:28:43.099+02:00 level=INFO source=download.go:177 msg="downloading 43070e2d4e53 in 1 11 KB part(s)"
- time=2025-07-19T17:28:44.471+02:00 level=INFO source=download.go:177 msg="downloading ed11eda7790d in 1 30 B part(s)"
- time=2025-07-19T17:28:45.826+02:00 level=INFO source=download.go:177 msg="downloading deae14c19dac in 1 486 B part(s)"
- [GIN] 2025/07/19 - 17:29:10 | 200 | 0s | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:29:11 | 200 | 4m56s | 192.168.1.2 | POST "/api/pull"
- [GIN] 2025/07/19 - 17:29:45 | 200 | 521.4µs | 192.168.1.2 | GET "/api/version"
- [GIN] 2025/07/19 - 17:29:46 | 200 | 548.3µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:29:46 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- time=2025-07-19T17:30:01.377+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="29.6 GiB" free_swap="29.1 GiB"
- time=2025-07-19T17:30:01.378+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=22 layers.split="" memory.available="[19.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="27.1 GiB" memory.required.partial="19.2 GiB" memory.required.kv="512.0 MiB" memory.required.allocations="[19.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="296.0 MiB" memory.graph.partial="830.0 MiB"
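Annotation: this offload line shows the scheduler's decision in full: the model needs memory.required.full = 27.1 GiB but only 19.4 GiB of VRAM is available, so it falls back to a partial load and picks the largest layer count whose estimate fits, here 22 of 33 layers (32 transformer blocks plus the output layer) at memory.required.partial = 19.2 GiB. A rough cross-check of the per-block weight cost against the buffer sizes reported further down:

    # Figures from the offload line above and the load_tensors lines below.
    repeating_gib = 24.5                        # memory.weights.repeating, 32 blocks
    per_block_mib = repeating_gib * 1024 / 32   # ~784 MiB per transformer block
    print(round(22 * per_block_mib))            # ~17248 MiB, vs. the 17218.44 MiB
                                                # "CUDA0 model buffer" reported below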
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
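Annotation: two sanity checks on the metadata dump above. First, the 4.53 BPW figure follows directly from file size and parameter count and reflects the mixed quantization (q4_0 for the bulk of the tensors, q8_0 and q6_K for a few). Second, llama.expert_count = 8 with llama.expert_used_count = 2 marks this as a sparse MoE: all 46.70 B parameters must sit in memory, but only two experts run per token, so per-token compute is a fraction of what a dense model of this size would need.

    file_size_bits = 24.63 * 2**30 * 8   # print_info: file size = 24.63 GiB
    params = 46.70e9                     # print_info: model params = 46.70 B
    print(file_size_bits / params)       # ~4.53 bits per weight, as reported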
- time=2025-07-19T17:30:01.412+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 4096 --batch-size 512 --n-gpu-layers 22 --threads 16 --no-mmap --parallel 1 --port 51719"
- time=2025-07-19T17:30:01.416+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:30:01.416+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:30:01.416+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:30:01.447+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:30:09.855+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:30:09.856+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:51719"
- time=2025-07-19T17:30:09.934+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 22 repeating layers to GPU
- load_tensors: offloaded 22/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7999.43 MiB
- load_tensors: CUDA0 model buffer size = 17218.44 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 4096
- llama_context: n_ctx_per_seq = 4096
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 352.00 MiB
- llama_kv_cache_unified: CPU KV buffer size = 160.00 MiB
- llama_kv_cache_unified: KV self size = 512.00 MiB, K (f16): 256.00 MiB, V (f16): 256.00 MiB
- llama_context: CUDA0 compute buffer size = 397.00 MiB
- llama_context: CUDA_Host compute buffer size = 16.01 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 124 (with bs=512), 3 (with bs=1)
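Annotation: the KV cache figures above are exactly what the model geometry predicts. With kv_size = 4096, n_layer = 32, n_embd_k_gqa = n_embd_v_gqa = 1024 and f16 (2-byte) cache entries, K and V come to 256 MiB each, and the cache is split across devices in proportion to the offloaded layers:

    kv_size, n_layer, n_embd_kv, f16 = 4096, 32, 1024, 2   # from the lines above
    k_mib = kv_size * n_layer * n_embd_kv * f16 / 2**20
    print(k_mib)                          # 256.0 MiB for K; V is the same -> 512 MiB total
    print(22 / 32 * 512, 10 / 32 * 512)   # 352.0 / 160.0 MiB -- the CUDA0/CPU split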
- time=2025-07-19T17:30:25.218+02:00 level=INFO source=server.go:637 msg="llama runner started in 23.80 seconds"
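Annotation: the warning above that n_ctx_per_seq (4096) < n_ctx_train (32768) traces straight back to OLLAMA_CONTEXT_LENGTH:4096 in the server config. The context can also be raised per request via the options object, at the cost of a larger KV cache (which would push the layer offload down further). A minimal sketch, reusing the assumed host and tag from the pull example:

    import json
    import urllib.request

    body = json.dumps({
        "model": "mixtral:8x7b-instruct-v0.1-q4_0",   # assumed tag
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
        "options": {"num_ctx": 8192},   # overrides the 4096 default for this load
    }).encode()
    req = urllib.request.Request(
        "http://localhost:11434/api/chat",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["message"]["content"])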
- [GIN] 2025/07/19 - 17:30:26 | 200 | 25.6488028s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:30:31 | 200 | 4.0439405s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:30:32 | 200 | 1.2118845s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:30:34 | 200 | 2.4645466s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:31:11 | 200 | 18.1656705s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:31:16 | 200 | 5.6065391s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:31:36 | 200 | 11.1392618s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:31:42 | 200 | 5.6420976s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:32:17 | 200 | 12.0526902s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:32:22 | 200 | 5.281443s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:36:05 | 200 | 0s | 127.0.0.1 | GET "/"
- [GIN] 2025/07/19 - 17:36:05 | 200 | 0s | 127.0.0.1 | GET "/"
- [GIN] 2025/07/19 - 17:36:05 | 404 | 0s | 127.0.0.1 | GET "/favicon.ico"
- [GIN] 2025/07/19 - 17:36:19 | 200 | 520.4µs | 192.168.1.1 | HEAD "/"
- [GIN] 2025/07/19 - 17:36:38 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 17:36:47 | 200 | 0s | 192.168.1.200 | GET "/"
- [GIN] 2025/07/19 - 17:36:47 | 404 | 0s | 192.168.1.200 | GET "/favicon.ico"
- time=2025-07-19T17:38:10.309+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.1 GiB" free_swap="30.2 GiB"
- time=2025-07-19T17:38:10.310+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=22 layers.split="" memory.available="[19.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="27.1 GiB" memory.required.partial="19.2 GiB" memory.required.kv="512.0 MiB" memory.required.allocations="[19.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="296.0 MiB" memory.graph.partial="830.0 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T17:38:10.332+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 4096 --batch-size 512 --n-gpu-layers 22 --threads 16 --no-mmap --parallel 1 --port 52038"
- time=2025-07-19T17:38:10.334+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:38:10.334+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:38:10.335+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:38:10.378+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:38:10.453+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:38:10.454+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52038"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- time=2025-07-19T17:38:10.586+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 22 repeating layers to GPU
- load_tensors: offloaded 22/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7999.43 MiB
- load_tensors: CUDA0 model buffer size = 17218.44 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 4096
- llama_context: n_ctx_per_seq = 4096
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 352.00 MiB
- llama_kv_cache_unified: CPU KV buffer size = 160.00 MiB
- llama_kv_cache_unified: KV self size = 512.00 MiB, K (f16): 256.00 MiB, V (f16): 256.00 MiB
- llama_context: CUDA0 compute buffer size = 397.00 MiB
- llama_context: CUDA_Host compute buffer size = 16.01 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 124 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:38:25.868+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.53 seconds"
- [GIN] 2025/07/19 - 17:38:31 | 200 | 21.6694484s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:38:45 | 200 | 13.088288s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:38:50 | 200 | 5.1806868s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:39:57 | 200 | 2.9611066s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:40:08 | 200 | 11.6257541s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:40:15 | 200 | 6.8390283s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:40:53 | 200 | 2.8219905s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:41:01 | 200 | 7.9993163s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:41:06 | 200 | 4.4385367s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:41:08 | 200 | 1.0207ms | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 17:41:30 | 200 | 2.7923974s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:41:48 | 200 | 17.8480399s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:41:53 | 200 | 5.5996437s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:42:17 | 200 | 2.7939619s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:42:30 | 200 | 12.5119819s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:42:35 | 200 | 5.0069482s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:43:36 | 200 | 0s | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:43:36 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:43:37 | 200 | 0s | 192.168.1.2 | GET "/api/version"
- [GIN] 2025/07/19 - 17:43:38 | 200 | 524.1µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:43:38 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:43:39 | 200 | 521.3µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:43:39 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:43:47 | 200 | 0s | 192.168.1.2 | GET "/api/version"
- [GIN] 2025/07/19 - 17:43:48 | 200 | 522µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:43:48 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:43:49 | 200 | 531.7µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:43:49 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:43:59 | 200 | 0s | 192.168.1.2 | GET "/api/version"
- [GIN] 2025/07/19 - 17:44:29 | 200 | 522.9µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:44:29 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:45:49 | 200 | 32.1349125s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:45:56 | 200 | 6.3929667s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:45:59 | 200 | 2.6024057s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:46:01 | 200 | 2.8657871s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:46:06 | 200 | 3.824311s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:46:08 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 17:46:39 | 200 | 33.1099085s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:46:50 | 200 | 11.3300074s | 192.168.1.2 | POST "/api/chat"
- [GIN] 2025/07/19 - 17:47:07 | 200 | 524.3µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:47:07 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:47:10 | 200 | 1.0018ms | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:47:10 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 17:47:10 | 200 | 2.6068ms | 192.168.1.2 | POST "/api/generate"
- [GIN] 2025/07/19 - 17:47:10 | 200 | 518.4µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 17:47:10 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- time=2025-07-19T17:47:18.676+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.5 GiB" free_swap="29.3 GiB"
- time=2025-07-19T17:47:18.676+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="27.1 GiB" memory.required.partial="18.4 GiB" memory.required.kv="512.0 MiB" memory.required.allocations="[18.4 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="296.0 MiB" memory.graph.partial="830.0 MiB"
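Annotation: compare this offload line with the one at 17:30:01: memory.available has dropped from 19.4 GiB to 18.6 GiB (something else on the machine presumably claimed roughly 0.8 GiB of VRAM between loads), so the planned offload drops from 22 to 21 layers to compensate. One transformer block is ~784 MiB here, and the CUDA0 model buffer below indeed shrinks by almost exactly one block:

    print(17218.44 - 16435.78)   # 782.66 MiB freed by dropping one block,
                                 # vs. 24.5 GiB / 32 blocks ~= 784 MiB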
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T17:47:18.699+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 4096 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52520"
- time=2025-07-19T17:47:18.701+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:47:18.701+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:47:18.702+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:47:18.746+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:47:18.822+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:47:18.822+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52520"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:47:18.952+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 4096
- llama_context: n_ctx_per_seq = 4096
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 4096, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 336.00 MiB
- llama_kv_cache_unified: CPU KV buffer size = 176.00 MiB
- llama_kv_cache_unified: KV self size = 512.00 MiB, K (f16): 256.00 MiB, V (f16): 256.00 MiB
- llama_context: CUDA0 compute buffer size = 397.00 MiB
- llama_context: CUDA_Host compute buffer size = 16.01 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:47:34.234+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.53 seconds"
- [GIN] 2025/07/19 - 17:47:44 | 200 | 25.4584598s | 83.77.231.178 | POST "/api/generate"
- [GIN] 2025/07/19 - 17:47:48 | 200 | 4.2552416s | 83.77.231.178 | POST "/api/generate"
- [GIN] 2025/07/19 - 17:47:58 | 200 | 2.8626601s | 83.77.231.178 | POST "/api/generate"
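Annotation: unlike every earlier client (127.0.0.1, 192.168.1.x), these three /api/generate requests come from 83.77.231.178, a non-private address. With OLLAMA_HOST:http://0.0.0.0:11434 the API listens on all interfaces, so if the router forwards port 11434 the server is reachable from the internet with no authentication. A quick reachability probe from an outside host (the address below is a placeholder):

    import json
    import urllib.request

    # A JSON reply here means the API is open to anyone who can reach the port.
    url = "http://203.0.113.1:11434/api/version"   # placeholder public IP
    with urllib.request.urlopen(url, timeout=5) as r:
        print(json.loads(r.read()))                # e.g. {"version": "0.9.6"}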
- time=2025-07-19T17:48:02.545+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.7 GiB" free_swap="29.5 GiB"
- time=2025-07-19T17:48:02.546+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="294.5 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="183.9 MiB" memory.graph.partial="826.6 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T17:48:02.579+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2356 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52531"
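Annotation: note that this runner starts with --ctx-size 2356 instead of 4096. The log does not say why (presumably the /api/generate request carried a smaller effective context, or the scheduler trimmed it to fit), but the KV figure in the 17:48:02 offload line scales with it exactly:

    print(512 * 2356 / 4096)   # 294.5 -- matches memory.required.kv = "294.5 MiB"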
- time=2025-07-19T17:48:02.583+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:48:02.583+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:48:02.584+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:48:02.665+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:48:02.746+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:48:02.747+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52531"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- time=2025-07-19T17:48:02.835+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
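Several of the fields in the print_info block above are derived from the others. A short sketch (not Ollama code) recomputing the grouped-query-attention fields, plus a rough estimate of how many parameters are active per token in this 8x7B mixture-of-experts; the subtraction-based estimate is an approximation, not a figure from the log:

# Derived fields from the print_info block above (sketch; values from the log).
n_head, n_head_kv, n_embd_head_k = 32, 8, 128
print(n_head // n_head_kv)               # 4    -> n_gqa
print(n_head_kv * n_embd_head_k)         # 1024 -> n_embd_k_gqa

# MoE: 8 expert FFNs per layer, 2 used per token. The expert FFNs dominate
# the 46.70 B total, so only ~12.9 B parameters are active per decoded token.
n_embd, n_ff, n_layer, n_expert, n_used = 4096, 14336, 32, 8, 2
ffn_per_expert = 3 * n_embd * n_ff * n_layer      # gate/up/down projections
non_expert = 46.70e9 - n_expert * ffn_per_expert  # attention, embeddings, norms
print((non_expert + n_used * ffn_per_expert) / 1e9)   # ~12.9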
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2356
- llama_context: n_ctx_per_seq = 2356
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2356) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2368, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 194.25 MiB
- llama_kv_cache_unified: CPU KV buffer size = 101.75 MiB
- llama_kv_cache_unified: KV self size = 296.00 MiB, K (f16): 148.00 MiB, V (f16): 148.00 MiB
- llama_context: CUDA0 compute buffer size = 393.63 MiB
- llama_context: CUDA_Host compute buffer size = 12.63 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
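The KV-cache figures a few lines up follow directly from the printed shape: each cached token stores n_embd_k_gqa f16 values for K and the same again for V, per layer. A quick sketch (not Ollama code) reproducing the logged 296 MiB:

# KV cache size from the logged parameters (sketch; f16 = 2 bytes per cell).
kv_size, n_layer, n_embd_kv_gqa = 2368, 32, 1024
k_mib = kv_size * n_layer * n_embd_kv_gqa * 2 / 2**20
print(k_mib)       # 148.0 -> "K (f16): 148.00 MiB"
print(2 * k_mib)   # 296.0 -> "KV self size = 296.00 MiB" (K plus V)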
- time=2025-07-19T17:48:18.367+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- time=2025-07-19T17:48:18.374+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2356 prompt=2713 keep=5 new=2356
- [GIN] 2025/07/19 - 17:48:35 | 200 | 33.2707893s | 83.77.231.178 | POST "/api/generate"
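The WARN above means the 2713-token prompt was silently cut down to the 2356-token context this runner was started with. The fix is client-side: request a larger context per call via the num_ctx option of /api/generate (a standard Ollama option; a larger context costs more VRAM, which in turn shrinks how many layers fit on the GPU). A hedged sketch, with host and model tag as placeholders:

import requests

resp = requests.post(
    "http://localhost:11434/api/generate",   # placeholder host for this server
    json={
        "model": "mixtral:8x7b",       # assumed tag for the blob loaded in this log
        "prompt": "...",               # left elided, as in the log
        "stream": False,
        "options": {"num_ctx": 4096},  # > 2713 tokens, so nothing gets truncated
    },
)
print(resp.json()["response"])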
- time=2025-07-19T17:48:38.930+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="29.3 GiB"
- time=2025-07-19T17:48:38.930+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="18.2 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[18.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="826.0 MiB"
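A rough cross-check (not Ollama's scheduler logic) of the offload line above: 24.5 GiB of repeating weights across 32 layers is about 784 MiB per layer, so the 21 layers chosen for the GPU should occupy roughly 16.1 GiB, in line with the CUDA0 model buffer the previous load reported:

# Per-layer weight cost and the resulting GPU share (sketch; log values).
repeating_mib = 24.5 * 1024      # memory.weights.repeating
per_layer_mib = repeating_mib / 32
print(per_layer_mib)             # 784.0 MiB per repeating layer
print(21 * per_layer_mib)        # 16464 MiB, vs. the logged 16435.78 MiB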
- [metadata dump and vocab-only print_info identical to the previous load omitted]
- time=2025-07-19T17:48:38.955+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 1849 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52542"
- time=2025-07-19T17:48:38.958+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:48:38.958+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:48:38.958+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:48:39.004+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- [ggml_cuda_init / load_backend lines identical to the previous runner startup omitted]
- time=2025-07-19T17:48:39.081+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:48:39.081+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52542"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- [model metadata dump, tokenizer load, and print_info block identical to the previous load omitted]
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:48:39.208+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 1849
- llama_context: n_ctx_per_seq = 1849
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (1849) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 1856, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 152.25 MiB
- llama_kv_cache_unified: CPU KV buffer size = 79.75 MiB
- llama_kv_cache_unified: KV self size = 232.00 MiB, K (f16): 116.00 MiB, V (f16): 116.00 MiB
- llama_context: CUDA0 compute buffer size = 405.00 MiB
- llama_context: CUDA_Host compute buffer size = 11.63 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:48:54.738+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- [GIN] 2025/07/19 - 17:49:01 | 200 | 23.5846212s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:49:06.396+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="29.4 GiB"
- time=2025-07-19T17:49:06.396+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="300.9 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="187.1 MiB" memory.graph.partial="826.7 MiB"
- [metadata dump and vocab-only print_info identical to the previous load omitted]
- time=2025-07-19T17:49:06.421+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2407 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52548"
- time=2025-07-19T17:49:06.424+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:49:06.424+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:49:06.424+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:49:06.466+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- [ggml_cuda_init / load_backend lines identical to the previous runner startup omitted]
- time=2025-07-19T17:49:06.580+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:49:06.580+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52548"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- time=2025-07-19T17:49:06.675+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- [model metadata dump, tokenizer load, and print_info block identical to the previous load omitted]
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2407
- llama_context: n_ctx_per_seq = 2407
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2407) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2432, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 199.50 MiB
- llama_kv_cache_unified: CPU KV buffer size = 104.50 MiB
- llama_kv_cache_unified: KV self size = 304.00 MiB, K (f16): 152.00 MiB, V (f16): 152.00 MiB
- llama_context: CUDA0 compute buffer size = 393.75 MiB
- llama_context: CUDA_Host compute buffer size = 12.76 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:49:22.204+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- time=2025-07-19T17:49:22.208+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2407 prompt=2909 keep=5 new=2407
- [GIN] 2025/07/19 - 17:49:36 | 200 | 30.7193165s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:49:40.715+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.5 GiB" free_swap="29.4 GiB"
- time=2025-07-19T17:49:40.715+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="274.4 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="173.5 MiB" memory.graph.partial="826.3 MiB"
- [metadata dump and vocab-only print_info identical to the previous load omitted]
- time=2025-07-19T17:49:40.739+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2195 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52573"
- time=2025-07-19T17:49:40.741+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:49:40.741+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:49:40.741+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:49:40.817+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- [ggml_cuda_init / load_backend lines identical to the previous runner startup omitted]
- time=2025-07-19T17:49:40.893+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:49:40.894+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52573"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- [model metadata dump, tokenizer load, and print_info block identical to the previous load omitted]
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:49:40.992+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2195
- llama_context: n_ctx_per_seq = 2195
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2195) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2208, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 181.12 MiB
- llama_kv_cache_unified: CPU KV buffer size = 94.88 MiB
- llama_kv_cache_unified: KV self size = 276.00 MiB, K (f16): 138.00 MiB, V (f16): 138.00 MiB
- llama_context: CUDA0 compute buffer size = 393.31 MiB
- llama_context: CUDA_Host compute buffer size = 12.32 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
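Across these reloads the requested --ctx-size varies (2356, 1849, 2407, 2195), but the resulting kv_size is always the next multiple of the cache's padding of 32 (2368, 1856, 2432, 2208), as a one-liner confirms (sketch, not Ollama code):

# kv_size = --ctx-size rounded up to the padding of 32.
pad = lambda n, p=32: -(-n // p) * p    # ceiling division, then scale back up
for n_ctx in (2356, 1849, 2407, 2195):
    print(n_ctx, "->", pad(n_ctx))      # 2368, 1856, 2432, 2208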
- time=2025-07-19T17:49:56.527+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.79 seconds"
- time=2025-07-19T17:49:56.532+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2195 prompt=2315 keep=5 new=2195
- [GIN] 2025/07/19 - 17:50:05 | 200 | 25.1645168s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:50:09.600+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.3 GiB" free_swap="29.3 GiB"
- time=2025-07-19T17:50:09.601+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="297.0 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="185.1 MiB" memory.graph.partial="826.7 MiB"
- [metadata dump and vocab-only print_info identical to the previous load omitted]
- time=2025-07-19T17:50:09.626+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2376 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52594"
- time=2025-07-19T17:50:09.629+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:50:09.629+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:50:09.629+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:50:09.670+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- [ggml_cuda_init / load_backend lines identical to the previous runner startup omitted]
- time=2025-07-19T17:50:09.764+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:50:09.765+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52594"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- [model metadata dump identical to the previous load omitted]
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
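
The header above is internally consistent: 46.70 B parameters stored in 24.63 GiB works out to the logged 4.53 bits per weight. A quick check, with the values copied straight from the log:

    # Sanity-check bits-per-weight (BPW) against the logged file size and parameter count.
    file_size_gib = 24.63      # print_info: file size
    n_params = 46.70e9         # print_info: model params
    bits = file_size_gib * 1024**3 * 8
    print(round(bits / n_params, 2))   # -> 4.53, matching "24.63 GiB (4.53 BPW)"
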
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:50:09.880+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
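
With mmap disabled (--no-mmap) the whole file is resident, so the two buffers above should sum to the 24.63 GiB file size, with the GPU share reflecting the 21 offloaded repeating layers (the non-repeating output tensors stay on the host). Checking the logged numbers:

    host_mib  = 8782.09    # load_tensors: CUDA_Host model buffer size
    cuda0_mib = 16435.78   # load_tensors: CUDA0 model buffer size
    print(round((host_mib + cuda0_mib) / 1024, 2))        # -> 24.63 GiB, the full file
    print(round(cuda0_mib / (host_mib + cuda0_mib), 3))   # -> 0.652, roughly 21/32 of the repeating weights
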
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2376
- llama_context: n_ctx_per_seq = 2376
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2376) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2400, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 196.88 MiB
- llama_kv_cache_unified: CPU KV buffer size = 103.12 MiB
- llama_kv_cache_unified: KV self size = 300.00 MiB, K (f16): 150.00 MiB, V (f16): 150.00 MiB
- llama_context: CUDA0 compute buffer size = 393.69 MiB
- llama_context: CUDA_Host compute buffer size = 12.69 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
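
The KV-cache figures follow from the header values: 32 layers, n_embd_k_gqa = n_embd_v_gqa = 1024, f16 entries (2 bytes each), and kv_size = 2400 (n_ctx 2376 rounded up to the padding of 32). Each of K and V then costs n_layer * kv_size * 1024 * 2 bytes:

    n_layer, kv_size, n_embd_gqa, f16_bytes = 32, 2400, 1024, 2
    k_mib = n_layer * kv_size * n_embd_gqa * f16_bytes / 2**20
    print(k_mib)                 # -> 150.0 MiB for K, same again for V, 300.0 MiB total
    print(21 / 32 * 2 * k_mib)   # -> 196.875 MiB, the CUDA0 share for the 21 offloaded layers
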
- time=2025-07-19T17:50:25.662+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.03 seconds"
- time=2025-07-19T17:50:25.666+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2376 prompt=2825 keep=5 new=2376
- [GIN] 2025/07/19 - 17:50:42 | 200 | 33.7636898s | 83.77.231.178 | POST "/api/generate"
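
The WARN line above is why the 2825-token prompt came back clipped: the runner was started with --ctx-size 2376, so everything past that limit was dropped (keeping the first 5 tokens). If the client needs the whole prompt, it can request a larger context per call through the num_ctx option of /api/generate. A minimal sketch, assuming the model was pulled under the tag mixtral:8x7b (the tag and prompt here are placeholders):

    import json, urllib.request

    body = {
        "model": "mixtral:8x7b",       # placeholder tag; use whatever `ollama pull` created
        "prompt": "...",               # the long prompt that was being truncated
        "stream": False,
        "options": {"num_ctx": 4096},  # context large enough for the 2825-token prompt
    }
    req = urllib.request.Request(
        "http://127.0.0.1:11434/api/generate",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    print(json.load(urllib.request.urlopen(req))["response"])

A larger num_ctx grows the KV cache, which can in turn reduce how many layers fit on the GPU, so this trades context for speed on a card that is already doing a partial offload.
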
- time=2025-07-19T17:50:48.767+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.5 GiB" free_swap="29.6 GiB"
- time=2025-07-19T17:50:48.767+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="18.2 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[18.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="826.0 MiB"
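
This offload line shows the scheduler's arithmetic: the full model would need 26.8 GiB but only 18.8 GiB of VRAM is treated as available, so it settles on the largest layer count whose partial footprint fits, here 21 of 33. A rough reconstruction from the logged figures (an approximation of the heuristic, not Ollama's actual code):

    avail_gib     = 18.8                # memory.available
    repeating_gib = 24.5                # memory.weights.repeating, spread over 32 layers
    per_layer_gib = repeating_gib / 32  # ~0.77 GiB of weights per repeating layer
    # KV cache, graph and non-repeating overhead implied by memory.required.partial:
    fixed_gib = 18.2 - 21 * per_layer_gib
    for n in range(32, 0, -1):
        if n * per_layer_gib + fixed_gib <= avail_gib:
            print(n)                    # -> 21, the logged layers.offload
            break
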
- (GGUF metadata dump identical to the first load above; duplicate omitted)
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T17:50:48.794+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2035 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52671"
- time=2025-07-19T17:50:48.799+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:50:48.799+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:50:48.800+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:50:48.853+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:50:48.931+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:50:48.932+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52671"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- (GGUF metadata dump and print_info header identical to the first full load above; duplicate omitted)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:50:49.051+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2035
- llama_context: n_ctx_per_seq = 2035
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2035) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2048, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 168.00 MiB
- llama_kv_cache_unified: CPU KV buffer size = 88.00 MiB
- llama_kv_cache_unified: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB
- llama_context: CUDA0 compute buffer size = 405.00 MiB
- llama_context: CUDA_Host compute buffer size = 12.01 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:51:04.834+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.03 seconds"
- time=2025-07-19T17:51:04.837+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2035 prompt=2044 keep=5 new=2035
- [GIN] 2025/07/19 - 17:51:08 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 17:51:15 | 200 | 27.1912518s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:51:20.364+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.3 GiB" free_swap="28.9 GiB"
- time=2025-07-19T17:51:20.364+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="18.2 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[18.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="826.0 MiB"
- (GGUF metadata dump and vocab-only print_info header identical to the pass above; duplicate omitted)
- time=2025-07-19T17:51:20.393+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 1879 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52736"
- time=2025-07-19T17:51:20.399+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:51:20.399+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:51:20.399+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:51:20.441+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:51:20.517+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:51:20.517+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52736"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- (GGUF metadata dump and print_info header identical to the first full load above; duplicate omitted)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:51:20.650+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 1879
- llama_context: n_ctx_per_seq = 1879
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (1879) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 1888, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 154.88 MiB
- llama_kv_cache_unified: CPU KV buffer size = 81.12 MiB
- llama_kv_cache_unified: KV self size = 236.00 MiB, K (f16): 118.00 MiB, V (f16): 118.00 MiB
- llama_context: CUDA0 compute buffer size = 405.00 MiB
- llama_context: CUDA_Host compute buffer size = 11.69 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:51:36.181+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- [GIN] 2025/07/19 - 17:51:46 | 200 | 26.5624173s | 83.77.231.178 | POST "/api/generate"
- [GIN] 2025/07/19 - 17:51:46 | 200 | 0s | 83.77.231.178 | GET "/api/ps"
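
At this point the pattern is clear: every /api/generate has spawned a fresh runner with a different --ctx-size (2376, 2035, 1879 so far), and each reload costs roughly 16 seconds before the first token. The /api/ps endpoint polled above reports what is currently resident and how much of it sits in VRAM; a small sketch against the documented response shape:

    import json, urllib.request

    with urllib.request.urlopen("http://127.0.0.1:11434/api/ps") as r:
        for m in json.load(r).get("models", []):
            # "size" is the total bytes of the loaded model, "size_vram" the GPU-resident part
            print(m["name"], m["size"], m["size_vram"], m["expires_at"])

Pinning one num_ctx across requests (see the /api/generate sketch earlier) would let the scheduler reuse the loaded runner instead of tearing it down for every call.
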
- time=2025-07-19T17:51:50.753+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.2 GiB" free_swap="28.9 GiB"
- time=2025-07-19T17:51:50.754+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="18.2 GiB" memory.required.kv="260.4 MiB" memory.required.allocations="[18.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="166.3 MiB" memory.graph.partial="826.1 MiB"
- (GGUF metadata dump and vocab-only print_info header identical to the pass above; duplicate omitted)
- time=2025-07-19T17:51:50.778+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2083 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52782"
- time=2025-07-19T17:51:50.781+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:51:50.781+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:51:50.781+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:51:50.817+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:51:50.900+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:51:50.900+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52782"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- (GGUF metadata dump and print_info header identical to the first full load above; duplicate omitted)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:51:51.033+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2083
- llama_context: n_ctx_per_seq = 2083
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2083) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2112, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 173.25 MiB
- llama_kv_cache_unified: CPU KV buffer size = 90.75 MiB
- llama_kv_cache_unified: KV self size = 264.00 MiB, K (f16): 132.00 MiB, V (f16): 132.00 MiB
- llama_context: CUDA0 compute buffer size = 393.13 MiB
- llama_context: CUDA_Host compute buffer size = 12.13 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:52:06.565+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- [GIN] 2025/07/19 - 17:52:15 | 200 | 25.1089685s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:52:19.894+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.4 GiB" free_swap="29.2 GiB"
- time=2025-07-19T17:52:19.895+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="277.5 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="175.1 MiB" memory.graph.partial="826.4 MiB"
- (GGUF metadata dump and vocab-only print_info header identical to the pass above; duplicate omitted)
- time=2025-07-19T17:52:19.917+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2220 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52810"
- time=2025-07-19T17:52:19.920+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:52:19.921+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:52:19.921+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:52:19.964+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:52:20.047+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:52:20.047+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52810"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
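Note: the derived attention shapes in the print_info block above follow directly from the GGUF metadata; a quick sanity check in a few lines of Python (plain arithmetic, no Ollama internals assumed):

    n_embd, n_head, n_head_kv = 4096, 32, 8
    head_dim = n_embd // n_head       # 128 -> n_rot, n_embd_head_k, n_embd_head_v
    n_gqa = n_head // n_head_kv       # 4   -> grouped-query attention factor
    n_embd_kv = n_head_kv * head_dim  # 1024 -> n_embd_k_gqa, n_embd_v_gqa
    print(head_dim, n_gqa, n_embd_kv) # 128 4 1024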
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:52:20.172+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- [GIN] 2025/07/19 - 17:52:25 | 200 | 0s | 83.77.231.178 | GET "/api/ps"
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2220
- llama_context: n_ctx_per_seq = 2220
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2220) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2240, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 183.75 MiB
- llama_kv_cache_unified: CPU KV buffer size = 96.25 MiB
- llama_kv_cache_unified: KV self size = 280.00 MiB, K (f16): 140.00 MiB, V (f16): 140.00 MiB
- llama_context: CUDA0 compute buffer size = 393.38 MiB
- llama_context: CUDA_Host compute buffer size = 12.38 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:52:35.956+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.04 seconds"
- time=2025-07-19T17:52:35.961+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2220 prompt=2311 keep=5 new=2220
- [GIN] 2025/07/19 - 17:52:44 | 200 | 25.2718447s | 83.77.231.178 | POST "/api/generate"
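Note: the WARN line above shows the runner shortening a 2311-token prompt to fit the 2220-token context while preserving the first five tokens. One plausible reading of the limit/prompt/keep/new fields (a sketch consistent with the logged numbers, not the runner's actual code):

    def truncate(tokens, limit, keep):
        # Keep the leading `keep` tokens (typically the BOS/system prefix)
        # and the most recent tokens, dropping the middle so the result
        # fits `limit`.
        if len(tokens) <= limit:
            return tokens
        return tokens[:keep] + tokens[-(limit - keep):]

    assert len(truncate(list(range(2311)), limit=2220, keep=5)) == 2220  # new=2220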
- time=2025-07-19T17:52:49.080+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.3 GiB" free_swap="29.1 GiB"
- time=2025-07-19T17:52:49.080+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.6 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="272.5 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="172.5 MiB" memory.graph.partial="826.3 MiB"
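Note: this scheduler line is why only 21 of 33 layers land on the GPU: a full load would need 26.9 GiB against 18.6 GiB available, so Ollama offloads as many repeating layers as fit. Rough arithmetic only (the real estimator in server.go also reserves KV, graph, and output buffers), but the per-layer weight it implies is consistent with the CUDA0 buffer reported earlier:

    gib, mib = 2**30, 2**20
    per_layer = 24.5 * gib / 32    # memory.weights.repeating over 32 blocks, ~784 MiB
    print(21 * per_layer / mib)    # ~16464 MiB vs. the logged CUDA0 buffer of 16435.78 MiB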
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T17:52:49.106+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2180 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52922"
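Note: --ctx-size changes on every reload in this section (2220, 2180, 3681, 2084, 2532), which is consistent with the remote client supplying a different num_ctx per request instead of using the server default of 4096. A request of that shape against the public /api/generate endpoint might look like this (host, model tag, and prompt are placeholders, not taken from the log):

    import json, urllib.request
    body = json.dumps({
        "model": "mixtral:8x7b",        # assumption: the tag behind blob f2dc41fa964b
        "prompt": "...",                # placeholder
        "stream": False,
        "options": {"num_ctx": 2180},   # matches this runner's --ctx-size
    }).encode()
    req = urllib.request.Request(
        "http://127.0.0.1:11434/api/generate",  # placeholder; the logged requests came from 83.77.231.178
        data=body, headers={"Content-Type": "application/json"},
    )
    print(urllib.request.urlopen(req).read()[:200])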
- time=2025-07-19T17:52:49.108+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:52:49.108+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:52:49.109+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:52:49.148+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:52:49.231+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:52:49.231+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52922"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:52:49.360+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2180
- llama_context: n_ctx_per_seq = 2180
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2180) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2208, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 181.12 MiB
- llama_kv_cache_unified: CPU KV buffer size = 94.88 MiB
- llama_kv_cache_unified: KV self size = 276.00 MiB, K (f16): 138.00 MiB, V (f16): 138.00 MiB
- llama_context: CUDA0 compute buffer size = 393.31 MiB
- llama_context: CUDA_Host compute buffer size = 12.32 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:53:05.143+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.03 seconds"
- time=2025-07-19T17:53:05.147+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2180 prompt=2279 keep=5 new=2180
- [GIN] 2025/07/19 - 17:53:21 | 200 | 33.5639499s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:53:25.838+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="29.5 GiB"
- time=2025-07-19T17:53:25.839+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="27.0 GiB" memory.required.partial="18.4 GiB" memory.required.kv="460.1 MiB" memory.required.allocations="[18.4 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="269.3 MiB" memory.graph.partial="829.2 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- time=2025-07-19T17:53:25.864+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 3681 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52946"
- time=2025-07-19T17:53:25.866+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:53:25.866+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:53:25.866+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:53:25.915+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- time=2025-07-19T17:53:25.996+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:53:25.997+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52946"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:53:26.118+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- [GIN] 2025/07/19 - 17:53:30 | 200 | 0s | 83.77.231.178 | GET "/"
- [GIN] 2025/07/19 - 17:53:30 | 404 | 0s | 83.77.231.178 | GET "/favicon.ico"
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 3681
- llama_context: n_ctx_per_seq = 3681
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (3681) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 3712, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 304.50 MiB
- llama_kv_cache_unified: CPU KV buffer size = 159.50 MiB
- llama_kv_cache_unified: KV self size = 464.00 MiB, K (f16): 232.00 MiB, V (f16): 232.00 MiB
- llama_context: CUDA0 compute buffer size = 396.25 MiB
- llama_context: CUDA_Host compute buffer size = 15.26 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:53:41.650+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- time=2025-07-19T17:53:41.655+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=3681 prompt=7678 keep=5 new=3681
- [GIN] 2025/07/19 - 17:53:51 | 200 | 26.5249814s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:53:56.175+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.5 GiB" free_swap="29.5 GiB"
- time=2025-07-19T17:53:56.175+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="18.2 GiB" memory.required.kv="260.5 MiB" memory.required.allocations="[18.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="166.3 MiB" memory.graph.partial="826.1 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- time=2025-07-19T17:53:56.197+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2084 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52982"
- time=2025-07-19T17:53:56.201+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:53:56.201+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:53:56.201+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:53:56.240+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- time=2025-07-19T17:53:56.321+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:53:56.322+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52982"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:53:56.452+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2084
- llama_context: n_ctx_per_seq = 2084
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2084) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2112, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 173.25 MiB
- llama_kv_cache_unified: CPU KV buffer size = 90.75 MiB
- llama_kv_cache_unified: KV self size = 264.00 MiB, K (f16): 132.00 MiB, V (f16): 132.00 MiB
- llama_context: CUDA0 compute buffer size = 393.13 MiB
- llama_context: CUDA_Host compute buffer size = 12.13 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:54:11.983+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.78 seconds"
- [GIN] 2025/07/19 - 17:54:20 | 200 | 25.2196376s | 83.77.231.178 | POST "/api/generate"
- time=2025-07-19T17:54:24.889+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="29.4 GiB"
- time=2025-07-19T17:54:24.889+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="18.3 GiB" memory.required.kv="316.5 MiB" memory.required.allocations="[18.3 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="195.2 MiB" memory.graph.partial="827.0 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- time=2025-07-19T17:54:24.914+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2532 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52986"
- time=2025-07-19T17:54:24.917+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:54:24.917+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:54:24.917+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T17:54:24.964+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- time=2025-07-19T17:54:25.070+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:54:25.070+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52986"
- time=2025-07-19T17:54:25.168+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
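All the attention-shape lines above derive from three numbers: 32 query heads, 8 KV heads, and a 128-wide head. A minimal check of those relationships (values copied from the print_info block):

```go
package main

import "fmt"

func main() {
	nHead, nHeadKV, headDim := 32, 8, 128 // n_head, n_head_kv, n_embd_head_k
	fmt.Println("n_gqa        =", nHead/nHeadKV)   // 4 query heads share one KV head
	fmt.Println("n_embd_k_gqa =", headDim*nHeadKV) // 1024-wide K per token
	fmt.Println("n_embd_v_gqa =", headDim*nHeadKV) // 1024-wide V per token
	fmt.Println("n_embd       =", headDim*nHead)   // 4096, the embedding width
}
```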
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2532
- llama_context: n_ctx_per_seq = 2532
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2532) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2560, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 210.00 MiB
- llama_kv_cache_unified: CPU KV buffer size = 110.00 MiB
- llama_kv_cache_unified: KV self size = 320.00 MiB, K (f16): 160.00 MiB, V (f16): 160.00 MiB
- llama_context: CUDA0 compute buffer size = 394.00 MiB
- llama_context: CUDA_Host compute buffer size = 13.01 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
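The 320 MiB KV cache reported above follows directly from the shapes: kv_size tokens times 32 layers times the 1024-wide K (and again for V) at 2 bytes per f16 element, split across devices in the same 21/32 ratio as the offloaded layers. A worked check (numbers from the llama_kv_cache_unified lines; annotation, not log output):

```go
package main

import "fmt"

func main() {
	kvSize, nLayer, kvWidth := 2560, 32, 1024 // kv_size, n_layer, n_embd_k_gqa
	const f16Bytes = 2
	kMiB := float64(kvSize*nLayer*kvWidth*f16Bytes) / (1 << 20)
	fmt.Printf("K %.0f MiB + V %.0f MiB = %.0f MiB\n", kMiB, kMiB, 2*kMiB) // 160 + 160 = 320
	// 21 of the 32 layers sit on the GPU, so the cache splits in that ratio:
	fmt.Printf("CUDA0 %.0f MiB, CPU %.0f MiB\n", 2*kMiB*21/32, 2*kMiB*11/32) // 210, 110
}
```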
- time=2025-07-19T17:54:40.954+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.04 seconds"
- time=2025-07-19T17:54:40.959+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2532 prompt=3176 keep=5 new=2532
- [GIN] 2025/07/19 - 17:54:53 | 200 | 28.9310929s | 83.77.231.178 | POST "/api/generate"
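The WARN two lines up shows a 3176-token prompt exceeding the 2532-token context, with keep=5: the fields suggest the runner preserves the first few tokens (BOS plus the template head) and the most recent tail. A sketch of that policy as implied by the fields, not Ollama's actual implementation:

```go
package main

import "fmt"

// truncate keeps the first `keep` tokens plus the most recent tokens up to
// `limit` -- the behavior implied by the WARN fields above
// (limit=2532 prompt=3176 keep=5 new=2532).
func truncate(tokens []int, limit, keep int) []int {
	if len(tokens) <= limit {
		return tokens
	}
	out := append([]int{}, tokens[:keep]...)
	return append(out, tokens[len(tokens)-(limit-keep):]...)
}

func main() {
	prompt := make([]int, 3176)
	fmt.Println(len(truncate(prompt, 2532, 5))) // prints 2532
}
```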
- time=2025-07-19T17:54:57.959+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.4 GiB" free_swap="29.5 GiB"
- time=2025-07-19T17:54:57.960+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=21 layers.split="" memory.available="[18.7 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="18.2 GiB" memory.required.kv="260.6 MiB" memory.required.allocations="[18.2 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="166.4 MiB" memory.graph.partial="826.1 MiB"
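The msg=offload line above is the scheduler's VRAM estimate: 24.5 GiB of repeating weights over 32 layers is roughly 0.77 GiB per layer, and packing layers plus graph/KV overhead into the 18.7 GiB available yields the 21-layer offload. A deliberately simplified version of that packing (the real estimator reserves more headroom, hence 21 rather than 22):

```go
package main

import "fmt"

func main() {
	// Values from the msg=offload line (GiB); the packing rule here is an
	// illustration, not Ollama's actual scheduler logic.
	avail := 18.7
	perLayer := 24.5 / 32.0           // repeating weights per layer ≈ 0.766 GiB
	overhead := 0.826 + 0.103 + 0.255 // graph.partial + nonrepeating + kv
	fmt.Println(int((avail - overhead) / perLayer)) // 22; Ollama picks 21
}
```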
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump identical to the first load above omitted: 40 KV pairs, 323 tensors, GGUF V3, Q4_0, 24.63 GiB)
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
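This estimate pass, like every one in this log, reloads the GGUF with vocab_only set, which is why it prints `model type = ?B` and ends with `vocab only - skipping tensors`: the server appears to pull in just the tokenizer to count prompt tokens and size the context before committing ~24.6 GiB of tensors to a runner process.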
- time=2025-07-19T17:54:57.984+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2085 --batch-size 512 --n-gpu-layers 21 --threads 16 --no-mmap --parallel 1 --port 52996"
- time=2025-07-19T17:54:57.987+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T17:54:57.987+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T17:54:57.987+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
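Note the runner being spawned with --ctx-size 2085, while the previous one used 2532 and later ones use 2206 and 2521: each time the computed context changes, the full 24.6 GiB model is reloaded on a fresh port. If that churn is unwanted, the context can be pinned per request via the standard options.num_ctx field; a minimal client sketch (the tag mixtral:8x7b is an assumption, the log only shows the blob digest):

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// options.num_ctx pins the context size so the scheduler does not
	// respawn the runner with a new --ctx-size on every request.
	body := []byte(`{
	  "model": "mixtral:8x7b",
	  "prompt": "why is the sky blue?",
	  "stream": false,
	  "options": {"num_ctx": 4096}
	}`)
	resp, err := http.Post("http://127.0.0.1:11434/api/generate",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}
```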
- time=2025-07-19T17:54:58.028+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T17:54:58.111+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T17:54:58.111+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:52996"
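ggml-cpu-icelake.dll above is one of several bundled CPU backend builds; the loader picks the variant matching the feature flags in the msg=system line (AVX2, AVX512, VNNI, VBMI all present here). The same bits can be read with golang.org/x/sys/cpu, assuming that package's x86 feature fields:

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// The same feature bits ggml keys its backend choice on (msg=system above).
	fmt.Println("AVX2:       ", cpu.X86.HasAVX2)
	fmt.Println("AVX512F:    ", cpu.X86.HasAVX512F)
	fmt.Println("AVX512VNNI: ", cpu.X86.HasAVX512VNNI)
	fmt.Println("AVX512VBMI: ", cpu.X86.HasAVX512VBMI)
}
```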
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump and print_info block identical to the first load above omitted)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T17:54:58.238+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 21 repeating layers to GPU
- load_tensors: offloaded 21/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 8782.09 MiB
- load_tensors: CUDA0 model buffer size = 16435.78 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2085
- llama_context: n_ctx_per_seq = 2085
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2085) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2112, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 173.25 MiB
- llama_kv_cache_unified: CPU KV buffer size = 90.75 MiB
- llama_kv_cache_unified: KV self size = 264.00 MiB, K (f16): 132.00 MiB, V (f16): 132.00 MiB
- llama_context: CUDA0 compute buffer size = 393.13 MiB
- llama_context: CUDA_Host compute buffer size = 12.13 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 136 (with bs=512), 3 (with bs=1)
- time=2025-07-19T17:55:14.022+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.04 seconds"
- [GIN] 2025/07/19 - 17:55:23 | 200 | 26.3996882s | 83.77.231.178 | POST "/api/generate"
- [GIN] 2025/07/19 - 17:56:08 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 18:01:08 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 18:06:08 | 200 | 0s | 192.168.1.1 | GET "/"
- time=2025-07-19T18:07:46.341+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="31.0 GiB"
- time=2025-07-19T18:07:46.342+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="19.8 GiB" memory.required.kv="260.6 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="166.4 MiB" memory.graph.partial="826.1 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (vocab-only metadata dump and print_info identical to the 17:54:57 estimate pass omitted)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:07:46.365+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2085 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53414"
- time=2025-07-19T18:07:46.368+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:07:46.368+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:07:46.368+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:07:46.402+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:07:46.478+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:07:46.479+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53414"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump and print_info block identical to the first load above omitted)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T18:07:46.618+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2085
- llama_context: n_ctx_per_seq = 2085
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2085) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2112, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 189.75 MiB
- llama_kv_cache_unified: CPU KV buffer size = 74.25 MiB
- llama_kv_cache_unified: KV self size = 264.00 MiB, K (f16): 132.00 MiB, V (f16): 132.00 MiB
- llama_context: CUDA0 compute buffer size = 393.13 MiB
- llama_context: CUDA_Host compute buffer size = 12.13 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:07:52.881+02:00 level=INFO source=server.go:637 msg="llama runner started in 6.51 seconds"
- [GIN] 2025/07/19 - 18:08:00 | 200 | 14.3169454s | 100.107.36.63 | POST "/api/generate"
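Compared with the 21-layer load at 17:54, the CUDA0 model buffer grew from 16435.78 to 18001.09 MiB and the host buffer shrank by the same amount, which pins down the per-layer weight cost; the graph splits also fell from 136 to 112, since fewer layer boundaries now straddle the CPU/GPU divide. The arithmetic (buffer sizes copied from the two loads):

```go
package main

import "fmt"

func main() {
	at21, at23 := 16435.78, 18001.09 // CUDA0 model buffer (MiB) at 21 and 23 layers
	perLayer := (at23 - at21) / 2
	fmt.Printf("per layer ≈ %.1f MiB\n", perLayer)         // ≈ 782.7 MiB
	fmt.Printf("32 layers ≈ %.1f GiB\n", perLayer*32/1024) // ≈ 24.5 GiB, matching weights.repeating
}
```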
- time=2025-07-19T18:08:25.644+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.4 GiB" free_swap="31.2 GiB"
- time=2025-07-19T18:08:25.645+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="275.8 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="174.2 MiB" memory.graph.partial="826.3 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (vocab-only metadata dump and print_info identical to the earlier estimate passes omitted)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:08:25.669+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2206 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53424"
- time=2025-07-19T18:08:25.672+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:08:25.672+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:08:25.673+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:08:25.725+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:08:25.801+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:08:25.802+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53424"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump identical to the first load above omitted)
- time=2025-07-19T18:08:25.923+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load: (tokenizer warnings and print_info block identical to the first full load above omitted)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2206
- llama_context: n_ctx_per_seq = 2206
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2206) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2208, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 198.38 MiB
- llama_kv_cache_unified: CPU KV buffer size = 77.62 MiB
- llama_kv_cache_unified: KV self size = 276.00 MiB, K (f16): 138.00 MiB, V (f16): 138.00 MiB
- llama_context: CUDA0 compute buffer size = 393.31 MiB
- llama_context: CUDA_Host compute buffer size = 12.32 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:08:30.181+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.51 seconds"
- time=2025-07-19T18:08:30.185+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2206 prompt=2254 keep=5 new=2206
- [GIN] 2025/07/19 - 18:08:36 | 200 | 12.0389404s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:09:01.925+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.5 GiB" free_swap="31.2 GiB"
- time=2025-07-19T18:09:01.925+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="315.1 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="194.5 MiB" memory.graph.partial="826.9 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (vocab-only metadata dump and print_info identical to the earlier estimate passes omitted)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:09:01.949+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2521 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53432"
- time=2025-07-19T18:09:01.952+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:09:01.952+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:09:01.952+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:09:02.007+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:09:02.093+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:09:02.094+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53432"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- time=2025-07-19T18:09:02.203+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
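Several of the print_info values above are derived from the GGUF metadata rather than stored directly. As a sanity check, the arithmetic below uses only numbers already printed in this log; nothing model-specific is assumed:

# Re-deriving a few print_info values from the GGUF metadata in this log.
n_head, n_head_kv = 32, 8               # llama.attention.head_count / head_count_kv
n_embd, n_layer   = 4096, 32            # llama.embedding_length / block_count

n_embd_head  = n_embd // n_head         # 128 -> n_embd_head_k / n_embd_head_v
n_gqa        = n_head // n_head_kv      # 4   -> grouped-query attention factor
n_embd_k_gqa = n_head_kv * n_embd_head  # 1024 -> per-layer K/V width

bpw = 24.63 * 1024**3 * 8 / 46.70e9     # file size in bits / parameter count
print(n_embd_head, n_gqa, n_embd_k_gqa, round(bpw, 2))  # 128 4 1024 4.53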
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2521
- llama_context: n_ctx_per_seq = 2521
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2521) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2528, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 227.12 MiB
- llama_kv_cache_unified: CPU KV buffer size = 88.88 MiB
- llama_kv_cache_unified: KV self size = 316.00 MiB, K (f16): 158.00 MiB, V (f16): 158.00 MiB
- llama_context: CUDA0 compute buffer size = 393.94 MiB
- llama_context: CUDA_Host compute buffer size = 12.94 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
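The KV cache figures a few lines up follow directly from the shapes: one f16 K row and one f16 V row per layer per cache slot, each n_embd_k_gqa (= n_embd_v_gqa = 1024) wide. A quick check:

# Sanity-checking the llama_kv_cache_unified numbers above.
kv_size, n_layer, width, f16_bytes = 2528, 32, 1024, 2  # values from the log

k_mib = kv_size * n_layer * width * f16_bytes / 2**20
print(k_mib, 2 * k_mib)  # 158.0 (K) and 316.0 (K+V) MiB, matching the log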
- time=2025-07-19T18:09:06.459+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.51 seconds"
- time=2025-07-19T18:09:06.462+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2521 prompt=3125 keep=5 new=2521
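The truncation warning keeps the first keep=5 tokens (the leading special tokens) and then the newest tokens that still fit, discarding the middle of the prompt. A minimal sketch of that policy (an assumption about the behavior, not Ollama's actual runner code):

def truncate(tokens, limit, keep):
    # Keep the first `keep` tokens, fill the rest of the window with the newest ones.
    if len(tokens) <= limit:
        return tokens
    return tokens[:keep] + tokens[-(limit - keep):]

print(len(truncate(list(range(3125)), limit=2521, keep=5)))  # 2521, as logged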
- [GIN] 2025/07/19 - 18:09:19 | 200 | 18.661376s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:09:50.430+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="31.2 GiB"
- time=2025-07-19T18:09:50.430+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="19.8 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="826.0 MiB"
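This offload line is the scheduler's budgeting step: a full load would need 26.8 GiB against 20.0 GiB available, so it walks the layer count down until the partial estimate fits, landing on 23 of 33 layers. A rough sketch of that budget using only the logged fields (an approximation, not Ollama's actual scheduler, which also reserves space for non-repeating weights and overhead):

GIB, MIB = 1024**3, 1024**2

available      = 20.0 * GIB   # memory.available
weights_repeat = 24.5 * GIB   # memory.weights.repeating (32 repeating layers)
graph_partial  = 826.0 * MIB  # memory.graph.partial
kv_total       = 256.0 * MIB  # memory.required.kv

per_layer = (weights_repeat + kv_total) / 32
print(int((available - graph_partial) / per_layer))  # ~24 with this crude budget; the real estimator settles on 23

Consistently, the resulting weight split (CUDA_Host 7216.77 MiB + CUDA0 18001.09 MiB ≈ 24.6 GiB) adds back up to the file size.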
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: same 40 key-value pairs and tensor-type summary as above)
- print_info / load: (file, tokenizer, and vocab lines identical to the blocks above, except vocab_only = 1 and model type = ?B for this vocab-only load)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:09:50.454+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 1913 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53442"
- time=2025-07-19T18:09:50.457+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:09:50.457+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:09:50.457+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:09:50.507+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:09:50.584+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
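load_backend picks the most specific CPU variant the host supports; here the AVX512_VBMI/VNNI flags on the system line select ggml-cpu-icelake.dll. To inspect the same features yourself, a small sketch with the third-party py-cpuinfo package (flag spellings vary by platform, so treat the names below as assumptions):

import cpuinfo  # pip install py-cpuinfo

flags = set(cpuinfo.get_cpu_info().get("flags", []))
for f in ("avx2", "avx512f", "avx512_vbmi", "avx512_vnni", "f16c", "fma"):
    print(f, f in flags)  # AVX-512 sub-flag spellings differ by OS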
- time=2025-07-19T18:09:50.584+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53442"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- time=2025-07-19T18:09:50.708+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- print_info / load: (identical to the full print_info block above)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 1913
- llama_context: n_ctx_per_seq = 1913
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (1913) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 1920, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 172.50 MiB
- llama_kv_cache_unified: CPU KV buffer size = 67.50 MiB
- llama_kv_cache_unified: KV self size = 240.00 MiB, K (f16): 120.00 MiB, V (f16): 120.00 MiB
- llama_context: CUDA0 compute buffer size = 405.00 MiB
- llama_context: CUDA_Host compute buffer size = 11.76 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:09:54.716+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.26 seconds"
- [GIN] 2025/07/19 - 18:10:02 | 200 | 12.9362782s | 100.107.36.63 | POST "/api/generate"
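Note the pattern around these generates: the client appears to request a window sized to each prompt, so the scheduler tears the runner down and restarts it with a new --ctx-size (2521, then 1913, 2573, ...), re-uploading ~18 GiB of weights each time; those 4-16 s runner starts are a substantial share of the 12-31 s request latencies. Pinning a fixed window keeps one runner alive. A hedged example against the documented /api/generate endpoint (the model tag mixtral:8x7b is an assumption about what this server pulled):

import json, urllib.request

req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps({
        "model": "mixtral:8x7b",       # assumed tag for the blob above
        "prompt": "Hello",
        "stream": False,
        "options": {"num_ctx": 4096},  # fixed window -> no runner restarts
    }).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.load(urllib.request.urlopen(req))["response"])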
- time=2025-07-19T18:10:39.725+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.4 GiB" free_swap="30.7 GiB"
- time=2025-07-19T18:10:39.726+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=22 layers.split="" memory.available="[19.8 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.1 GiB" memory.required.kv="321.6 MiB" memory.required.allocations="[19.1 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="197.8 MiB" memory.graph.partial="827.0 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- print_info / load: (vocab-only block, identical to the one above)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:10:39.750+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2573 --batch-size 512 --n-gpu-layers 22 --threads 16 --no-mmap --parallel 1 --port 53489"
- time=2025-07-19T18:10:39.753+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:10:39.753+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:10:39.753+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:10:39.808+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:10:39.893+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:10:39.893+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53489"
- time=2025-07-19T18:10:40.004+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- print_info / load: (identical to the full print_info block above)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 22 repeating layers to GPU
- load_tensors: offloaded 22/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7999.43 MiB
- load_tensors: CUDA0 model buffer size = 17218.44 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2573
- llama_context: n_ctx_per_seq = 2573
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2573) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2592, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 222.75 MiB
- llama_kv_cache_unified: CPU KV buffer size = 101.25 MiB
- llama_kv_cache_unified: KV self size = 324.00 MiB, K (f16): 162.00 MiB, V (f16): 162.00 MiB
- llama_context: CUDA0 compute buffer size = 394.06 MiB
- llama_context: CUDA_Host compute buffer size = 13.07 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 124 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:10:56.035+02:00 level=INFO source=server.go:637 msg="llama runner started in 16.28 seconds"
- time=2025-07-19T18:10:56.040+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2573 prompt=2966 keep=5 new=2573
- [GIN] 2025/07/19 - 18:11:08 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 18:11:10 | 200 | 31.4398242s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:11:43.020+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="31.1 GiB"
- time=2025-07-19T18:11:43.021+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="279.4 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="176.1 MiB" memory.graph.partial="826.4 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- print_info / load: (vocab-only block, identical to the one above)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:11:43.045+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2235 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53552"
- time=2025-07-19T18:11:43.048+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:11:43.048+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:11:43.048+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:11:43.102+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:11:43.175+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:11:43.176+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53552"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- print_info / load: (identical to the full print_info block above)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T18:11:43.304+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2235
- llama_context: n_ctx_per_seq = 2235
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2235) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2240, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 201.25 MiB
- llama_kv_cache_unified: CPU KV buffer size = 78.75 MiB
- llama_kv_cache_unified: KV self size = 280.00 MiB, K (f16): 140.00 MiB, V (f16): 140.00 MiB
- llama_context: CUDA0 compute buffer size = 393.38 MiB
- llama_context: CUDA_Host compute buffer size = 12.38 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:11:47.813+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.76 seconds"
- time=2025-07-19T18:11:47.816+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2235 prompt=2503 keep=5 new=2235
- [GIN] 2025/07/19 - 18:12:02 | 200 | 20.2151959s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:12:39.591+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="31.2 GiB"
- time=2025-07-19T18:12:39.591+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="339.2 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="206.9 MiB" memory.graph.partial="827.3 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- print_info / load: (vocab-only block, identical to the one above)
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:12:39.617+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2714 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53589"
- time=2025-07-19T18:12:39.620+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:12:39.620+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:12:39.620+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:12:39.676+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:12:39.757+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:12:39.758+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53589"
- time=2025-07-19T18:12:39.871+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: (metadata dump omitted: identical to the dump above)
- print_info / load: (identical to the full print_info block above)
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2714
- llama_context: n_ctx_per_seq = 2714
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2714) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2720, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 244.38 MiB
- llama_kv_cache_unified: CPU KV buffer size = 95.62 MiB
- llama_kv_cache_unified: KV self size = 340.00 MiB, K (f16): 170.00 MiB, V (f16): 170.00 MiB
- llama_context: CUDA0 compute buffer size = 394.31 MiB
- llama_context: CUDA_Host compute buffer size = 13.32 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
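Editor's note: the KV cache numbers above are deterministic given kv_size and the GQA head sizes. A worked check of the 340.00 MiB total and its CUDA0/CPU split (23 of 32 layers offloaded):

    # Check "KV self size = 340.00 MiB, K: 170.00 MiB, V: 170.00 MiB".
    # Per layer, K (and likewise V) stores kv_size * n_embd_k_gqa f16
    # values at 2 bytes each.
    kv_size, n_layer, n_embd_kv, bytes_f16 = 2720, 32, 1024, 2
    k = kv_size * n_embd_kv * bytes_f16 * n_layer / 2**20  # 170.0 MiB
    total = 2 * k                                          # 340.0 MiB (K + V)
    gpu = total * 23 / n_layer  # 23 offloaded layers -> ~244.38 MiB on CUDA0
    cpu = total - gpu           # 9 CPU layers        -> ~95.62 MiB
    print(k, total, round(gpu, 2), round(cpu, 2))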
- time=2025-07-19T18:12:44.130+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.51 seconds"
- time=2025-07-19T18:12:44.134+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2714 prompt=3489 keep=5 new=2714
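Editor's note: the WARN above means the prompt (3489 tokens) exceeded the effective context (2714), so the runner trimmed it. The keep=5 field suggests the first 5 tokens (BOS plus the template head) are preserved and the budget is filled from the most recent tokens. A minimal sketch of that keep-head policy, offered as an interpretation of the logged fields rather than Ollama's verbatim runner code:

    # Hypothetical illustration of the "truncating input prompt" warning:
    # keep the first `keep` tokens, then fill the rest of `limit` with the
    # tail of the prompt. Field values taken from the log line above.
    def truncate(tokens, limit, keep):
        if len(tokens) <= limit:
            return tokens
        return tokens[:keep] + tokens[-(limit - keep):]

    tokens = list(range(3489))
    new = truncate(tokens, limit=2714, keep=5)
    print(len(new))  # 2714, matching new=2714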
- [GIN] 2025/07/19 - 18:12:53 | 200 | 14.7450462s | 100.107.36.63 | POST "/api/generate"
- [GIN] 2025/07/19 - 18:13:17 | 200 | 635.6µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 18:13:17 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 18:13:17 | 200 | 0s | 192.168.1.2 | GET "/api/version"
- [GIN] 2025/07/19 - 18:13:18 | 200 | 511.7µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 18:13:18 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 18:13:21 | 200 | 557.6µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 18:13:21 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- [GIN] 2025/07/19 - 18:13:30 | 200 | 1.0038ms | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 18:13:30 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- time=2025-07-19T18:13:43.589+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.1 GiB" free_swap="30.6 GiB"
- time=2025-07-19T18:13:43.590+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="328.1 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="201.2 MiB" memory.graph.partial="827.1 MiB"
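Editor's note: this offload line shows why only 23 of 33 layers land on the GPU: a full load would need 26.9 GiB against 20.0 GiB available, so the scheduler fits as many repeating layers as the budget allows. A back-of-the-envelope version of that fit using the logged sizes; Ollama's real accounting in server.go tracks KV, graph, and nonrepeating buffers per layer, so this is only a rough approximation:

    # Crude layer-fit estimate from the logged offload numbers. Assumes
    # repeating weights split evenly across 32 layers; the partial-graph
    # buffer stands in for fixed overhead.
    GiB = 1024**3
    available = 20.0 * GiB                # memory.available
    repeating = 24.5 * GiB / 32           # ~0.77 GiB of weights per layer
    overhead = 827.1 / 1024 * GiB         # memory.graph.partial
    layers = int((available - overhead) // repeating)
    print(min(layers, 32))  # ~25; the real scheduler lands on 23 after
                            # reserving KV cache and other allocations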
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:13:43.614+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2625 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53682"
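Editor's note: notice how --ctx-size keeps shrinking across restarts (2714, then 2625, 2511, and later 2027). With no explicit num_ctx on the request, the scheduler appears to re-derive a context that fits the free memory of the moment, and any prompt larger than that gets the truncation warning seen above. Pinning the context per request avoids the guesswork. A sketch using the documented options.num_ctx field of /api/generate; the host URL and model tag are placeholders, not taken from this log:

    # Pin the context window per request instead of relying on the
    # auto-derived --ctx-size. Uses the documented /api/generate
    # "options.num_ctx" field; host and model tag are assumptions.
    import json, urllib.request

    payload = {
        "model": "mixtral:8x7b",        # assumed tag for the blob above
        "prompt": "Why is the sky blue?",
        "stream": False,
        "options": {"num_ctx": 8192},   # request an 8k context explicitly
    }
    req = urllib.request.Request(
        "http://localhost:11434/api/generate",  # placeholder host
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.load(resp)["response"])

A larger pinned num_ctx will of course raise the KV cache and compute buffer sizes, which on this box pushes even more layers off the 24 GiB GPU.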
- time=2025-07-19T18:13:43.617+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:13:43.617+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:13:43.617+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:13:43.667+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:13:43.749+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:13:43.750+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53682"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- time=2025-07-19T18:13:43.868+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- [GIN] 2025/07/19 - 18:13:52 | 200 | 518.5µs | 192.168.1.2 | GET "/api/tags"
- [GIN] 2025/07/19 - 18:13:52 | 200 | 0s | 192.168.1.2 | GET "/api/ps"
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2625
- llama_context: n_ctx_per_seq = 2625
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2625) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2656, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 238.62 MiB
- llama_kv_cache_unified: CPU KV buffer size = 93.38 MiB
- llama_kv_cache_unified: KV self size = 332.00 MiB, K (f16): 166.00 MiB, V (f16): 166.00 MiB
- llama_context: CUDA0 compute buffer size = 394.19 MiB
- llama_context: CUDA_Host compute buffer size = 13.19 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:13:58.898+02:00 level=INFO source=server.go:637 msg="llama runner started in 15.28 seconds"
- time=2025-07-19T18:13:58.903+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2625 prompt=3240 keep=5 new=2625
- [GIN] 2025/07/19 - 18:14:14 | 200 | 31.14405s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:15:50.453+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="31.2 GiB"
- time=2025-07-19T18:15:50.453+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="313.9 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="193.8 MiB" memory.graph.partial="826.9 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:15:50.480+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2511 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53709"
- time=2025-07-19T18:15:50.483+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:15:50.483+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:15:50.483+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:15:50.536+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:15:50.615+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:15:50.615+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53709"
- time=2025-07-19T18:15:50.733+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2511
- llama_context: n_ctx_per_seq = 2511
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2511) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2528, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 227.12 MiB
- llama_kv_cache_unified: CPU KV buffer size = 88.88 MiB
- llama_kv_cache_unified: KV self size = 316.00 MiB, K (f16): 158.00 MiB, V (f16): 158.00 MiB
- llama_context: CUDA0 compute buffer size = 393.94 MiB
- llama_context: CUDA_Host compute buffer size = 12.94 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:15:55.240+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.76 seconds"
- time=2025-07-19T18:15:55.243+02:00 level=WARN source=runner.go:128 msg="truncating input prompt" limit=2511 prompt=2835 keep=5 new=2511
- [GIN] 2025/07/19 - 18:16:08 | 200 | 0s | 192.168.1.1 | GET "/"
- [GIN] 2025/07/19 - 18:16:10 | 200 | 20.3360164s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:16:11.200+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.6 GiB" free_swap="31.3 GiB"
- time=2025-07-19T18:16:11.200+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.1 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.8 GiB" memory.required.partial="19.8 GiB" memory.required.kv="256.0 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="164.0 MiB" memory.graph.partial="826.0 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors
- time=2025-07-19T18:16:11.224+02:00 level=INFO source=server.go:438 msg="starting llama server" cmd="C:\\Users\\Haldi\\AppData\\Local\\Programs\\Ollama\\ollama.exe runner --model C:\\Users\\Haldi\\.ollama\\models\\blobs\\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 --ctx-size 2027 --batch-size 512 --n-gpu-layers 23 --threads 16 --no-mmap --parallel 1 --port 53712"
- time=2025-07-19T18:16:11.226+02:00 level=INFO source=sched.go:483 msg="loaded runners" count=1
- time=2025-07-19T18:16:11.226+02:00 level=INFO source=server.go:598 msg="waiting for llama runner to start responding"
- time=2025-07-19T18:16:11.227+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server error"
- time=2025-07-19T18:16:11.280+02:00 level=INFO source=runner.go:815 msg="starting go runner"
- ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
- ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
- ggml_cuda_init: found 1 CUDA devices:
- Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
- load_backend: loaded CUDA backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cuda.dll
- load_backend: loaded CPU backend from C:\Users\Haldi\AppData\Local\Programs\Ollama\lib\ollama\ggml-cpu-icelake.dll
- time=2025-07-19T18:16:11.355+02:00 level=INFO source=ggml.go:104 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.AVX512=1 CPU.0.AVX512_VBMI=1 CPU.0.AVX512_VNNI=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(clang)
- time=2025-07-19T18:16:11.356+02:00 level=INFO source=runner.go:874 msg="Server listening on 127.0.0.1:53712"
- llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4090) - 22994 MiB free
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 0
- print_info: n_ctx_train = 32768
- print_info: n_embd = 4096
- print_info: n_layer = 32
- print_info: n_head = 32
- print_info: n_head_kv = 8
- print_info: n_rot = 128
- print_info: n_swa = 0
- print_info: n_swa_pattern = 1
- print_info: n_embd_head_k = 128
- print_info: n_embd_head_v = 128
- print_info: n_gqa = 4
- print_info: n_embd_k_gqa = 1024
- print_info: n_embd_v_gqa = 1024
- print_info: f_norm_eps = 0.0e+00
- print_info: f_norm_rms_eps = 1.0e-05
- print_info: f_clamp_kqv = 0.0e+00
- print_info: f_max_alibi_bias = 0.0e+00
- print_info: f_logit_scale = 0.0e+00
- print_info: f_attn_scale = 0.0e+00
- print_info: n_ff = 14336
- print_info: n_expert = 8
- print_info: n_expert_used = 2
- print_info: causal attn = 1
- print_info: pooling type = 0
- print_info: rope type = 0
- print_info: rope scaling = linear
- print_info: freq_base_train = 1000000.0
- print_info: freq_scale_train = 1
- print_info: n_ctx_orig_yarn = 32768
- print_info: rope_finetuned = unknown
- print_info: ssm_d_conv = 0
- print_info: ssm_d_inner = 0
- print_info: ssm_d_state = 0
- print_info: ssm_dt_rank = 0
- print_info: ssm_dt_b_c_rms = 0
- print_info: model type = 8x7B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- load_tensors: loading model tensors, this can take a while... (mmap = false)
- time=2025-07-19T18:16:11.478+02:00 level=INFO source=server.go:632 msg="waiting for server to become available" status="llm server loading model"
- load_tensors: offloading 23 repeating layers to GPU
- load_tensors: offloaded 23/33 layers to GPU
- load_tensors: CUDA_Host model buffer size = 7216.77 MiB
- load_tensors: CUDA0 model buffer size = 18001.09 MiB
- llama_context: constructing llama_context
- llama_context: n_seq_max = 1
- llama_context: n_ctx = 2027
- llama_context: n_ctx_per_seq = 2027
- llama_context: n_batch = 512
- llama_context: n_ubatch = 512
- llama_context: causal_attn = 1
- llama_context: flash_attn = 0
- llama_context: freq_base = 1000000.0
- llama_context: freq_scale = 1
- llama_context: n_ctx_per_seq (2027) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
- llama_context: CPU output buffer size = 0.14 MiB
- llama_kv_cache_unified: kv_size = 2048, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1, padding = 32
- llama_kv_cache_unified: CUDA0 KV buffer size = 184.00 MiB
- llama_kv_cache_unified: CPU KV buffer size = 72.00 MiB
- llama_kv_cache_unified: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB
- llama_context: CUDA0 compute buffer size = 405.00 MiB
- llama_context: CUDA_Host compute buffer size = 12.01 MiB
- llama_context: graph nodes = 1574
- llama_context: graph splits = 112 (with bs=512), 3 (with bs=1)
- time=2025-07-19T18:16:15.736+02:00 level=INFO source=server.go:637 msg="llama runner started in 4.51 seconds"
- [GIN] 2025/07/19 - 18:16:23 | 200 | 13.0084194s | 100.107.36.63 | POST "/api/generate"
- time=2025-07-19T18:17:45.516+02:00 level=INFO source=server.go:135 msg="system memory" total="47.1 GiB" free="32.3 GiB" free_swap="30.9 GiB"
- time=2025-07-19T18:17:45.517+02:00 level=INFO source=server.go:175 msg=offload library=cuda layers.requested=-1 layers.model=33 layers.offload=23 layers.split="" memory.available="[20.2 GiB]" memory.gpu_overhead="0 B" memory.required.full="26.9 GiB" memory.required.partial="19.8 GiB" memory.required.kv="328.1 MiB" memory.required.allocations="[19.8 GiB]" memory.weights.total="24.6 GiB" memory.weights.repeating="24.5 GiB" memory.weights.nonrepeating="102.6 MiB" memory.graph.full="201.2 MiB" memory.graph.partial="827.1 MiB"
- llama_model_loader: loaded meta data with 40 key-value pairs and 323 tensors from C:\Users\Haldi\.ollama\models\blobs\sha256-f2dc41fa964b42bfe34e9fb09c0acdcfbfd6e52f1332930b4eacc9d6ad1c6cd2 (version GGUF V3 (latest))
- llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
- llama_model_loader: - kv 0: general.architecture str = llama
- llama_model_loader: - kv 1: general.type str = model
- llama_model_loader: - kv 2: general.name str = Mixtral 8x7B Instruct v0.1
- llama_model_loader: - kv 3: general.version str = v0.1
- llama_model_loader: - kv 4: general.finetune str = Instruct
- llama_model_loader: - kv 5: general.basename str = Mixtral
- llama_model_loader: - kv 6: general.size_label str = 8x7B
- llama_model_loader: - kv 7: general.license str = apache-2.0
- llama_model_loader: - kv 8: general.base_model.count u32 = 1
- llama_model_loader: - kv 9: general.base_model.0.name str = Mixtral 8x7B v0.1
- llama_model_loader: - kv 10: general.base_model.0.version str = v0.1
- llama_model_loader: - kv 11: general.base_model.0.organization str = Mistralai
- llama_model_loader: - kv 12: general.base_model.0.repo_url str = https://huggingface.co/mistralai/Mixt...
- llama_model_loader: - kv 13: general.languages arr[str,5] = ["fr", "it", "de", "es", "en"]
- llama_model_loader: - kv 14: llama.block_count u32 = 32
- llama_model_loader: - kv 15: llama.context_length u32 = 32768
- llama_model_loader: - kv 16: llama.embedding_length u32 = 4096
- llama_model_loader: - kv 17: llama.feed_forward_length u32 = 14336
- llama_model_loader: - kv 18: llama.attention.head_count u32 = 32
- llama_model_loader: - kv 19: llama.attention.head_count_kv u32 = 8
- llama_model_loader: - kv 20: llama.rope.freq_base f32 = 1000000.000000
- llama_model_loader: - kv 21: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
- llama_model_loader: - kv 22: llama.expert_count u32 = 8
- llama_model_loader: - kv 23: llama.expert_used_count u32 = 2
- llama_model_loader: - kv 24: general.file_type u32 = 2
- llama_model_loader: - kv 25: llama.vocab_size u32 = 32000
- llama_model_loader: - kv 26: llama.rope.dimension_count u32 = 128
- llama_model_loader: - kv 27: tokenizer.ggml.model str = llama
- llama_model_loader: - kv 28: tokenizer.ggml.pre str = default
- llama_model_loader: - kv 29: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
- llama_model_loader: - kv 30: tokenizer.ggml.scores arr[f32,32000] = [-1000.000000, -1000.000000, -1000.00...
- llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,32000] = [3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
- llama_model_loader: - kv 32: tokenizer.ggml.bos_token_id u32 = 1
- llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 2
- llama_model_loader: - kv 34: tokenizer.ggml.unknown_token_id u32 = 0
- llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = true
- llama_model_loader: - kv 36: tokenizer.ggml.add_eos_token bool = false
- llama_model_loader: - kv 37: tokenizer.chat_template str = {%- if messages[0]['role'] == 'system...
- llama_model_loader: - kv 38: tokenizer.ggml.add_space_prefix bool = false
- llama_model_loader: - kv 39: general.quantization_version u32 = 2
- llama_model_loader: - type f32: 97 tensors
- llama_model_loader: - type q4_0: 161 tensors
- llama_model_loader: - type q8_0: 64 tensors
- llama_model_loader: - type q6_K: 1 tensors
- print_info: file format = GGUF V3 (latest)
- print_info: file type = Q4_0
- print_info: file size = 24.63 GiB (4.53 BPW)
- load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
- load: special tokens cache size = 3
- load: token to piece cache size = 0.1637 MB
- print_info: arch = llama
- print_info: vocab_only = 1
- print_info: model type = ?B
- print_info: model params = 46.70 B
- print_info: general.name = Mixtral 8x7B Instruct v0.1
- print_info: vocab type = SPM
- print_info: n_vocab = 32000
- print_info: n_merges = 0
- print_info: BOS token = 1 '<s>'
- print_info: EOS token = 2 '</s>'
- print_info: UNK token = 0 '<unk>'
- print_info: LF token = 13 '<0x0A>'
- print_info: EOG token = 2 '</s>'
- print_info: max token length = 48
- llama_model_load: vocab only - skipping tensors