# Untitled

models:
  # Qwen3 4B Thinking (2507 release), Q4_0 GGUF, served by llama-server.
  # Both the K and V caches are quantized to q4_0. llama.cpp needs flash
  # attention for a quantized V cache, so it is requested explicitly here,
  # the same way the other model entries in this file do.
  "qwen3-4b-thinking":
    name: "Qwen3 4B Thinking Q4.0"
    # ${PORT} is a placeholder; presumably the launcher substitutes the
    # listening port before running the command - confirm with the consumer.
    cmd: |
      /app/llama-server \
      --model /models/Qwen3-4B-Thinking-2507-Q4_0.gguf \
      --host 0.0.0.0 \
      --port ${PORT} \
      --no-mmap \
      --cache-type-k q4_0 \
      --cache-type-v q4_0 \
      --flash-attn on \
      --cache-reuse 512 \
      --no-webui
    # NOTE(review): unit is defined by the consuming tool (presumably idle
    # seconds before the model is unloaded) - confirm that 5 is intended.
    ttl: 5
  # GPT-OSS 20B, Q4_0 GGUF, served by llama-server with a 16K context
  # (--ctx-size 16384, as advertised in the display name).
  "gpt-oss-20b-thinking":
    name: "GPT OSS 20B Thinking Q4.0 16K ctx"
    # Every line except the last ends with a backslash so that the whole
    # command stays a single invocation when it is interpreted with shell
    # line-continuation rules. ${PORT} is a placeholder; presumably the
    # launcher substitutes the listening port - confirm with the consumer.
    cmd: |
      /app/llama-server \
      --model /models/gpt-oss-20b-Q4_0.gguf \
      --host 0.0.0.0 \
      --port ${PORT} \
      --cache-type-k q4_0 \
      --cache-type-v q4_0 \
      --flash-attn on \
      --cache-reuse 512 \
      --ctx-size 16384 \
      --n-gpu-layers 10 \
      --n-cpu-moe 14 \
      --no-webui
    # NOTE(review): unit is defined by the consuming tool (presumably idle
    # seconds before the model is unloaded) - confirm that 5 is intended.
    ttl: 5
  # Ministral 3 14B Reasoning (2512 release), Q4_0 GGUF, served by
  # llama-server with an 8192-token context and a single parallel slot.
  ministral-14b-thinking:
    name: 'Ministral 3 14B Thinking'
    # NOTE(review): unit is defined by the consuming tool (presumably idle
    # seconds before the model is unloaded) - confirm.
    ttl: 120
    # Launch command; ${PORT} is a placeholder that the launcher is
    # presumably expected to substitute - confirm with the consumer.
    cmd: |
      /app/llama-server \
      --model /models/Ministral-3-14B-Reasoning-2512-Q4_0.gguf \
      --host 0.0.0.0 \
      --port ${PORT} \
      --cache-type-k q4_0 \
      --cache-type-v q4_0 \
      --flash-attn on \
      --cache-reuse 512 \
      --ctx-size 8192 \
      --n-gpu-layers 35 \
      --no-mmap \
      --parallel 1 \
      --no-webui