Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Docker Compose service that runs an SGLang server for the
# 0xSero/GLM-5.1-478B-A42B-REAP-NVFP4 model across four GPUs
# (tensor-parallel size 4), served as "glm-5.1-nvfp4" on port 30874.
#
# This mapping is a single service entry: nest it (indented by two spaces)
# under the top-level `services:` key of the compose file.
inf-glm-5-1-nvfp4:
  image: inference-inf-glm-5-1-nvfp4:latest
  build:
    context: .
    dockerfile: Dockerfile.sglang
    args:
      SGLANG_BASE: voipmonitor/sglang:cu130
  restart: unless-stopped
  ports:
    # Must match the `--port` value passed to sglang.launch_server below.
    - "30874:30874"
  environment:
    # GPU selection; kept in sync with deploy.resources...device_ids below.
    - NVIDIA_VISIBLE_DEVICES=0,1,2,3
    - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    - CUDA_DEVICE_ORDER=PCI_BUS_ID
    - OMP_NUM_THREADS=8
    - SAFETENSORS_FAST_GPU=1
    - HF_HUB_OFFLINE=0
    # SECURITY: never commit the token itself. It is interpolated from the
    # host environment or an untracked `.env` file next to the compose file.
    # A literal token was previously stored here and published; revoke it.
    - "HF_TOKEN=${HF_TOKEN:-}"
    # NOTE(review): three DeepGEMM toggles are set (two "enable=0" and one
    # "disable=1"); confirm which of them the SGLang build in this image
    # actually reads and drop the rest.
    - SGLANG_ENABLE_JIT_DEEPGEMM=0
    - SGLANG_ENABLE_DEEP_GEMM=0
    - SGLANG_DISABLE_DEEP_GEMM=1
    - SGLANG_SET_CPU_AFFINITY=1
    - FLASHINFER_DISABLE_VERSION_CHECK=1
    # NCCL / Gloo transport settings (both pinned to the loopback interface).
    - NCCL_IB_DISABLE=1
    - NCCL_P2P_DISABLE=0
    - NCCL_P2P_LEVEL=PIX
    - NCCL_SHM_DISABLE=0
    - NCCL_BUFFSIZE=4194304
    - NCCL_MIN_NCHANNELS=8
    - NCCL_SOCKET_IFNAME=lo
    - GLOO_SOCKET_IFNAME=lo
    - NCCL_CUMEM_HOST_ENABLE=0
    - TORCH_NCCL_BLOCKING_WAIT=1
    - TORCH_NCCL_ASYNC_ERROR_HANDLING=1
    - NVIDIA_TF32_OVERRIDE=1
    - SGLANG_ENABLE_SPEC_V2=1
  volumes:
    # Host Hugging Face cache; the --chat-template path below reads from it.
    - /home/x/.cache/huggingface:/root/.cache/huggingface
    - ../data/glm-5-1-nvfp4:/data
    - ../data/glm-5-1-nvfp4/jit-cache:/cache/jit
  shm_size: 16gb
  ulimits:
    memlock:
      soft: -1
      hard: -1
    nofile:
      soft: 65536
      hard: 65536
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            device_ids: ["0", "1", "2", "3"]
            capabilities: [gpu]
  healthcheck:
    # Probe the port the server listens on (`--port 30874`). The previous
    # value, 24701, is not a port this service binds, so the check could
    # never pass.
    test: ["CMD", "curl", "-f", "http://localhost:30874/health"]
    interval: 30s
    timeout: 10s
    retries: 5
    start_period: 300s
  logging:
    driver: json-file
    options:
      max-size: "100m"
      max-file: "3"
  # Every argument is a separate list item and numeric values are quoted so
  # that each one reaches the process as a string.
  command:
    - python3
    - -m
    - sglang.launch_server
    - --model-path
    - "0xSero/GLM-5.1-478B-A42B-REAP-NVFP4"
    - '--tensor-parallel-size'
    - '4'
    - '--quantization'
    - 'modelopt_fp4'
    - '--kv-cache-dtype'
    - 'fp8_e4m3'
    - '--chunked-prefill-size'
    - '4096'
    - '--page-size'
    - '128'
    - '--triton-attention-num-kv-splits'
    - '64'
    - '--moe-runner-backend'
    - 'cutlass'
    - '--fp4-gemm-backend'
    - 'flashinfer_cudnn'
    - '--cuda-graph-max-bs'
    - '4'
    - '--tool-call-parser'
    - 'glm47'
    - '--reasoning-parser'
    - 'glm45'
    - '--model-loader-extra-config'
    - '{"enable_multithread_load": true, "num_threads": 16}'
    - '--json-model-override-args'
    - '{"index_topk_pattern": "FFSFSSSFSSFFFSSSFFFSFSSSSSSFFSFFSFFSSFFFFFFSFFFFFSFFSSSSSSFSFFFSFSSSFSFFSFFSSS"}'
    - '--context-length'
    - '202752'
    - '--max-running-requests'
    - '2'
    - '--watchdog-timeout'
    - '1800'
    - --served-model-name
    - glm-5.1-nvfp4
    - --port
    - "30874"
    - --mem-fraction-static
    - "0.94"
    - --trust-remote-code
    - --host
    - "0.0.0.0"
    - --chat-template
    # NOTE(review): Hugging Face hub cache snapshot directories are normally
    # named after a commit hash; confirm that `snapshots/main` really exists
    # inside the mounted cache.
    - "/root/.cache/huggingface/hub/models--0xSero--GLM-5.1-478B-A42B-REAP-NVFP4/snapshots/main/chat_template.jinja"
Advertisement
Add Comment
Please, Sign In to add comment