Guest User

SGLang for GLM-5.1 NVFP4 on RTX 6000 Pros

a guest
Apr 25th, 2026
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.26 KB | None | 0 0
  # SGLang server for the 0xSero/GLM-5.1-478B-A42B-REAP-NVFP4 checkpoint,
  # launched with tensor parallelism across GPUs 0-3 and served on port 30874.
  # This mapping is expected to sit under the top-level `services:` key.
  inf-glm-5-1-nvfp4:
    image: inference-inf-glm-5-1-nvfp4:latest
    build:
      context: .
      dockerfile: Dockerfile.sglang
      args:
        SGLANG_BASE: voipmonitor/sglang:cu130
    restart: unless-stopped
    ports:
      # Must stay in sync with `--port` in `command` and with the healthcheck URL.
      - "30874:30874"
    environment:
      # GPU selection (same four devices as deploy.resources below).
      - NVIDIA_VISIBLE_DEVICES=0,1,2,3
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
      - CUDA_DEVICE_ORDER=PCI_BUS_ID
      - OMP_NUM_THREADS=8
      - SAFETENSORS_FAST_GPU=1
      # Hugging Face access. The token is read from the host environment or
      # the project's .env file; never commit a literal token here. An unset
      # variable resolves to an empty string.
      - HF_HUB_OFFLINE=0
      - HF_TOKEN=${HF_TOKEN:-}
      # DeepGEMM switched off through all three flags.
      - SGLANG_ENABLE_JIT_DEEPGEMM=0
      - SGLANG_ENABLE_DEEP_GEMM=0
      - SGLANG_DISABLE_DEEP_GEMM=1
      - SGLANG_SET_CPU_AFFINITY=1
      - FLASHINFER_DISABLE_VERSION_CHECK=1
      # NCCL / Gloo transport settings.
      - NCCL_IB_DISABLE=1
      - NCCL_P2P_DISABLE=0
      - NCCL_P2P_LEVEL=PIX
      - NCCL_SHM_DISABLE=0
      - NCCL_BUFFSIZE=4194304
      - NCCL_MIN_NCHANNELS=8
      - NCCL_SOCKET_IFNAME=lo
      - GLOO_SOCKET_IFNAME=lo
      - NCCL_CUMEM_HOST_ENABLE=0
      # NOTE(review): both blocking-wait and async error handling are enabled;
      # confirm which one the installed PyTorch version actually honours.
      - TORCH_NCCL_BLOCKING_WAIT=1
      - TORCH_NCCL_ASYNC_ERROR_HANDLING=1
      - NVIDIA_TF32_OVERRIDE=1
      - SGLANG_ENABLE_SPEC_V2=1
    volumes:
      - /home/x/.cache/huggingface:/root/.cache/huggingface
      - ../data/glm-5-1-nvfp4:/data
      - ../data/glm-5-1-nvfp4/jit-cache:/cache/jit
    shm_size: 16gb
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0", "1", "2", "3"]
              capabilities: [gpu]
    healthcheck:
      # Probe the port the server actually listens on (`--port 30874`); the
      # previous URL pointed at 24701, which nothing in this service binds.
      test: ["CMD", "curl", "-f", "http://localhost:30874/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 300s
    logging:
      driver: json-file
      options:
        max-size: "100m"
        max-file: "3"
    command:
      - python3
      - -m
      - sglang.launch_server
      - --model-path
      - "0xSero/GLM-5.1-478B-A42B-REAP-NVFP4"
      - '--tensor-parallel-size'
      - '4'
      - '--quantization'
      - 'modelopt_fp4'
      - '--kv-cache-dtype'
      - 'fp8_e4m3'
      - '--chunked-prefill-size'
      - '4096'
      - '--page-size'
      - '128'
      - '--triton-attention-num-kv-splits'
      - '64'
      - '--moe-runner-backend'
      - 'cutlass'
      - '--fp4-gemm-backend'
      - 'flashinfer_cudnn'
      - '--cuda-graph-max-bs'
      - '4'
      - '--tool-call-parser'
      - 'glm47'
      - '--reasoning-parser'
      - 'glm45'
      - '--model-loader-extra-config'
      - '{"enable_multithread_load": true, "num_threads": 16}'
      - '--json-model-override-args'
      - '{"index_topk_pattern": "FFSFSSSFSSFFFSSSFFFSFSSSSSSFFSFFSFFSSFFFFFFSFFFFFSFFSSSSSSFSFFFSFSSSFSFFSFFSSS"}'
      - '--context-length'
      - '202752'
      - '--max-running-requests'
      - '2'
      - '--watchdog-timeout'
      - '1800'
      - --served-model-name
      - glm-5.1-nvfp4
      # Must match the published port and the healthcheck URL above.
      - --port
      - "30874"
      - --mem-fraction-static
      - "0.94"
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      # NOTE(review): Hugging Face cache snapshot directories are normally
      # named after a commit hash, not a branch; verify `snapshots/main`
      # exists inside the mounted cache before relying on this path.
      - --chat-template
      - "/root/.cache/huggingface/hub/models--0xSero--GLM-5.1-478B-A42B-REAP-NVFP4/snapshots/main/chat_template.jinja"
Advertisement
Add Comment
Please, Sign In to add comment