bullerwins — Untitled — Sep 9th, 2025
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1,0,2,4 VLLM_PP_LAYER_PARTITION="3,8,27,8" vllm serve \
  /mnt/llms/models/zai-org/GLM-4.5-Air-FP8/ \
  --served-model-name GLM \
  --swap-space 16 \
  --max-num-seqs 512 \
  --max-model-len 8192 \
  --max-seq-len-to-capture 8192 \
  --gpu-memory-utilization 0.9 \
  -pp 4 \
  --trust-remote-code \
  --disable-log-requests \
  --host 0.0.0.0 \
  --port 8000 --enforce-eager
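
Note on the run below: all pipeline workers crash during the initial memory-profiling pass with a CUDA illegal memory access inside torch.ops.vllm.fused_marlin_moe (the FP8 checkpoint is served through the Marlin weight-only fallback because these GPUs have no native FP8, as the warnings in the log state). The following is only a sketch of a debugging rerun, recombining hints already printed in this log (the CUDA_LAUNCH_BLOCKING=1 suggestion from the error text and the marlin_utils hint to prefer fp16 over bf16 on pre-SM90 GPUs); it is not a confirmed fix.

# Sketch: same invocation, with synchronous kernel launches for a usable
# stack trace and fp16 activations per the marlin_utils warning.
CUDA_LAUNCH_BLOCKING=1 \
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=1,0,2,4 \
VLLM_PP_LAYER_PARTITION="3,8,27,8" \
vllm serve /mnt/llms/models/zai-org/GLM-4.5-Air-FP8/ \
  --served-model-name GLM \
  --dtype float16 \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.9 \
  -pp 4 \
  --enforce-eager \
  --host 0.0.0.0 --port 8000
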
  15. INFO 09-09 20:28:26 [__init__.py:216] Automatically detected platform cuda.
  16. WARNING 09-09 20:28:29 [__init__.py:1756] argument '--disable-log-requests' is deprecated and replaced with '--enable-log-requests'. This will be removed in v0.12.0.
  17. (APIServer pid=511717) INFO 09-09 20:28:29 [api_server.py:1896] vLLM API server version 0.10.2rc2.dev191+g6fb278816.d20250909
  18. (APIServer pid=511717) INFO 09-09 20:28:29 [utils.py:328] non-default args: {'model_tag': '/mnt/llms/models/zai-org/GLM-4.5-Air-FP8/', 'host': '0.0.0.0', 'model': '/mnt/llms/models/zai-org/GLM-4.5-Air-FP8/', 'trust_remote_code': True, 'max_model_len': 8192, 'enforce_eager': True, 'served_model_name': ['GLM'], 'pipeline_parallel_size': 4, 'swap_space': 16.0, 'max_num_seqs': 512}
  19. (APIServer pid=511717) The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  20. (APIServer pid=511717) INFO 09-09 20:28:35 [__init__.py:744] Resolved architecture: Glm4MoeForCausalLM
  21. (APIServer pid=511717) `torch_dtype` is deprecated! Use `dtype` instead!
  22. (APIServer pid=511717) INFO 09-09 20:28:35 [__init__.py:1772] Using max model len 8192
  23. (APIServer pid=511717) WARNING 09-09 20:28:35 [_ipex_ops.py:16] Import error msg: No module named 'intel_extension_for_pytorch'
  24. (APIServer pid=511717) INFO 09-09 20:28:35 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=2048.
  25. (APIServer pid=511717) INFO 09-09 20:28:35 [__init__.py:3499] Cudagraph is disabled under eager mode
  26. INFO 09-09 20:28:40 [__init__.py:216] Automatically detected platform cuda.
  27. (EngineCore_DP0 pid=511959) INFO 09-09 20:28:42 [core.py:654] Waiting for init message from front-end.
  28. (EngineCore_DP0 pid=511959) INFO 09-09 20:28:42 [core.py:76] Initializing a V1 LLM engine (v0.10.2rc2.dev191+g6fb278816.d20250909) with config: model='/mnt/llms/models/zai-org/GLM-4.5-Air-FP8/', speculative_config=None, tokenizer='/mnt/llms/models/zai-org/GLM-4.5-Air-FP8/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=4, data_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=GLM, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=False, pooler_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":null,"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":0,"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":[],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":0,"local_cache_dir":null}
  29. (EngineCore_DP0 pid=511959) WARNING 09-09 20:28:42 [multiproc_worker_utils.py:273] Reducing Torch parallelism from 24 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
  30. (EngineCore_DP0 pid=511959) INFO 09-09 20:28:42 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_1a765819'), local_subscribe_addr='ipc:///tmp/d78e8889-6bf4-43df-a30a-613b5ff31999', remote_subscribe_addr=None, remote_addr_ipv6=False)
  31. INFO 09-09 20:28:46 [__init__.py:216] Automatically detected platform cuda.
  32. INFO 09-09 20:28:46 [__init__.py:216] Automatically detected platform cuda.
  33. INFO 09-09 20:28:46 [__init__.py:216] Automatically detected platform cuda.
  34. INFO 09-09 20:28:46 [__init__.py:216] Automatically detected platform cuda.
  35. INFO 09-09 20:28:49 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_a95d08bb'), local_subscribe_addr='ipc:///tmp/542d5194-cf38-44d5-9e62-cdd15b8bd909', remote_subscribe_addr=None, remote_addr_ipv6=False)
  36. INFO 09-09 20:28:49 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_98358131'), local_subscribe_addr='ipc:///tmp/b351aa86-c7e1-461c-9806-50449e56535f', remote_subscribe_addr=None, remote_addr_ipv6=False)
  37. INFO 09-09 20:28:50 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_bc961b2b'), local_subscribe_addr='ipc:///tmp/77374ef5-7f91-441b-88ad-95ac9000581f', remote_subscribe_addr=None, remote_addr_ipv6=False)
  38. INFO 09-09 20:28:50 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_8700974d'), local_subscribe_addr='ipc:///tmp/16999e09-b4d5-498a-ba40-055b6bebe96a', remote_subscribe_addr=None, remote_addr_ipv6=False)
  39. [W909 20:28:50.304204817 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:60461 (errno: 97 - Address family not supported by protocol).
  40. [W909 20:28:50.377584382 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:60461 (errno: 97 - Address family not supported by protocol).
  41. [W909 20:28:50.395981055 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:60461 (errno: 97 - Address family not supported by protocol).
  42. [W909 20:28:50.400721762 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [localhost]:60461 (errno: 97 - Address family not supported by protocol).
  43. [W909 20:28:50.401253860 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
  44. [W909 20:28:50.704116670 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
  45. [W909 20:28:51.046605499 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
  46. [W909 20:28:51.052565047 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
  47. [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  48. [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  49. [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  50. [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  51. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  52. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  53. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  54. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  55. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  56. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  57. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  58. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  59. [Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  60. [Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  61. [Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  62. [Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
  63. INFO 09-09 20:28:51 [__init__.py:1432] Found nccl from library libnccl.so.2
  64. INFO 09-09 20:28:51 [__init__.py:1432] Found nccl from library libnccl.so.2
  65. INFO 09-09 20:28:51 [pynccl.py:70] vLLM is using nccl==2.27.3
  66. INFO 09-09 20:28:51 [pynccl.py:70] vLLM is using nccl==2.27.3
  67. INFO 09-09 20:28:51 [__init__.py:1432] Found nccl from library libnccl.so.2
  68. INFO 09-09 20:28:51 [__init__.py:1432] Found nccl from library libnccl.so.2
  69. INFO 09-09 20:28:51 [pynccl.py:70] vLLM is using nccl==2.27.3
  70. INFO 09-09 20:28:51 [pynccl.py:70] vLLM is using nccl==2.27.3
  71. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  72. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  73. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  74. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  75. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  76. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  77. INFO 09-09 20:28:51 [parallel_state.py:1164] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
  78. INFO 09-09 20:28:51 [parallel_state.py:1164] rank 2 in world size 4 is assigned as DP rank 0, PP rank 2, TP rank 0, EP rank 0
  79. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  80. [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
  81. INFO 09-09 20:28:51 [parallel_state.py:1164] rank 3 in world size 4 is assigned as DP rank 0, PP rank 3, TP rank 0, EP rank 0
  82. INFO 09-09 20:28:51 [parallel_state.py:1164] rank 1 in world size 4 is assigned as DP rank 0, PP rank 1, TP rank 0, EP rank 0
  83. WARNING 09-09 20:28:51 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
  84. WARNING 09-09 20:28:51 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
  85. WARNING 09-09 20:28:51 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
  86. WARNING 09-09 20:28:51 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
  87. (Worker_PP2 pid=512070) INFO 09-09 20:28:51 [gpu_model_runner.py:2178] Starting to load model /mnt/llms/models/zai-org/GLM-4.5-Air-FP8/...
  88. (Worker_PP0 pid=512068) INFO 09-09 20:28:51 [gpu_model_runner.py:2178] Starting to load model /mnt/llms/models/zai-org/GLM-4.5-Air-FP8/...
  89. (Worker_PP3 pid=512071) INFO 09-09 20:28:51 [gpu_model_runner.py:2178] Starting to load model /mnt/llms/models/zai-org/GLM-4.5-Air-FP8/...
  90. (Worker_PP1 pid=512069) INFO 09-09 20:28:51 [gpu_model_runner.py:2178] Starting to load model /mnt/llms/models/zai-org/GLM-4.5-Air-FP8/...
  91. (Worker_PP3 pid=512071) INFO 09-09 20:28:52 [gpu_model_runner.py:2210] Loading model from scratch...
  92. (Worker_PP1 pid=512069) INFO 09-09 20:28:52 [gpu_model_runner.py:2210] Loading model from scratch...
  93. (Worker_PP2 pid=512070) INFO 09-09 20:28:52 [gpu_model_runner.py:2210] Loading model from scratch...
  94. (Worker_PP0 pid=512068) INFO 09-09 20:28:52 [gpu_model_runner.py:2210] Loading model from scratch...
  95. (Worker_PP1 pid=512069) INFO 09-09 20:28:52 [cuda.py:340] Using Flash Attention backend on V1 engine.
  96. (Worker_PP3 pid=512071) INFO 09-09 20:28:52 [cuda.py:340] Using Flash Attention backend on V1 engine.
  97. (Worker_PP2 pid=512070) INFO 09-09 20:28:52 [cuda.py:340] Using Flash Attention backend on V1 engine.
  98. (Worker_PP0 pid=512068) INFO 09-09 20:28:52 [cuda.py:340] Using Flash Attention backend on V1 engine.
  99. Loading safetensors checkpoint shards: 0% Completed | 0/47 [00:00<?, ?it/s]
  100. Loading safetensors checkpoint shards: 6% Completed | 3/47 [00:00<00:01, 25.55it/s]
  101. Loading safetensors checkpoint shards: 13% Completed | 6/47 [00:00<00:04, 8.32it/s]
  102. Loading safetensors checkpoint shards: 17% Completed | 8/47 [00:00<00:04, 8.86it/s]
  103. Loading safetensors checkpoint shards: 23% Completed | 11/47 [00:00<00:03, 11.87it/s]
  104. Loading safetensors checkpoint shards: 30% Completed | 14/47 [00:01<00:02, 14.22it/s]
  105. Loading safetensors checkpoint shards: 36% Completed | 17/47 [00:01<00:01, 16.08it/s]
  106. Loading safetensors checkpoint shards: 43% Completed | 20/47 [00:01<00:01, 17.59it/s]
  107. Loading safetensors checkpoint shards: 47% Completed | 22/47 [00:01<00:02, 8.87it/s]
  108. Loading safetensors checkpoint shards: 53% Completed | 25/47 [00:02<00:01, 11.07it/s]
  109. Loading safetensors checkpoint shards: 60% Completed | 28/47 [00:02<00:01, 13.09it/s]
  110. Loading safetensors checkpoint shards: 66% Completed | 31/47 [00:02<00:01, 14.85it/s]
  111. Loading safetensors checkpoint shards: 72% Completed | 34/47 [00:02<00:00, 16.28it/s]
  112. Loading safetensors checkpoint shards: 77% Completed | 36/47 [00:02<00:00, 12.97it/s]
  113. Loading safetensors checkpoint shards: 81% Completed | 38/47 [00:02<00:00, 12.87it/s]
  114. Loading safetensors checkpoint shards: 87% Completed | 41/47 [00:03<00:00, 14.77it/s]
  115. Loading safetensors checkpoint shards: 94% Completed | 44/47 [00:03<00:00, 16.33it/s]
  116. Loading safetensors checkpoint shards: 100% Completed | 47/47 [00:03<00:00, 17.46it/s]
  117. Loading safetensors checkpoint shards: 100% Completed | 47/47 [00:03<00:00, 13.75it/s]
  118. (Worker_PP0 pid=512068)
  119. (Worker_PP0 pid=512068) INFO 09-09 20:28:55 [default_loader.py:266] Loading weights took 3.42 seconds
  120. (Worker_PP0 pid=512068) WARNING 09-09 20:28:55 [marlin_utils_fp8.py:80] Your GPU does not have native support for FP8 computation but FP8 quantization is being used. Weight-only FP8 compression will be used leveraging the Marlin kernel. This may degrade performance for compute-heavy workloads.
  121. (Worker_PP0 pid=512068) INFO 09-09 20:28:56 [gpu_model_runner.py:2232] Model loading took 5.7690 GiB and 3.784014 seconds
  122. (Worker_PP1 pid=512069) INFO 09-09 20:28:58 [default_loader.py:266] Loading weights took 6.04 seconds
  123. (Worker_PP1 pid=512069) WARNING 09-09 20:28:58 [marlin_utils_fp8.py:80] Your GPU does not have native support for FP8 computation but FP8 quantization is being used. Weight-only FP8 compression will be used leveraging the Marlin kernel. This may degrade performance for compute-heavy workloads.
  124. (Worker_PP3 pid=512071) INFO 09-09 20:28:58 [default_loader.py:266] Loading weights took 6.39 seconds
  125. (Worker_PP3 pid=512071) WARNING 09-09 20:28:58 [marlin_utils_fp8.py:80] Your GPU does not have native support for FP8 computation but FP8 quantization is being used. Weight-only FP8 compression will be used leveraging the Marlin kernel. This may degrade performance for compute-heavy workloads.
  126. (Worker_PP1 pid=512069) INFO 09-09 20:28:59 [gpu_model_runner.py:2232] Model loading took 17.4959 GiB and 6.603870 seconds
  127. (Worker_PP3 pid=512071) INFO 09-09 20:28:59 [gpu_model_runner.py:2232] Model loading took 18.6522 GiB and 6.952122 seconds
  128. (Worker_PP2 pid=512070) INFO 09-09 20:29:07 [default_loader.py:266] Loading weights took 15.03 seconds
  129. (Worker_PP2 pid=512070) WARNING 09-09 20:29:07 [marlin_utils_fp8.py:80] Your GPU does not have native support for FP8 computation but FP8 quantization is being used. Weight-only FP8 compression will be used leveraging the Marlin kernel. This may degrade performance for compute-heavy workloads.
  130. (Worker_PP2 pid=512070) INFO 09-09 20:29:08 [gpu_model_runner.py:2232] Model loading took 59.0111 GiB and 16.300322 seconds
  131. (Worker_PP0 pid=512068) INFO 09-09 20:29:10 [marlin_utils.py:353] You are running Marlin kernel with bf16 on GPUs before SM90. You can consider change to fp16 to achieve better performance if possible.
  132. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] WorkerProc hit an exception.
  133. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Traceback (most recent call last):
  134. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 649, in worker_busy_loop
  135. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] output = func(*args, **kwargs)
  136. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  137. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  138. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  139. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  140. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
  141. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] self.model_runner.profile_run()
  142. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2847, in profile_run
  143. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] = self._dummy_run(self.max_num_tokens, is_profile=True)
  144. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  145. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  146. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  147. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  148. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2624, in _dummy_run
  149. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] outputs = self.model(
  150. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^
  151. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  152. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  153. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  154. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  155. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  156. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  157. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 672, in forward
  158. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.model(input_ids, positions, intermediate_tensors,
  159. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  160. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/compilation/decorators.py", line 223, in __call__
  161. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward(*args, **kwargs)
  162. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  163. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 449, in forward
  164. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states, residual = layer(positions, hidden_states, residual)
  165. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  166. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  167. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  168. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  169. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  170. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  171. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  172. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 376, in forward
  173. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.mlp(hidden_states)
  174. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^
  175. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  176. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  177. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  178. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  179. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  180. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  181. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 190, in forward
  182. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.experts(
  183. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^
  184. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  185. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  186. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  187. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  188. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  189. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  190. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1615, in forward
  191. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] fused_output = torch.ops.vllm.moe_forward(
  192. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  193. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  194. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  195. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  196. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1881, in moe_forward
  197. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward_impl(hidden_states, router_logits)
  198. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  199. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1772, in forward_impl
  200. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.quant_method.apply(
  201. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^
  202. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 929, in apply
  203. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.ops.vllm.fused_marlin_moe(
  204. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  205. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  206. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  207. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  208. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py", line 204, in fused_marlin_moe
  209. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
  210. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  211. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] torch.AcceleratorError: CUDA error: an illegal memory access was encountered
  212. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  213. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  214. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  215. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  216. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Traceback (most recent call last):
  217. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 649, in worker_busy_loop
  218. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] output = func(*args, **kwargs)
  219. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  220. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  221. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  222. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  223. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
  224. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] self.model_runner.profile_run()
  225. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2847, in profile_run
  226. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] = self._dummy_run(self.max_num_tokens, is_profile=True)
  227. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  228. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  229. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  230. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  231. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2624, in _dummy_run
  232. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] outputs = self.model(
  233. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^
  234. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  235. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  236. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  237. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  238. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  239. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  240. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 672, in forward
  241. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.model(input_ids, positions, intermediate_tensors,
  242. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  243. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/compilation/decorators.py", line 223, in __call__
  244. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward(*args, **kwargs)
  245. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  246. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 449, in forward
  247. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states, residual = layer(positions, hidden_states, residual)
  248. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  249. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  250. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  251. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  252. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  253. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  254. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  255. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 376, in forward
  256. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.mlp(hidden_states)
  257. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^
  258. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  259. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  260. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  261. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  262. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  263. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  264. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 190, in forward
  265. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.experts(
  266. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^
  267. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  268. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  269. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  270. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  271. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  272. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  273. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1615, in forward
  274. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] fused_output = torch.ops.vllm.moe_forward(
  275. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  276. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  277. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  278. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  279. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1881, in moe_forward
  280. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward_impl(hidden_states, router_logits)
  281. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  282. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1772, in forward_impl
  283. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.quant_method.apply(
  284. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^
  285. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 929, in apply
  286. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.ops.vllm.fused_marlin_moe(
  287. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  288. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  289. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  290. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  291. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py", line 204, in fused_marlin_moe
  292. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
  293. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  294. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] torch.AcceleratorError: CUDA error: an illegal memory access was encountered
  295. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  296. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  297. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  298. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  299. (Worker_PP1 pid=512069) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  300. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] WorkerProc hit an exception.
  301. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Traceback (most recent call last):
  302. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 649, in worker_busy_loop
  303. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] output = func(*args, **kwargs)
  304. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  305. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  306. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  307. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  308. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
  309. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] self.model_runner.profile_run()
  310. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2847, in profile_run
  311. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] = self._dummy_run(self.max_num_tokens, is_profile=True)
  312. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  313. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  314. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  315. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  316. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2624, in _dummy_run
  317. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] outputs = self.model(
  318. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^
  319. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  320. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  321. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  322. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  323. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  324. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  325. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 672, in forward
  326. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.model(input_ids, positions, intermediate_tensors,
  327. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  328. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/compilation/decorators.py", line 223, in __call__
  329. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward(*args, **kwargs)
  330. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  331. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 449, in forward
  332. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states, residual = layer(positions, hidden_states, residual)
  333. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  334. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  335. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  336. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  337. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  338. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  339. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  340. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 376, in forward
  341. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.mlp(hidden_states)
  342. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^
  343. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  344. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  345. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  346. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  347. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  348. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  349. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 190, in forward
  350. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.experts(
  351. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^
  352. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  353. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  354. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  355. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  356. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  357. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  358. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1615, in forward
  359. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] fused_output = torch.ops.vllm.moe_forward(
  360. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  361. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  362. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  363. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  364. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1881, in moe_forward
  365. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward_impl(hidden_states, router_logits)
  366. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  367. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1772, in forward_impl
  368. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.quant_method.apply(
  369. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^
  370. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 929, in apply
  371. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.ops.vllm.fused_marlin_moe(
  372. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  373. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  374. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  375. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  376. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py", line 204, in fused_marlin_moe
  377. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
  378. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  379. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] torch.AcceleratorError: CUDA error: an illegal memory access was encountered
  380. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  381. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  382. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  383. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  384. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Traceback (most recent call last):
  385. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 649, in worker_busy_loop
  386. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] output = func(*args, **kwargs)
  387. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  388. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  389. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  390. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  391. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
  392. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] self.model_runner.profile_run()
  393. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2847, in profile_run
  394. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] = self._dummy_run(self.max_num_tokens, is_profile=True)
  395. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  396. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  397. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  398. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  399. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2624, in _dummy_run
  400. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] outputs = self.model(
  401. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^
  402. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  403. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  404. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  405. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  406. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  407. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  408. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 672, in forward
  409. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.model(input_ids, positions, intermediate_tensors,
  410. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  411. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/compilation/decorators.py", line 223, in __call__
  412. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward(*args, **kwargs)
  413. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  414. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 449, in forward
  415. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states, residual = layer(positions, hidden_states, residual)
  416. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  417. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  418. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  419. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  420. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  421. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  422. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  423. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 376, in forward
  424. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.mlp(hidden_states)
  425. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^
  426. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  427. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  428. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  429. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  430. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  431. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  432. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 190, in forward
  433. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.experts(
  434. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^
  435. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  436. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  437. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  438. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  439. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  440. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  441. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1615, in forward
  442. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] fused_output = torch.ops.vllm.moe_forward(
  443. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  444. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  445. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  446. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  447. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1881, in moe_forward
  448. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward_impl(hidden_states, router_logits)
  449. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  450. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1772, in forward_impl
  451. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.quant_method.apply(
  452. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^
  453. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 929, in apply
  454. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.ops.vllm.fused_marlin_moe(
  455. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  456. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  457. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  458. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  459. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py", line 204, in fused_marlin_moe
  460. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
  461. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  462. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] torch.AcceleratorError: CUDA error: an illegal memory access was encountered
  463. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  464. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  465. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  466. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  467. (Worker_PP3 pid=512071) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
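[Editor's note] The PP3 worker dies inside torch.ops.vllm.fused_marlin_moe during the KV-cache profiling dummy run. As the log itself warns, CUDA kernel errors are reported asynchronously, so the frame blamed above (the torch.sum over intermediate_cache3 in fused_marlin_moe.py) is not necessarily the kernel that faulted. A minimal sketch of that behaviour, using plain PyTorch (not vLLM code) and a deliberately out-of-range CUDA index:

# Minimal sketch (not vLLM code): why the blamed line in an asynchronous CUDA
# error can be misleading, and what CUDA_LAUNCH_BLOCKING=1 changes.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"   # set before CUDA is initialized; forces synchronous launches

import torch

x = torch.randn(4, device="cuda")
bad = torch.tensor([10], device="cuda")    # out-of-range index -> device-side assert in the indexing kernel
y = x[bad]                                 # with launch blocking, the error is raised at this line
print(y.sum().item())                      # without launch blocking, it may only surface at a later sync point like this

Relaunching the same vllm serve command with CUDA_LAUNCH_BLOCKING=1 in the environment (debug only, since it serialises every kernel launch) should move the illegal-memory-access report to the actual faulting kernel inside the Marlin MoE path.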
  468. (Worker_PP0 pid=512068) INFO 09-09 20:29:10 [gpu_worker.py:276] Available KV cache memory: 15.07 GiB
  469. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] WorkerProc hit an exception.
  470. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Traceback (most recent call last):
  471. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 649, in worker_busy_loop
  472. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] output = func(*args, **kwargs)
  473. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  474. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  475. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  476. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  477. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
  478. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] self.model_runner.profile_run()
  479. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2847, in profile_run
  480. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] = self._dummy_run(self.max_num_tokens, is_profile=True)
  481. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  482. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  483. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  484. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  485. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2624, in _dummy_run
  486. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] outputs = self.model(
  487. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^
  488. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  489. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  490. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  491. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  492. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  493. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  494. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 672, in forward
  495. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.model(input_ids, positions, intermediate_tensors,
  496. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  497. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/compilation/decorators.py", line 223, in __call__
  498. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward(*args, **kwargs)
  499. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  500. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 449, in forward
  501. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states, residual = layer(positions, hidden_states, residual)
  502. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  503. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  504. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  505. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  506. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  507. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  508. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  509. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 376, in forward
  510. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.mlp(hidden_states)
  511. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^
  512. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  513. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  514. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  515. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  516. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  517. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  518. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 190, in forward
  519. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.experts(
  520. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^
  521. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  522. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  523. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  524. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  525. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  526. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  527. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1615, in forward
  528. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] fused_output = torch.ops.vllm.moe_forward(
  529. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  530. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  531. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  532. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  533. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1881, in moe_forward
  534. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward_impl(hidden_states, router_logits)
  535. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  536. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1772, in forward_impl
  537. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.quant_method.apply(
  538. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^
  539. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 929, in apply
  540. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.ops.vllm.fused_marlin_moe(
  541. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  542. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  543. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  544. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  545. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py", line 204, in fused_marlin_moe
  546. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
  547. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  548. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] torch.AcceleratorError: CUDA error: an illegal memory access was encountered
  549. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  550. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  551. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  552. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  553. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Traceback (most recent call last):
  554. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 649, in worker_busy_loop
  555. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] output = func(*args, **kwargs)
  556. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  557. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  558. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  559. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  560. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
  561. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] self.model_runner.profile_run()
  562. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2847, in profile_run
  563. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] = self._dummy_run(self.max_num_tokens, is_profile=True)
  564. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  565. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
  566. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return func(*args, **kwargs)
  567. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^
  568. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/v1/worker/gpu_model_runner.py", line 2624, in _dummy_run
  569. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] outputs = self.model(
  570. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^
  571. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  572. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  573. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  574. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  575. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  576. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  577. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 672, in forward
  578. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.model(input_ids, positions, intermediate_tensors,
  579. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  580. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/compilation/decorators.py", line 223, in __call__
  581. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward(*args, **kwargs)
  582. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  583. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 449, in forward
  584. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states, residual = layer(positions, hidden_states, residual)
  585. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  586. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  587. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  588. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  589. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  590. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  591. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  592. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 376, in forward
  593. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] hidden_states = self.mlp(hidden_states)
  594. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^
  595. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  596. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  597. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  598. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  599. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  600. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  601. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/models/glm4_moe.py", line 190, in forward
  602. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.experts(
  603. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^
  604. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
  605. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._call_impl(*args, **kwargs)
  606. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  607. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
  608. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return forward_call(*args, **kwargs)
  609. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  610. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1615, in forward
  611. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] fused_output = torch.ops.vllm.moe_forward(
  612. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  613. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  614. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  615. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  616. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1881, in moe_forward
  617. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self.forward_impl(hidden_states, router_logits)
  618. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  619. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/layer.py", line 1772, in forward_impl
  620. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] final_hidden_states = self.quant_method.apply(
  621. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^
  622. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 929, in apply
  623. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.ops.vllm.fused_marlin_moe(
  624. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  625. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/torch/_ops.py", line 1243, in __call__
  626. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return self._op(*args, **kwargs)
  627. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^
  628. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] File "/home/ubuntuai/vllm_source/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py", line 204, in fused_marlin_moe
  629. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
  630. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  631. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] torch.AcceleratorError: CUDA error: an illegal memory access was encountered
  632. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  633. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  634. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  635. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  636. (Worker_PP2 pid=512070) ERROR 09-09 20:29:10 [multiproc_executor.py:654]
  637. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] EngineCore failed to start.
  638. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] Traceback (most recent call last):
  639. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 709, in run_engine_core
  640. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] engine_core = EngineCoreProc(*args, **kwargs)
  641. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  642. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 505, in __init__
  643. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] super().__init__(vllm_config, executor_class, log_stats,
  644. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 91, in __init__
  645. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] self._initialize_kv_caches(vllm_config)
  646. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 183, in _initialize_kv_caches
  647. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] self.model_executor.determine_available_memory())
  648. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  649. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/executor/abstract.py", line 84, in determine_available_memory
  650. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] return self.collective_rpc("determine_available_memory")
  651. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  652. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 257, in collective_rpc
  653. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] result = result.result()
  654. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ^^^^^^^^^^^^^^^
  655. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/usr/lib/python3.12/concurrent/futures/_base.py", line 456, in result
  656. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] return self.__get_result()
  657. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ^^^^^^^^^^^^^^^^^^^
  658. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/usr/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
  659. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] raise self._exception
  660. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/usr/lib/python3.12/concurrent/futures/thread.py", line 59, in run
  661. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] result = self.fn(*self.args, **self.kwargs)
  662. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  663. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 243, in get_response
  664. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] raise RuntimeError(
  665. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] RuntimeError: Worker failed with error 'CUDA error: an illegal memory access was encountered
  666. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  667. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  668. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  669. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:11 [core.py:718] ', please check the stack trace above for the root cause
  670. (EngineCore_DP0 pid=511959) ERROR 09-09 20:29:12 [multiproc_executor.py:149] Worker proc VllmWorker-1 died unexpectedly, shutting down executor.
  671. (EngineCore_DP0 pid=511959) Process EngineCore_DP0:
  672. (EngineCore_DP0 pid=511959) Traceback (most recent call last):
  673. (EngineCore_DP0 pid=511959) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
  674. (EngineCore_DP0 pid=511959) self.run()
  675. (EngineCore_DP0 pid=511959) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
  676. (EngineCore_DP0 pid=511959) self._target(*self._args, **self._kwargs)
  677. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 722, in run_engine_core
  678. (EngineCore_DP0 pid=511959) raise e
  679. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 709, in run_engine_core
  680. (EngineCore_DP0 pid=511959) engine_core = EngineCoreProc(*args, **kwargs)
  681. (EngineCore_DP0 pid=511959) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  682. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 505, in __init__
  683. (EngineCore_DP0 pid=511959) super().__init__(vllm_config, executor_class, log_stats,
  684. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 91, in __init__
  685. (EngineCore_DP0 pid=511959) self._initialize_kv_caches(vllm_config)
  686. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core.py", line 183, in _initialize_kv_caches
  687. (EngineCore_DP0 pid=511959) self.model_executor.determine_available_memory())
  688. (EngineCore_DP0 pid=511959) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  689. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/executor/abstract.py", line 84, in determine_available_memory
  690. (EngineCore_DP0 pid=511959) return self.collective_rpc("determine_available_memory")
  691. (EngineCore_DP0 pid=511959) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  692. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 257, in collective_rpc
  693. (EngineCore_DP0 pid=511959) result = result.result()
  694. (EngineCore_DP0 pid=511959) ^^^^^^^^^^^^^^^
  695. (EngineCore_DP0 pid=511959) File "/usr/lib/python3.12/concurrent/futures/_base.py", line 456, in result
  696. (EngineCore_DP0 pid=511959) return self.__get_result()
  697. (EngineCore_DP0 pid=511959) ^^^^^^^^^^^^^^^^^^^
  698. (EngineCore_DP0 pid=511959) File "/usr/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
  699. (EngineCore_DP0 pid=511959) raise self._exception
  700. (EngineCore_DP0 pid=511959) File "/usr/lib/python3.12/concurrent/futures/thread.py", line 59, in run
  701. (EngineCore_DP0 pid=511959) result = self.fn(*self.args, **self.kwargs)
  702. (EngineCore_DP0 pid=511959) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  703. (EngineCore_DP0 pid=511959) File "/home/ubuntuai/vllm_source/vllm/v1/executor/multiproc_executor.py", line 243, in get_response
  704. (EngineCore_DP0 pid=511959) raise RuntimeError(
  705. (EngineCore_DP0 pid=511959) RuntimeError: Worker failed with error 'CUDA error: an illegal memory access was encountered
  706. (EngineCore_DP0 pid=511959) CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
  707. (EngineCore_DP0 pid=511959) For debugging consider passing CUDA_LAUNCH_BLOCKING=1
  708. (EngineCore_DP0 pid=511959) Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
  709. (EngineCore_DP0 pid=511959) ', please check the stack trace above for the root cause
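[Editor's note] The EngineCore_DP0 traceback is not a second, independent failure: collective_rpc gathers futures from the workers, and Future.result() re-raises the worker's exception in the engine-core process, which is why the same CUDA error text reappears wrapped in a RuntimeError. A small self-contained sketch of that re-raise mechanism (a thread pool is used here purely for illustration, not as a claim about vLLM's executor internals):

# Minimal sketch (not vLLM code): Future.result() re-raises the worker-side
# exception in the caller, matching the concurrent/futures frames above.
from concurrent.futures import ThreadPoolExecutor

def worker():
    raise RuntimeError("CUDA error: an illegal memory access was encountered")

with ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(worker)
    try:
        fut.result()    # raises the worker's RuntimeError here, in the parent
    except RuntimeError as exc:
        print(f"re-raised in caller: {exc}")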
  710. (APIServer pid=511717) Traceback (most recent call last):
  711. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/.venv/bin/vllm", line 8, in <module>
  712. (APIServer pid=511717) sys.exit(main())
  713. (APIServer pid=511717) ^^^^^^
  714. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/entrypoints/cli/main.py", line 54, in main
  715. (APIServer pid=511717) args.dispatch_function(args)
  716. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/entrypoints/cli/serve.py", line 50, in cmd
  717. (APIServer pid=511717) uvloop.run(run_server(args))
  718. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/uvloop/__init__.py", line 109, in run
  719. (APIServer pid=511717) return __asyncio.run(
  720. (APIServer pid=511717) ^^^^^^^^^^^^^^
  721. (APIServer pid=511717) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
  722. (APIServer pid=511717) return runner.run(main)
  723. (APIServer pid=511717) ^^^^^^^^^^^^^^^^
  724. (APIServer pid=511717) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
  725. (APIServer pid=511717) return self._loop.run_until_complete(task)
  726. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  727. (APIServer pid=511717) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  728. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/.venv/lib/python3.12/site-packages/uvloop/__init__.py", line 61, in wrapper
  729. (APIServer pid=511717) return await main
  730. (APIServer pid=511717) ^^^^^^^^^^
  731. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/entrypoints/openai/api_server.py", line 1941, in run_server
  732. (APIServer pid=511717) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
  733. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/entrypoints/openai/api_server.py", line 1961, in run_server_worker
  734. (APIServer pid=511717) async with build_async_engine_client(
  735. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^
  736. (APIServer pid=511717) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
  737. (APIServer pid=511717) return await anext(self.gen)
  738. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^
  739. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/entrypoints/openai/api_server.py", line 179, in build_async_engine_client
  740. (APIServer pid=511717) async with build_async_engine_client_from_engine_args(
  741. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  742. (APIServer pid=511717) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
  743. (APIServer pid=511717) return await anext(self.gen)
  744. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^
  745. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/entrypoints/openai/api_server.py", line 221, in build_async_engine_client_from_engine_args
  746. (APIServer pid=511717) async_llm = AsyncLLM.from_vllm_config(
  747. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^
  748. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/utils/__init__.py", line 1587, in inner
  749. (APIServer pid=511717) return fn(*args, **kwargs)
  750. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^
  751. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/async_llm.py", line 205, in from_vllm_config
  752. (APIServer pid=511717) return cls(
  753. (APIServer pid=511717) ^^^^
  754. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/async_llm.py", line 129, in __init__
  755. (APIServer pid=511717) self.engine_core = EngineCoreClient.make_async_mp_client(
  756. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  757. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
  758. (APIServer pid=511717) return AsyncMPClient(*client_args)
  759. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  760. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core_client.py", line 767, in __init__
  761. (APIServer pid=511717) super().__init__(
  762. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/core_client.py", line 446, in __init__
  763. (APIServer pid=511717) with launch_core_engines(vllm_config, executor_class,
  764. (APIServer pid=511717) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  765. (APIServer pid=511717) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
  766. (APIServer pid=511717) next(self.gen)
  767. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/utils.py", line 729, in launch_core_engines
  768. (APIServer pid=511717) wait_for_engine_startup(
  769. (APIServer pid=511717) File "/home/ubuntuai/vllm_source/vllm/v1/engine/utils.py", line 782, in wait_for_engine_startup
  770. (APIServer pid=511717) raise RuntimeError("Engine core initialization failed. "
  771. (APIServer pid=511717) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
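[Editor's note] End of the failure chain: the API server's "Engine core initialization failed" is downstream of the worker-side illegal memory access hit in fused_marlin_moe during profile_run, so that kernel path is where to look. A hedged repro sketch that exercises the same engine initialisation without the HTTP server, convenient for iterating with CUDA_LAUNCH_BLOCKING=1 set; the checkpoint path is a placeholder and the offline 4-way pipeline-parallel setup is an assumption taken from the Worker_PP0..PP3 processes above, so adjust it to match the failing run:

# Hypothetical offline repro sketch (assumptions: same local checkpoint and the
# same 4-way pipeline-parallel layout as the failing serve run). Engine
# initialisation alone runs profile_run, so constructing the LLM should hit the
# same fused_marlin_moe failure before any request is issued.
from vllm import LLM

llm = LLM(
    model="/path/to/glm-4.5-air-fp8",   # placeholder: point at the same local checkpoint
    pipeline_parallel_size=4,
    enforce_eager=True,                 # keep eager while debugging, as in the failing run
)
print(llm.generate(["hello"])[0].outputs[0].text)   # only reached if init succeeds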