
vLLM crash in Docker - gpt-oss-20b on RTX 5090

a guest - Aug 12th, 2025
INFO 08-12 14:36:59 [__init__.py:241] Automatically detected platform cuda.
(APIServer pid=1) INFO 08-12 14:37:01 [api_server.py:1787] vLLM API server version 0.10.2.dev2+gf5635d62e.d20250807
(APIServer pid=1) INFO 08-12 14:37:01 [utils.py:326] non-default args: {'model': 'openai/gpt-oss-20b', 'async_scheduling': True}
(APIServer pid=1) INFO 08-12 14:37:09 [config.py:726] Resolved architecture: GptOssForCausalLM
(APIServer pid=1) INFO 08-12 14:37:10 [config.py:1759] Using max model len 131072
(APIServer pid=1) WARNING 08-12 14:37:12 [config.py:1198] mxfp4 quantization is not fully optimized yet. The speed can be slower than non-quantized models.
(APIServer pid=1) INFO 08-12 14:37:13 [arg_utils.py:1188] Using mp-based distributed executor backend for async scheduling.
(APIServer pid=1) INFO 08-12 14:37:13 [config.py:2588] Chunked prefill is enabled with max_num_batched_tokens=2048.
(APIServer pid=1) INFO 08-12 14:37:13 [config.py:244] Overriding cuda graph sizes to [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024]
INFO 08-12 14:37:18 [__init__.py:241] Automatically detected platform cuda.
(EngineCore_0 pid=114) INFO 08-12 14:37:21 [core.py:654] Waiting for init message from front-end.
(EngineCore_0 pid=114) INFO 08-12 14:37:21 [core.py:73] Initializing a V1 LLM engine (v0.10.2.dev2+gf5635d62e.d20250807) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend='openai'), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=openai/gpt-oss-20b, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[1024,1008,992,976,960,944,928,912,896,880,864,848,832,816,800,784,768,752,736,720,704,688,672,656,640,624,608,592,576,560,544,528,512,496,480,464,448,432,416,400,384,368,352,336,320,304,288,272,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":1024,"local_cache_dir":null}
(EngineCore_0 pid=114) [vLLM ASCII-art banner]
(EngineCore_0 pid=114) WARNING 08-12 14:37:21 [multiproc_worker_utils.py:273] Reducing Torch parallelism from 12 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
(EngineCore_0 pid=114) INFO 08-12 14:37:21 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 16777216, 10, 'psm_06e2e7a6'), local_subscribe_addr='ipc:///tmp/5e7288cc-bd69-4452-95a9-13abc615ea70', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 08-12 14:37:24 [__init__.py:241] Automatically detected platform cuda.
(VllmWorker pid=168) INFO 08-12 14:37:27 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_1ccdd475'), local_subscribe_addr='ipc:///tmp/9c1253c1-33f9-4b39-8f67-f99e7dbc5f86', remote_subscribe_addr=None, remote_addr_ipv6=False)
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(VllmWorker pid=168) INFO 08-12 14:37:28 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
(VllmWorker pid=168) INFO 08-12 14:37:28 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
(VllmWorker pid=168) INFO 08-12 14:37:28 [gpu_model_runner.py:1913] Starting to load model openai/gpt-oss-20b...
(VllmWorker pid=168) INFO 08-12 14:37:28 [gpu_model_runner.py:1945] Loading model from scratch...
(VllmWorker pid=168) INFO 08-12 14:37:28 [cuda.py:286] Using Triton backend on V1 engine.
(VllmWorker pid=168) WARNING 08-12 14:37:28 [rocm.py:29] Failed to import from amdsmi with ModuleNotFoundError("No module named 'amdsmi'")
(VllmWorker pid=168) WARNING 08-12 14:37:28 [rocm.py:40] Failed to import from vllm._rocm_C with ModuleNotFoundError("No module named 'vllm._rocm_C'")
(VllmWorker pid=168) INFO 08-12 14:37:28 [triton_attn.py:263] Using vllm unified attention for TritonAttentionImpl
(VllmWorker pid=168) INFO 08-12 14:37:29 [weight_utils.py:296] Using model weights format ['*.safetensors']
(VllmWorker pid=168) INFO 08-12 14:37:32 [default_loader.py:262] Loading weights took 3.18 seconds
(VllmWorker pid=168) INFO 08-12 14:37:32 [mxfp4.py:176] Shuffling MoE weights, it might take a while...
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] WorkerProc failed to start.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] Traceback (most recent call last):
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 533, in worker_main
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     worker = WorkerProc(*args, **kwargs)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 402, in __init__
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     self.worker.load_model()
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 211, in load_model
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     self.model_runner.load_model(eep_scale_up=eep_scale_up)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1946, in load_model
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     self.model = model_loader.load_model(
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]                  ^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 50, in load_model
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     process_weights_after_loading(model, model_config, target_device)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 112, in process_weights_after_loading
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     quant_method.process_weights_after_loading(module)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 260, in process_weights_after_loading
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1),
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     ^^^^^^^^^^^^^^^^^^^
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] torch.AcceleratorError: CUDA error: no kernel image is available for execution on the device
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]
(VllmWorker pid=168) INFO 08-12 14:37:42 [multiproc_executor.py:520] Parent process exited, terminating worker
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718] EngineCore failed to start.
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718] Traceback (most recent call last):
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 510, in __init__
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 82, in __init__
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     self.model_executor = executor_class(vllm_config)
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     self._init_executor()
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in _init_executor
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 472, in wait_for_ready
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     raise e from None
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
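
Note: the worker dies in the mxfp4 MoE weight-shuffling step (shuffle_matrix_a) with the generic cudaErrorNoKernelImageForDevice, which usually means the CUDA kernels available in the container were not built for this GPU's compute capability (the RTX 5090 is Blackwell, sm_120). A minimal diagnostic sketch, assuming it is run with the same Python environment inside the container; it only checks what the installed PyTorch build supports, so even a matching arch list does not guarantee that vLLM's mxfp4/Triton kernels cover sm_120:

    # diagnostic sketch: run inside the vLLM container
    import torch

    print("device:", torch.cuda.get_device_name(0))
    print("compute capability:", torch.cuda.get_device_capability(0))  # RTX 5090 (Blackwell) reports (12, 0)
    print("torch CUDA version:", torch.version.cuda)
    print("compiled arch list:", torch.cuda.get_arch_list())  # look for an sm_120 / compute_120 entry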