
vLLM crash in Docker - gpt-oss-20b on RTX 5090

a guest - Aug 12th, 2025
INFO 08-12 14:36:59 [__init__.py:241] Automatically detected platform cuda.
(APIServer pid=1) INFO 08-12 14:37:01 [api_server.py:1787] vLLM API server version 0.10.2.dev2+gf5635d62e.d20250807
(APIServer pid=1) INFO 08-12 14:37:01 [utils.py:326] non-default args: {'model': 'openai/gpt-oss-20b', 'async_scheduling': True}
(APIServer pid=1) INFO 08-12 14:37:09 [config.py:726] Resolved architecture: GptOssForCausalLM
(APIServer pid=1) INFO 08-12 14:37:10 [config.py:1759] Using max model len 131072
(APIServer pid=1) WARNING 08-12 14:37:12 [config.py:1198] mxfp4 quantization is not fully optimized yet. The speed can be slower than non-quantized models.
(APIServer pid=1) INFO 08-12 14:37:13 [arg_utils.py:1188] Using mp-based distributed executor backend for async scheduling.
(APIServer pid=1) INFO 08-12 14:37:13 [config.py:2588] Chunked prefill is enabled with max_num_batched_tokens=2048.
(APIServer pid=1) INFO 08-12 14:37:13 [config.py:244] Overriding cuda graph sizes to [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024]
INFO 08-12 14:37:18 [__init__.py:241] Automatically detected platform cuda.
(EngineCore_0 pid=114) INFO 08-12 14:37:21 [core.py:654] Waiting for init message from front-end.
(EngineCore_0 pid=114) INFO 08-12 14:37:21 [core.py:73] Initializing a V1 LLM engine (v0.10.2.dev2+gf5635d62e.d20250807) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend='openai'), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=openai/gpt-oss-20b, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[1024,1008,992,976,960,944,928,912,896,880,864,848,832,816,800,784,768,752,736,720,704,688,672,656,640,624,608,592,576,560,544,528,512,496,480,464,448,432,416,400,384,368,352,336,320,304,288,272,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":1024,"local_cache_dir":null}
(EngineCore_0 pid=114) [vLLM ASCII-art banner]
(EngineCore_0 pid=114) WARNING 08-12 14:37:21 [multiproc_worker_utils.py:273] Reducing Torch parallelism from 12 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
(EngineCore_0 pid=114) INFO 08-12 14:37:21 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 16777216, 10, 'psm_06e2e7a6'), local_subscribe_addr='ipc:///tmp/5e7288cc-bd69-4452-95a9-13abc615ea70', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 08-12 14:37:24 [__init__.py:241] Automatically detected platform cuda.
(VllmWorker pid=168) INFO 08-12 14:37:27 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_1ccdd475'), local_subscribe_addr='ipc:///tmp/9c1253c1-33f9-4b39-8f67-f99e7dbc5f86', remote_subscribe_addr=None, remote_addr_ipv6=False)
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(VllmWorker pid=168) INFO 08-12 14:37:28 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
(VllmWorker pid=168) INFO 08-12 14:37:28 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
(VllmWorker pid=168) INFO 08-12 14:37:28 [gpu_model_runner.py:1913] Starting to load model openai/gpt-oss-20b...
(VllmWorker pid=168) INFO 08-12 14:37:28 [gpu_model_runner.py:1945] Loading model from scratch...
(VllmWorker pid=168) INFO 08-12 14:37:28 [cuda.py:286] Using Triton backend on V1 engine.
(VllmWorker pid=168) WARNING 08-12 14:37:28 [rocm.py:29] Failed to import from amdsmi with ModuleNotFoundError("No module named 'amdsmi'")
(VllmWorker pid=168) WARNING 08-12 14:37:28 [rocm.py:40] Failed to import from vllm._rocm_C with ModuleNotFoundError("No module named 'vllm._rocm_C'")
(VllmWorker pid=168) INFO 08-12 14:37:28 [triton_attn.py:263] Using vllm unified attention for TritonAttentionImpl
(VllmWorker pid=168) INFO 08-12 14:37:29 [weight_utils.py:296] Using model weights format ['*.safetensors']
(VllmWorker pid=168) INFO 08-12 14:37:32 [default_loader.py:262] Loading weights took 3.18 seconds
(VllmWorker pid=168) INFO 08-12 14:37:32 [mxfp4.py:176] Shuffling MoE weights, it might take a while...
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] WorkerProc failed to start.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] Traceback (most recent call last):
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 533, in worker_main
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     worker = WorkerProc(*args, **kwargs)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 402, in __init__
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     self.worker.load_model()
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 211, in load_model
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     self.model_runner.load_model(eep_scale_up=eep_scale_up)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1946, in load_model
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     self.model = model_loader.load_model(
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]                  ^^^^^^^^^^^^^^^^^^^^^^^^
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 50, in load_model
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     process_weights_after_loading(model, model_config, target_device)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 112, in process_weights_after_loading
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     quant_method.process_weights_after_loading(module)
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 260, in process_weights_after_loading
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1),
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]     ^^^^^^^^^^^^^^^^^^^
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] torch.AcceleratorError: CUDA error: no kernel image is available for execution on the device
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(VllmWorker pid=168) ERROR 08-12 14:37:42 [multiproc_executor.py:559]
(VllmWorker pid=168) INFO 08-12 14:37:42 [multiproc_executor.py:520] Parent process exited, terminating worker
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718] EngineCore failed to start.
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718] Traceback (most recent call last):
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 510, in __init__
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 82, in __init__
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     self.model_executor = executor_class(vllm_config)
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     self._init_executor()
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in _init_executor
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]   File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 472, in wait_for_ready
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718]     raise e from None
(EngineCore_0 pid=114) ERROR 08-12 14:37:44 [core.py:718] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
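
Note: the worker dies in the mxfp4 MoE weight-shuffling step (shuffle_matrix_a) with the generic cudaErrorNoKernelImageForDevice, which usually means the CUDA kernels available in the container were not built for this GPU's compute capability (the RTX 5090 is Blackwell, sm_120). A minimal diagnostic sketch, assuming it is run with the same Python environment inside the container; it only checks what the installed PyTorch build supports, so even a matching arch list does not guarantee that vLLM's mxfp4/Triton kernels cover sm_120:

    # diagnostic sketch: run inside the vLLM container
    import torch

    print("device:", torch.cuda.get_device_name(0))
    print("compute capability:", torch.cuda.get_device_capability(0))  # RTX 5090 (Blackwell) reports (12, 0)
    print("torch CUDA version:", torch.version.cuda)
    print("compiled arch list:", torch.cuda.get_arch_list())  # look for an sm_120 / compute_120 entry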