在 Metax C500（8 卡）上部署 Qwen3.6-35B-A3B 模型，容器启动命令如下：
docker run -itd \
--name qwen3.6 \
--network host \
--shm-size 512G \
--device=/dev/dri \
--device=/dev/mxcd \
--group-add video \
--security-opt seccomp=unconfined \
--security-opt apparmor=unconfined \
--shm-size 100gb \
--ulimit memlock=-1 \
-v /home/modelscope:/root/vllm \
-e TZ=Asia/Shanghai \
-p 8000:8000 \
-p 8001:8001 \
-p 8002:8002 \
cr.metax-tech.com/public-ai-release/maca/vllm-metax:0.19.0-maca.ai3.5.3.502-torch2.8-py312-kylinv11-amd64
vllm启动命令如下:
vllm serve /root/vllm/Qwen/Qwen3.6-35B-A3B/ -tp 8 \
--host 0.0.0.0 \
--port 8000 \
--served-model-name qwen3.6 \
--dtype bfloat16 \
--trust-remote-code \
--tensor-parallel-size 8 \
--distributed-executor-backend mp \
--gpu-memory-utilization 0.8 \
--max-model-len 32768 \
--max-num-batched-tokens 524288 \
--kv-cache-dtype fp8_e4m3
报错信息如下:
(EngineCore pid=157812) ERROR 05-20 16:37:27 [core.py:1108] RuntimeError: Worker failed with error 'CUDA out of memory. Tried to allocate 32.00 GiB. GPU 0 has a total capacity of 63.59 GiB of which 22.74
GiB is free. Of the allocated memory 35.24 GiB is allocated by PyTorch, and 442.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_
CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (pytorch.org/docs/stable/notes/cuda.html#environment-variables)', please check the stack trace abov
e for the root cause
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] return self._call_impl(*args, **kwargs)
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] return forward_call(*args, **kwargs)
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "<eval_with_key>.82", line 258, in forward
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] submod_2 = self.submod_2(getitem_3, s59, getitem_4, l_self_modules_layers_modules_0_modules_linear_attn_modules_norm_parameter
s_weight_, getitem_5, l_self_modules_layers_modules_0_modules_linear_attn_modules_out_proj_parameters_weight_, getitem_6, s18, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_
weight_, l_inputs_embeds_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_linear_attn_modules_in_proj_qkvz_parameters_weight_, l_self_
modules_layers_modules_1_modules_linear_attn_modules_in_proj_ba_parameters_weight_); getitem_3 = getitem_4 = l_self_modules_layers_modules_0_modules_linear_attn_modules_norm_parameters_weight_ = getitem
_5 = l_self_modules_layers_modules_0_modules_linear_attn_modules_out_proj_parameters_weight_ = getitem_6 = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = l_inputs_e
mbeds_ = l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_linear_attn_modules_in_proj_qkvz_parameters_weight_ = l_self_modules_layers_m
odules_1_modules_linear_attn_modules_in_proj_ba_parameters_weight_ = None
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/vllm/compilation/cuda_graph.py", line 254, in __call__
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] return self.runnable(*args, **kwargs)
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/vllm/compilation/piecewise_backend.py", line 367, in __call__
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] return range_entry.runnable(*args)
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/torch/_inductor/standalone_compile.py", line 62, in __call__
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] return self._compiled_fn(*args)
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/torch/_dynamo/eval_frame.py", line 929, in _fn
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] return fn(*args, **kwargs)
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] ^^^^^^^^^^^^^^^^^^^
(Worker_TP5 pid=158165) ERROR 05-20 16:37:27 [multiproc_executor.py:949] File "/opt/conda/lib/python3.12/site-packages/torch/_functorch/aot_autograd.py", line 1241, in forward
(Worker_TP5 pid=158165) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP4 pid=158164) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP0 pid=158160) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP2 pid=158162) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP6 pid=158166) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP1 pid=158161) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP7 pid=158167) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(Worker_TP3 pid=158163) WARNING 05-20 16:37:27 [multiproc_executor.py:871] WorkerProc was terminated
(EngineCore pid=157812) ERROR 05-20 16:37:38 [multiproc_executor.py:273] Worker proc VllmWorker-4 died unexpectedly, shutting down executor.
(EngineCore pid=157812) Process EngineCore:
(EngineCore pid=157812) Traceback (most recent call last):
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore pid=157812) self.run()
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore pid=157812) self._target(*self._args, **self._kwargs)
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core
(EngineCore pid=157812) raise e
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core
(EngineCore pid=157812) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore pid=157812) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=157812) return func(*args, **kwargs)
(EngineCore pid=157812) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 848, in __init__
(EngineCore pid=157812) super().__init__(
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 124, in __init__
(EngineCore pid=157812) kv_cache_config = self._initialize_kv_caches(vllm_config)
(EngineCore pid=157812) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(EngineCore pid=157812) return func(*args, **kwargs)
(EngineCore pid=157812) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 247, in _initialize_kv_caches
(EngineCore pid=157812) available_gpu_memory = self.model_executor.determine_available_memory()
(EngineCore pid=157812) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/executor/abstract.py", line 136, in determine_available_memory
(EngineCore pid=157812) return self.collective_rpc("determine_available_memory")
(EngineCore pid=157812) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/executor/multiproc_executor.py", line 397, in collective_rpc
(EngineCore pid=157812) return aggregate(get_response())
(EngineCore pid=157812) ^^^^^^^^^^^^^^
(EngineCore pid=157812) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/executor/multiproc_executor.py", line 380, in get_response
(EngineCore pid=157812) raise RuntimeError(
(EngineCore pid=157812) RuntimeError: Worker failed with error 'CUDA out of memory. Tried to allocate 32.00 GiB. GPU 0 has a total capacity of 63.59 GiB of which 22.74 GiB is free. Of the allocated memor
y 35.24 GiB is allocated by PyTorch, and 442.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avo
id fragmentation. See documentation for Memory Management (pytorch.org/docs/stable/notes/cuda.html#environment-variables)', please check the stack trace above for the root cause
(APIServer pid=157458) Traceback (most recent call last):
(APIServer pid=157458) File "/opt/conda/bin/vllm", line 8, in <module>
(APIServer pid=157458) sys.exit(main())
(APIServer pid=157458) ^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/entrypoints/cli/main.py", line 75, in main
(APIServer pid=157458) args.dispatch_function(args)
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd
(APIServer pid=157458) uvloop.run(run_server(args))
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/uvloop/__init__.py", line 96, in run
(APIServer pid=157458) return asyncio.run(
(APIServer pid=157458) ^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=157458) return runner.run(main)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=157458) return self._loop.run_until_complete(task)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=157458) return await main
(APIServer pid=157458) ^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server
(APIServer pid=157458) await run_server_worker(listen_address, sock, args, uvicorn_kwargs)
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker
(APIServer pid=157458) async with build_async_engine_client(
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=157458) return await anext(self.gen)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client
(APIServer pid=157458) async with build_async_engine_client_from_engine_args(
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=157458) return await anext(self.gen)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args
(APIServer pid=157458) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config
(APIServer pid=157458) return cls(
(APIServer pid=157458) ^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 154, in __init__
(APIServer pid=157458) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(APIServer pid=157458) return func(*args, **kwargs)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client
(APIServer pid=157458) return AsyncMPClient(*client_args)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/tracing/otel.py", line 178, in sync_wrapper
(APIServer pid=157458) return func(*args, **kwargs)
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 887, in __init__
(APIServer pid=157458) super().__init__(
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 535, in __init__
(APIServer pid=157458) with launch_core_engines(
(APIServer pid=157458) ^^^^^^^^^^^^^^^^^^^^
(APIServer pid=157458) File "/opt/conda/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=157458) next(self.gen)
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines
(APIServer pid=157458) wait_for_engine_startup(
(APIServer pid=157458) File "/opt/conda/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup
(APIServer pid=157458) raise RuntimeError(
(APIServer pid=157458) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
/opt/conda/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 8 leaked shared_memory objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '