For newer model versions, see this article: Nvidia DGX Spark 集群分别使用vLLM和SGLang部署 Qwen3.5-35B-A3B 技术方案 (CSDN blog).
For older models, you can follow the official guide: vLLM for Inference | DGX Spark | NVIDIA Developer.
Inference throughput tests
vllm
Tested on a DGX Spark (128 GB): running Qwen/Qwen3.5-35B-A3B-FP8 with vLLM yields roughly 50 tokens/s and uses about 105 GB of memory. At this speed OpenClaw is barely usable.
Tested on a DGX Spark (128 GB): running Qwen/Qwen3.5-122B-A10B-GPTQ-Int4 with vLLM yields roughly 25 tokens/s and uses about 118 GB of memory. OpenClaw is essentially unusable at this speed; it is too slow.
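These rates can be sanity-checked with a plain request against vLLM's OpenAI-compatible endpoint on port 8000. A minimal sketch, assuming GNU date and jq are available; the prompt and max_tokens here are arbitrary, not the exact benchmark methodology used above:

# Time one completion and derive tokens/s from the reported usage stats
START=$(date +%s.%N)
RESP=$(curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen3.5-35B-A3B-FP8",
       "messages": [{"role": "user", "content": "Write a 500-word story."}],
       "max_tokens": 1024}')
END=$(date +%s.%N)
TOKENS=$(echo "$RESP" | jq '.usage.completion_tokens')
echo "scale=1; $TOKENS / ($END - $START)" | bc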
sglang
Tested on a DGX Spark (128 GB): running Qwen/Qwen3.5-35B-A3B-FP8 with SGLang yields roughly 75 tokens/s and uses about 105 GB of memory.
Tested on a DGX Spark (128 GB): running Qwen/Qwen3.5-122B-A10B-GPTQ-Int4 with SGLang yields roughly 25 tokens/s and uses about 118 GB of memory.
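SGLang serves the same OpenAI-compatible API on port 30000, so the measurement sketch above works there unchanged; only the port differs, and the model field may need to match the served model name, e.g.:

curl -s http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen3.5-35B-A3B-FP8",
       "messages": [{"role": "user", "content": "hello"}],
       "max_tokens": 64}'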
Pull the image and download the model:
docker run --gpus all --rm -v ./volumes/iios-vllm/models:/root/.cache/huggingface -e HF_ENDPOINT=https://hf-mirror.com -e HF_HUB_ENABLE_HF_TRANSFER=1 lmsysorg/sglang:dev-arm64-cu13 huggingface-cli download Qwen/Qwen3.5-122B-A10B-GPTQ-Int4
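The SGLang config further down points --model-path at a specific snapshot directory inside this cache volume. After the download completes, the snapshot hash can be found by listing the cache (the hash will differ per model revision):

ls ./volumes/iios-vllm/models/hub/models--Qwen--Qwen3.5-122B-A10B-GPTQ-Int4/snapshots/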
Working deployment configurations:
vllm
version: '3.5'
services:
  iios-vllm:
    container_name: iios-vllm
    hostname: iios-vllm
    image: vllm/vllm-openai:cu130-nightly-aarch64
    ports:
      - "8000:8000"
    volumes:
      - ./volumes/iios-vllm/models:/root/.cache/huggingface:rw
    environment:
      HF_ENDPOINT: https://hf-mirror.com
      HF_HUB_ENABLE_HF_TRANSFER: 1
      SAFETENSORS_FAST_GPU: 1
      CUDA_DEVICE_MAX_CONNECTIONS: 1
    # Alternative command: download the model instead of serving it
    #command:
    #  - /usr/local/bin/huggingface-cli
    #  - download
    #  - Qwen/Qwen3.5-35B-A3B-FP8
    command:
      - --model=Qwen/Qwen3.5-35B-A3B-FP8
      - --max-model-len=32768
      - --gpu-memory-utilization=0.85
      - --max-num-batched-tokens=4096
      - --kv-cache-dtype=fp8
      - --enable-prefix-caching
      - --enable-auto-tool-choice
      - --tool-call-parser=qwen3_xml
      - --reasoning-parser=qwen3
      - --trust-remote-code
      - --enforce-eager
      # - --disable-log-requests
    restart: always
    shm_size: 16g
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    logging:
      options:
        max-size: "10240k"
        max-file: "10"
    networks:
      extnetwork:
        ipv4_address: 172.20.0.12
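Both services join extnetwork with static IPv4 addresses, which requires a matching top-level network definition in the same compose file. A minimal sketch, assuming a 172.20.0.0/16 subnet (adjust to your environment):

networks:
  extnetwork:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16
          gateway: 172.20.0.1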
sglang
version: '3.5'
services:
  iios-sglang:
    container_name: iios-sglang
    hostname: iios-sglang
    image: lmsysorg/sglang:dev-arm64-cu13
    ports:
      - "30000:30000"
    volumes:
      - ./volumes/iios-sglang/models:/root/.cache/huggingface:rw
    environment:
      HF_ENDPOINT: https://hf-mirror.com
      HF_HUB_ENABLE_HF_TRANSFER: 1
    # Alternative command: download the model instead of serving it
    #command:
    #  - /usr/local/bin/huggingface-cli
    #  - download
    #  - Qwen/Qwen3.5-35B-A3B-FP8
    # Host-side snapshot path of the 122B model, for reference:
    # /opt/iios-model-docker/volumes/iios-sglang/models/hub/models--Qwen--Qwen3.5-122B-A10B-GPTQ-Int4/snapshots/5b9f0050d3ec98b0c81a7716776533c5eacebb64
    command: >
      /usr/bin/python3 -m sglang.launch_server
      --model-path /root/.cache/huggingface/hub/models--Qwen--Qwen3.5-35B-A3B-FP8/snapshots/0b2752837483aa34b3db6e83e151b150c0e00e49
      --host 0.0.0.0
      --port 30000
      --tp 1
      --attention-backend triton
      --mem-fraction-static 0.80
      --context-length 32768
    restart: "no"
    shm_size: 16g
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    logging:
      options:
        max-size: "10240k"
        max-file: "10"
    networks:
      extnetwork:
        ipv4_address: 172.20.0.11
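To bring the service up and confirm it is ready (a sketch using standard SGLang endpoints; adjust the compose file path to yours):

docker compose up -d iios-sglang
docker compose logs -f iios-sglang             # wait until the weights finish loading
curl -s http://localhost:30000/health          # returns 200 once the server is ready
curl -s http://localhost:30000/get_model_info  # shows the loaded model path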
