Configure your system for running vLLM on AMD Instinct GPUs.
The simplest approach is to use a pre-built Docker image.
# Latest stable release (recommended for most users)
docker pull vllm/vllm-openai-rocm:latest
# Or pin a specific ROCm + vLLM build tag for reproducible deployments
# (tag format: rocm<ROCM_VER>_vllm_<VLLM_VER>_<BUILD_DATE>)
docker pull rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210
docker pull rocm/vllm:rocm7.0.0_vllm_0.11.1_20251103
# Launch the vLLM OpenAI-compatible server on ROCm.
# NOTE: comments cannot be interleaved below — each backslash continuation
# must be followed immediately by a newline. Flag meanings:
#   --device /dev/kfd / /dev/dri      expose the GPU compute/render interfaces
#   --group-add=video                 GPU access permissions inside the container
#   --ipc=host                        share host shared memory (multi-GPU)
#   --security-opt seccomp=unconfined / --cap-add=SYS_PTRACE
#                                     required by ROCm tooling
#   -v ~/.cache/huggingface:...       reuse host model cache across runs
#   --env HF_TOKEN                    forward the HuggingFace token for gated models
# Replace MODEL_NAME with a HuggingFace model id (e.g. Qwen/Qwen3-0.6B).
docker run --rm \
--group-add=video \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai-rocm:latest \
--model MODEL_NAME
| Flag | Purpose |
|---|---|
| `--device /dev/kfd` | GPU compute interface |
| `--device /dev/dri` | GPU render interface |
| `--group-add=video` | GPU access permissions |
| `--ipc=host` | Shared memory for multi-GPU |
| `--security-opt seccomp=unconfined` | Required for ROCm |
| `-v ~/.cache/huggingface:/root/.cache/huggingface` | Cache model downloads |
# Enable AITER kernel optimizations (recommended)
export VLLM_ROCM_USE_AITER=1
# HuggingFace token for downloading gated models
export HF_TOKEN=your_token_here
# For DeepSeek V3.2 (prevents crashes)
export AITER_ENABLE_VSKIP=0
# For Vision-Language models (disables the Triton flash-attention path)
export VLLM_USE_TRITON_FLASH_ATTN=0
# Increase RCCL channels — note RCCL reads NCCL_-prefixed variables
export NCCL_MIN_NCHANNELS=112
# Schedule RCCL communication streams at high priority
export TORCH_NCCL_HIGH_PRIORITY=1
# Kernel argument optimization (HIP launch-latency tuning)
export HIP_FORCE_DEV_KERNARG=1
# Prefer hipBLASLt over rocBLAS for GEMM operations
export TORCH_BLAS_PREFER_HIPBLASLT=1
# Check current NUMA auto-balancing status (1 = enabled, 0 = disabled);
# disabling it is a common tuning step for GPU workloads — see vendor guides
cat /proc/sys/kernel/numa_balancing
# Disable if enabled (i.e. the command above returned 1)
sudo sysctl kernel.numa_balancing=0
# Make the setting persistent across reboots via sysctl.d
echo "kernel.numa_balancing=0" | sudo tee /etc/sysctl.d/99-numa.conf
# Add the current user to the video and render groups so the GPU device
# nodes (/dev/kfd, /dev/dri) are accessible without root.
# "$USER" is quoted so the command survives unusual usernames (SC2086).
sudo usermod -aG video,render "$USER"
# Log out and back in for the new group membership to take effect
# Verify ROCm sees all GPUs
rocm-smi --showproductname
# Check available VRAM per GPU
rocm-smi --showmeminfo vram
# Smoke-test the setup with a small model before deploying a large one.
# NOTE: no comments between the lines below — backslash continuations
# must be followed immediately by a newline.
docker run --rm \
--device /dev/kfd \
--device /dev/dri \
--group-add=video \
--ipc=host \
-p 8000:8000 \
vllm/vllm-openai-rocm:latest \
--model Qwen/Qwen3-0.6B
# Test the endpoint — should list the served model
curl http://localhost:8000/v1/models
For production deployments:
# docker-compose definition for a production vLLM server on ROCm.
# NOTE: nesting restored — the original snippet had lost its YAML indentation
# and was not parseable as written.
version: '3.8'
services:
  vllm:
    image: vllm/vllm-openai-rocm:latest
    ipc: host                      # shared host memory, needed for multi-GPU
    devices:
      - /dev/kfd                   # GPU compute interface
      - /dev/dri                   # GPU render interface
    group_add:
      - video                      # GPU access permissions
    security_opt:
      - seccomp:unconfined         # required for ROCm
    cap_add:
      - SYS_PTRACE                 # required by ROCm tooling
    ports:
      - "8000:8000"                # OpenAI-compatible HTTP API
    environment:
      - HF_TOKEN=${HF_TOKEN}       # forwarded from the host environment
      - VLLM_ROCM_USE_AITER=1      # enable AITER optimizations
    volumes:
      - ~/.cache/huggingface:/root/.cache/huggingface   # cache model downloads
    command: >
      --model deepseek-ai/DeepSeek-V3.2
      --tensor-parallel-size 8
      --quantization fp8
# Check device permissions on the GPU device nodes
ls -la /dev/kfd /dev/dri
# Add the current user to the GPU groups ("$USER" quoted — SC2086)
sudo usermod -aG video,render "$USER"
# Verify NUMA balancing is disabled (should print 0)
cat /proc/sys/kernel/numa_balancing
# Check RCCL communication. RCCL reads NCCL_-prefixed environment
# variables (as with NCCL_MIN_NCHANNELS above), so the debug switch is
# NCCL_DEBUG — not RCCL_DEBUG.
NCCL_DEBUG=INFO python -c "import torch.distributed"
# If the server runs out of VRAM, reduce the fraction of GPU memory
# vLLM pre-allocates for weights + KV cache
--gpu-memory-utilization 0.85
# Or reduce weight memory with FP8 quantization
--quantization fp8