Deploy vLLM with health checks, monitoring, and resilience on AMD Instinct GPUs.
Production-ready configuration with health checks and restart policies:
# NOTE: the top-level `version` key is obsolete in Compose v2 (ignored with a
# warning); kept for compatibility with older docker-compose releases.
version: '3.8'

services:
  vllm:
    image: vllm/vllm-openai-rocm:latest
    container_name: vllm-server
    # Host IPC namespace: needed for multi-GPU shared-memory communication
    ipc: host
    devices:
      - /dev/kfd  # AMD compute driver interface
      - /dev/dri  # GPU render devices
    group_add:
      - video
    security_opt:
      - seccomp:unconfined
    cap_add:
      - SYS_PTRACE
    ports:
      - "8000:8000"
    environment:
      - HF_TOKEN=${HF_TOKEN}
      - VLLM_ROCM_USE_AITER=1
      - AITER_ENABLE_VSKIP=0
    volumes:
      # Persist downloaded model weights across container restarts
      - ~/.cache/huggingface:/root/.cache/huggingface
    command: >
      --model deepseek-ai/DeepSeek-V3.2
      --tensor-parallel-size 8
      --quantization fp8
      --block-size 1
      --host 0.0.0.0
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Don't count failures during model load (~5-10 min for large models)
      start_period: 600s
| Setting | Value | Purpose |
|---|---|---|
| `start_period` | 600s | Allow time for large model loading (~5-10 min for 685B) |
| `restart` | unless-stopped | Auto-restart on crash, respect manual stops |
| `healthcheck.test` | `/health` | vLLM's built-in liveness endpoint |
vLLM exposes these endpoints for health monitoring:

| Endpoint | Purpose | Use For |
|---|---|---|
| `/health` | Basic liveness check | Liveness probe, load balancer health |
| `/v1/models` | Lists loaded models | Readiness probe (confirms model loaded) |
# Basic liveness check — returns 200 once the API server is up
curl http://localhost:8000/health
# Readiness check — verify the model is loaded and listed
curl http://localhost:8000/v1/models
The `/health` endpoint only becomes available after the API server starts (~15-30 seconds after the container spawns). For large models, full readiness (model loaded) can take 5-10 minutes.
vLLM exposes Prometheus-compatible metrics at `/metrics`.

| Metric | Type | Description |
|---|---|---|
| `vllm:num_requests_running` | Gauge | Requests currently being processed |
| `vllm:num_requests_waiting` | Gauge | Requests queued for processing |
| `vllm:gpu_cache_usage_perc` | Gauge | GPU KV cache utilization (0-1) |
| `vllm:e2e_request_latency_seconds` | Histogram | End-to-end request latency |
| `vllm:request_prompt_tokens` | Histogram | Input token counts |
| `vllm:request_generation_tokens` | Histogram | Output token counts |
Add to prometheus.yml:
scrape_configs:
  - job_name: 'vllm'
    # Per-job interval; overrides the global default
    scrape_interval: 15s
    static_configs:
      # Compose service name resolves via the compose network DNS
      - targets: ['vllm-server:8000']
    metrics_path: /metrics
curl http://localhost:8000/metrics
Docker Compose with Prometheus and Grafana:
# Full stack: vLLM inference server + Prometheus scraping + Grafana dashboards.
# NOTE: the top-level `version` key is obsolete in Compose v2; kept for
# compatibility with older docker-compose releases.
version: '3.8'

services:
  vllm:
    image: vllm/vllm-openai-rocm:latest
    container_name: vllm-server
    # Host IPC namespace: needed for multi-GPU shared-memory communication
    ipc: host
    devices:
      - /dev/kfd  # AMD compute driver interface
      - /dev/dri  # GPU render devices
    group_add:
      - video
    security_opt:
      - seccomp:unconfined
    cap_add:
      - SYS_PTRACE
    ports:
      - "8000:8000"
    environment:
      - HF_TOKEN=${HF_TOKEN}
      - VLLM_ROCM_USE_AITER=1
    volumes:
      # Persist downloaded model weights across container restarts
      - ~/.cache/huggingface:/root/.cache/huggingface
    command: >
      --model meta-llama/Llama-3.1-405B-Instruct
      --tensor-parallel-size 8
      --quantization fp8
      --max-model-len 32768
      --host 0.0.0.0
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Don't count failures during model load (~5-10 min for large models)
      start_period: 600s

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      # Named volume so metric history survives container recreation
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      # Default admin password — change this for any non-local deployment
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:
Create prometheus.yml:
global:
  # Default scrape interval applied to all jobs
  scrape_interval: 15s

scrape_configs:
  - job_name: 'vllm'
    static_configs:
      # Compose service name resolves via the compose network DNS
      - targets: ['vllm-server:8000']
    metrics_path: /metrics
# Stream container logs continuously (Ctrl-C to stop)
docker logs -f vllm-server
# Show only the most recent 100 lines
docker logs --tail 100 vllm-server
services:
  vllm:
    # ... other config ...
    logging:
      driver: json-file
      options:
        # Rotate logs: at most 3 files of 100 MB each per container
        max-size: "100m"
        max-file: "3"
vLLM logs include timestamps and log levels. For JSON output, use a log aggregator like Loki or Fluentd to parse the container logs.
vLLM handles SIGTERM for graceful shutdown. Docker Compose sends SIGTERM by default.
services:
  vllm:
    # ... other config ...
    # Wait up to 30s after SIGTERM for in-flight requests before SIGKILL
    stop_grace_period: 30s
This allows in-flight requests to complete before the container stops.
For multiple vLLM instances, use a reverse proxy:
services:
  nginx:
    image: nginx:latest
    ports:
      - "80:80"
    volumes:
      # Mount the load-balancer config defined below
      - ./nginx.conf:/etc/nginx/nginx.conf
    depends_on:
      - vllm-1
      - vllm-2
Example nginx.conf:
# Least-connections balancing suits LLM workloads, where request
# durations vary widely.
upstream vllm {
    least_conn;
    server vllm-1:8000;
    server vllm-2:8000;
}

server {
    listen 80;

    location / {
        proxy_pass http://vllm;
        # HTTP/1.1 with a cleared Connection header enables upstream keep-alive
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        # Generous timeouts accommodate long generation requests
        proxy_connect_timeout 300s;
        proxy_read_timeout 300s;
    }

    location /health {
        proxy_pass http://vllm;
    }
}
Example Prometheus alerting rules:
groups:
  - name: vllm
    rules:
      # p95 end-to-end latency above 30s sustained for 5 minutes
      - alert: VLLMHighLatency
        expr: histogram_quantile(0.95, rate(vllm:e2e_request_latency_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High p95 latency on vLLM"

      # More than 100 requests queued for 2 minutes — server is saturated
      - alert: VLLMHighQueueDepth
        expr: vllm:num_requests_waiting > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High queue depth on vLLM"

      # Scrape target unreachable for 1 minute
      - alert: VLLMDown
        expr: up{job="vllm"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "vLLM server is down"
To deploy: save the `docker-compose.yml` with the configuration above and the `prometheus.yml` for metrics scraping, then run `export HF_TOKEN=your_token` followed by `docker compose up -d`.