---
# vLLM - High-performance LLM inference server
# OpenAI-compatible API for running local language models
# Port: 8000
#
# This configuration runs vLLM in CPU-only mode since seattle doesn't have a GPU.
# For better performance, consider using a machine with CUDA-compatible GPU.
#
# NOTE(review): vllm/vllm-openai is the CUDA build of vLLM; CPU-only hosts
# typically need a CPU build image — verify this image actually starts here.

services:
  vllm-qwen-1.5b:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen-1.5b
    ports:
      # Quoted so YAML cannot misread the mapping as a sexagesimal number.
      - "8000:8000"
    environment:
      # Force CPU mode - disable all CUDA detection.
      # List-form environment values are passed literally by Compose, so an
      # empty value must be written as `NAME=`; writing `NAME=""` would set
      # the variable to two literal quote characters instead of empty.
      - CUDA_VISIBLE_DEVICES=
      # NOTE(review): VLLM_DEVICE is not a documented vLLM runtime variable;
      # device selection happens via the `--device cpu` flag below. Kept as a
      # belt-and-suspenders setting — confirm and drop if unused.
      - VLLM_DEVICE=cpu
      - VLLM_LOGGING_LEVEL=INFO
      # Prevent CUDA/GPU detection attempts
      - VLLM_USE_MODELSCOPE=False
    command:
      - --model
      - Qwen/Qwen2.5-1.5B-Instruct
      - --device
      - cpu
      - --max-model-len
      - "4096"
      # The vLLM CPU backend does not support float16; it requires bfloat16
      # (use float32 on CPUs without bf16 instructions). float16 here would
      # abort at startup.
      - --dtype
      - bfloat16
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
    restart: unless-stopped
    volumes:
      # Cache model downloads to avoid re-downloading
      - vllm-cache:/root/.cache/huggingface
    # Resource limits for CPU mode (adjust based on server capacity)
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 16G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  vllm-cache:
    name: vllm-cache