Files
homelab-optimized/hosts/vms/seattle/vllm.yaml
Gitea Mirror Bot fb00a325d1
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m14s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC
2026-04-18 11:19:59 +00:00

52 lines
1.3 KiB
YAML

---
# vLLM - High-performance LLM inference server
# OpenAI-compatible API for running local language models
# Port: 8000
#
# This configuration runs vLLM in CPU-only mode since seattle doesn't have a GPU.
# For better performance, consider using a machine with CUDA-compatible GPU.
services:
  vllm-qwen-1.5b:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen-1.5b
    ports:
      # host:container — quoted so YAML can never misread a low port as sexagesimal
      - "8000:8000"
    environment:
      # Force CPU mode - disable all CUDA detection.
      # Written as `VAR=` (empty value): in Compose list-form environment the
      # value is taken literally, so `VAR=""` would pass the quote characters
      # into the container. An empty CUDA_VISIBLE_DEVICES hides every GPU.
      - CUDA_VISIBLE_DEVICES=
      - VLLM_DEVICE=cpu
      - VLLM_LOGGING_LEVEL=INFO
      # Prevent CUDA/GPU detection attempts; models come from Hugging Face,
      # not ModelScope.
      - VLLM_USE_MODELSCOPE=False
    command:
      - --model
      - Qwen/Qwen2.5-1.5B-Instruct
      - --device
      - cpu
      - --max-model-len
      - "4096"
      # NOTE(review): vLLM's CPU backend typically expects bfloat16/float32;
      # some versions reject or silently cast float16 on CPU — confirm against
      # the pinned vLLM image before relying on this.
      - --dtype
      - float16
      # Required by some HF model repos that ship custom code.
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
    restart: unless-stopped
    volumes:
      # Cache model downloads to avoid re-downloading
      - vllm-cache:/root/.cache/huggingface
    # Resource limits for CPU mode (adjust based on server capacity).
    # `deploy.resources` is honored by `docker compose` (Compose v2) as well
    # as Swarm.
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 16G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  vllm-cache:
    name: vllm-cache