Sanitized mirror from private repository - 2026-04-05 12:13:06 UTC
This commit is contained in:
51
hosts/vms/seattle/vllm.yaml
Normal file
51
hosts/vms/seattle/vllm.yaml
Normal file
@@ -0,0 +1,51 @@
---
# vLLM - High-performance LLM inference server
# OpenAI-compatible API for running local language models
# Port: 8000
#
# This configuration runs vLLM in CPU-only mode since seattle doesn't have a GPU.
# For better performance, consider using a machine with CUDA-compatible GPU.
#
# NOTE(review): the official vllm/vllm-openai image is built against CUDA;
# CPU-only inference may require an image built from vLLM's Dockerfile.cpu —
# confirm this image actually starts on a GPU-less host.

services:
  vllm-qwen-1.5b:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen-1.5b
    ports:
      # Quoted to avoid YAML's sexagesimal/number traps on port mappings.
      - "8000:8000"
    environment:
      # Force CPU mode - disable all CUDA detection.
      # The whole entry is quoted with a bare trailing '=': writing
      # CUDA_VISIBLE_DEVICES="" as a plain scalar would make the two quote
      # characters part of the literal value instead of yielding an empty one.
      - "CUDA_VISIBLE_DEVICES="
      - VLLM_DEVICE=cpu
      - VLLM_LOGGING_LEVEL=INFO
      # Pull models from the Hugging Face Hub rather than ModelScope.
      - VLLM_USE_MODELSCOPE=False
    command:
      - --model
      - Qwen/Qwen2.5-1.5B-Instruct
      - --device
      - cpu
      - --max-model-len
      # Quoted so YAML keeps the numeric-looking CLI argument a string.
      - "4096"
      - --dtype
      # vLLM's CPU backend supports bfloat16/float32 only; float16 is
      # rejected on the cpu device, so bfloat16 is used here.
      - bfloat16
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
    restart: unless-stopped
    volumes:
      # Cache model downloads to avoid re-downloading
      - vllm-cache:/root/.cache/huggingface
    # Resource limits for CPU mode (adjust based on server capacity)
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 16G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  vllm-cache:
    name: vllm-cache
Reference in New Issue
Block a user