---
# vLLM - High-performance LLM inference server
# OpenAI-compatible API for running local language models
# Port: 8000
#
# This configuration runs vLLM in CPU-only mode since seattle doesn't have a GPU.
# For better performance, consider using a machine with CUDA-compatible GPU.
services:
  vllm-qwen-1.5b:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen-1.5b
    ports:
      # Quoted: unquoted colon-separated digits hit YAML 1.1 sexagesimal parsing
      - "8000:8000"
    environment:
      # Force CPU mode - disable all CUDA detection.
      # NOTE: in list-form environment entries everything after '=' is literal,
      # so an empty value is written with nothing after '='. The original
      # CUDA_VISIBLE_DEVICES="" exported the two-character string '""',
      # which does NOT mask GPUs the way an empty value does.
      - CUDA_VISIBLE_DEVICES=
      - VLLM_DEVICE=cpu
      - VLLM_LOGGING_LEVEL=INFO
      # Prevent ModelScope download attempts (models come from HF Hub instead)
      - VLLM_USE_MODELSCOPE=False
    command:
      - --model
      - Qwen/Qwen2.5-1.5B-Instruct
      - --device
      - cpu
      - --max-model-len
      - "4096"
      # The vLLM CPU backend does not support float16; bfloat16 is the
      # supported half-precision dtype on CPU (vLLM falls back to float32
      # on CPUs without bf16 support).
      - --dtype
      - bfloat16
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
    restart: unless-stopped
    volumes:
      # Cache model downloads to avoid re-downloading
      - vllm-cache:/root/.cache/huggingface
    # Resource limits for CPU mode (adjust based on server capacity)
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 16G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  vllm-cache:
    name: vllm-cache