---
# vLLM - High-performance LLM inference server
# OpenAI-compatible API for running local language models
# Port: 8000
#
# This configuration runs vLLM in CPU-only mode since seattle doesn't have a GPU.
# For better performance, consider using a machine with CUDA-compatible GPU.
services:
  vllm-qwen-1.5b:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen-1.5b
    ports:
      # Quoted: unquoted colon-separated digits hit YAML 1.1 sexagesimal parsing
      - "8000:8000"
    environment:
      # Force CPU mode - disable all CUDA detection.
      # NOTE: in list-form environment entries everything after '=' is literal,
      # so an empty value is written with nothing after '='. The original
      # CUDA_VISIBLE_DEVICES="" exported the two-character string '""',
      # which does NOT mask GPUs the way an empty value does.
      - CUDA_VISIBLE_DEVICES=
      - VLLM_DEVICE=cpu
      - VLLM_LOGGING_LEVEL=INFO
      # Prevent ModelScope download attempts (models come from HF Hub instead)
      - VLLM_USE_MODELSCOPE=False
    command:
      - --model
      - Qwen/Qwen2.5-1.5B-Instruct
      - --device
      - cpu
      - --max-model-len
      - "4096"
      # The vLLM CPU backend does not support float16; bfloat16 is the
      # supported half-precision dtype on CPU (vLLM falls back to float32
      # on CPUs without bf16 support).
      - --dtype
      - bfloat16
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
    restart: unless-stopped
    volumes:
      # Cache model downloads to avoid re-downloading
      - vllm-cache:/root/.cache/huggingface
    # Resource limits for CPU mode (adjust based on server capacity)
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 16G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  vllm-cache:
    name: vllm-cache