# Ollama - Local LLM inference server
# OpenAI-compatible API for running local language models
# Ports: 11434 (Ollama API), 8000 (OpenAI-compatible proxy; not defined in this file)
#
# Ollama is much better suited to CPU inference than vLLM, which is designed
# around GPU serving. Ollama runs quantized (GGUF) models via llama.cpp and
# detects available CPU features automatically.

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-seattle
    ports:
      - "11434:11434"
    environment:
      # Listen on all interfaces inside the container
      - OLLAMA_HOST=0.0.0.0:11434
      # Keep loaded models in memory for 24h after the last request
      - OLLAMA_KEEP_ALIVE=24h
      # CPU-specific optimizations
      - OLLAMA_NUM_PARALLEL=2        # concurrent requests per loaded model
      - OLLAMA_MAX_LOADED_MODELS=2   # models resident in memory at once
    volumes:
      # Persist model downloads
      - ollama-data:/root/.ollama
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '12'
          memory: 32G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  ollama-data:
    name: ollama-seattle-data
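
# ---------------------------------------------------------------------------
# Usage sketch: bring the service up, pull a model, and query the API.
# A minimal example assuming the docker compose v2 CLI; the model name
# "llama3.2:3b" is illustrative and not pinned anywhere in this file.
#
#   docker compose up -d
#   docker exec ollama-seattle ollama pull llama3.2:3b
#   curl http://localhost:11434/api/generate \
#     -d '{"model": "llama3.2:3b", "prompt": "Why is the sky blue?", "stream": false}'
#
# Note: Ollama also serves an OpenAI-compatible API itself at
# http://localhost:11434/v1 (e.g. /v1/chat/completions), independent of
# any separate proxy on port 8000.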