# Ollama - Local LLM inference server
# OpenAI-compatible API for running local language models
# Port: 11434 (Ollama API)
# NOTE(review): an OpenAI-compatible proxy on port 8000 was mentioned here,
# but no such service or port mapping exists in this file — confirm whether
# it is deployed elsewhere before relying on it.
#
# Ollama is much better suited for CPU inference than vLLM.
# It provides efficient CPU-based inference with automatic optimization.

services:
  ollama:
    image: ollama/ollama:latest
    container_name: ollama-seattle
    ports:
      # host:container — quoted so YAML never misreads digits:digits.
      - "11434:11434"
    environment:
      # Listen on all interfaces inside the container on the API port.
      - OLLAMA_HOST=0.0.0.0:11434
      # Keep models resident in memory for 24h after last use.
      - OLLAMA_KEEP_ALIVE=24h
      # CPU-specific optimizations
      - OLLAMA_NUM_PARALLEL=2
      - OLLAMA_MAX_LOADED_MODELS=2
    volumes:
      # Persist model downloads
      - ollama-data:/root/.ollama
    restart: unless-stopped
    deploy:
      resources:
        limits:
          cpus: '12'
          memory: 32G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  ollama-data:
    name: ollama-seattle-data