Sanitized mirror from private repository - 2026-04-05 12:13:06 UTC
This commit is contained in:
51
hosts/vms/seattle/vllm.yaml
Normal file
51
hosts/vms/seattle/vllm.yaml
Normal file
@@ -0,0 +1,51 @@
---
# vLLM - High-performance LLM inference server
# OpenAI-compatible API for running local language models
# Port: 8000
#
# This configuration runs vLLM in CPU-only mode since seattle doesn't have a GPU.
# For better performance, consider using a machine with CUDA-compatible GPU.
#
# NOTE(review): the official vllm/vllm-openai image is built against CUDA;
# CPU-only inference may require an image built from vLLM's Dockerfile.cpu —
# confirm this image actually starts on a GPU-less host.

services:
  vllm-qwen-1.5b:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen-1.5b
    ports:
      # Quoted to avoid YAML's sexagesimal/number traps on port mappings.
      - "8000:8000"
    environment:
      # Force CPU mode - disable all CUDA detection.
      # The whole entry is quoted with a bare trailing '=': writing
      # CUDA_VISIBLE_DEVICES="" as a plain scalar would make the two quote
      # characters part of the literal value instead of yielding an empty one.
      - "CUDA_VISIBLE_DEVICES="
      - VLLM_DEVICE=cpu
      - VLLM_LOGGING_LEVEL=INFO
      # Pull models from the Hugging Face Hub rather than ModelScope.
      - VLLM_USE_MODELSCOPE=False
    command:
      - --model
      - Qwen/Qwen2.5-1.5B-Instruct
      - --device
      - cpu
      - --max-model-len
      # Quoted so YAML keeps the numeric-looking CLI argument a string.
      - "4096"
      - --dtype
      # vLLM's CPU backend supports bfloat16/float32 only; float16 is
      # rejected on the cpu device, so bfloat16 is used here.
      - bfloat16
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
    restart: unless-stopped
    volumes:
      # Cache model downloads to avoid re-downloading
      - vllm-cache:/root/.cache/huggingface
    # Resource limits for CPU mode (adjust based on server capacity)
    deploy:
      resources:
        limits:
          cpus: '8'
          memory: 16G
        reservations:
          cpus: '4'
          memory: 8G

volumes:
  vllm-cache:
    name: vllm-cache
Reference in New Issue
Block a user