Files
homelab-optimized/scripts/lib/ollama.py
Gitea Mirror Bot 7544b9dd06
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m4s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-19 09:54:54 UTC
2026-04-19 09:54:54 +00:00

92 lines
3.2 KiB
Python

"""Ollama LLM client with retry and response cleaning."""
import json
import logging
import re
import time
import urllib.request
import urllib.error
# Module-level logger, named after this module per stdlib convention.
log = logging.getLogger(__name__)

# Default Ollama endpoint and model; both overridable per call.
# NOTE(review): hard-coded LAN address — presumably the homelab Ollama
# NodePort service; confirm against the deployment config.
DEFAULT_URL = "http://192.168.0.145:31434"
DEFAULT_MODEL = "qwen3-coder:latest"
class OllamaUnavailableError(Exception):
    """Raised when the Ollama server cannot be reached after all retries."""
    pass
def ollama_available(url: str = DEFAULT_URL) -> bool:
    """Return True if the Ollama server answers a GET on /api/tags.

    Best-effort liveness probe: any failure at all (connection refused,
    DNS error, timeout, non-2xx status) is swallowed and reported as
    "unavailable" rather than raised.
    """
    endpoint = f"{url.rstrip('/')}/api/tags"
    try:
        probe = urllib.request.Request(endpoint)
        with urllib.request.urlopen(probe, timeout=5):
            pass
    except Exception:
        return False
    return True
# Module-level rate limiter shared by ollama_generate(): the timestamp of
# the most recent call plus the minimum spacing enforced between calls.
# NOTE(review): plain global, not thread-safe — fine for single-threaded
# script use; revisit if callers go concurrent.
_last_call_time = 0.0
MIN_CALL_INTERVAL = 2.0  # seconds between calls to avoid overwhelming Ollama
# Matches leaked chain-of-thought blocks that some models embed in content.
_THINK_TAG_RE = re.compile(r"<think>.*?</think>", flags=re.DOTALL)


def _strip_think(text: str) -> str:
    """Remove any leaked <think>…</think> blocks and trim whitespace."""
    return _THINK_TAG_RE.sub("", text).strip()


def ollama_generate(
    prompt: str,
    model: str = DEFAULT_MODEL,
    url: str = DEFAULT_URL,
    max_retries: int = 3,
    timeout: int = 120,
    temperature: float = 0.3,
    num_predict: int = 4000,  # Needs headroom for thinking + response with qwen3:32b
) -> str:
    """Generate text from Ollama with retry + backoff. Returns cleaned response.

    Posts *prompt* to the /api/chat endpoint as a single user message and
    returns the model's answer with any leaked <think> tags stripped.

    Args:
        prompt: User message sent to the model.
        model: Ollama model tag.
        url: Base URL of the Ollama server.
        max_retries: Attempts before giving up; backoff is 2**attempt seconds.
        timeout: Per-request socket timeout in seconds.
        temperature: Sampling temperature passed through in "options".
        num_predict: Token budget passed through in "options".

    Raises:
        OllamaUnavailableError: if every attempt fails with a network-level
            error (URLError/TimeoutError/OSError).
    """
    global _last_call_time

    # Client-side rate limit: keep calls at least MIN_CALL_INTERVAL apart.
    elapsed = time.time() - _last_call_time
    if elapsed < MIN_CALL_INTERVAL:
        time.sleep(MIN_CALL_INTERVAL - elapsed)
    _last_call_time = time.time()

    # Use /api/chat which properly separates thinking from content.
    data = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": num_predict},
    }).encode()
    req = urllib.request.Request(
        f"{url.rstrip('/')}/api/chat",
        data=data,
        headers={"Content-Type": "application/json"},
    )

    last_error = None
    for attempt in range(max_retries):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                result = json.loads(resp.read())
                # Guard every field with `or ...`: the server may send an
                # explicit JSON null, and None.strip() would raise
                # AttributeError (the original crashed on that case).
                msg = result.get("message") or {}
                content = (msg.get("content") or "").strip()
                thinking = (msg.get("thinking") or "").strip()
                # content carries the actual answer; thinking the reasoning.
                if content:
                    return _strip_think(content)
                # Only thinking present: the model exhausted num_predict
                # before emitting an answer — return the reasoning text as a
                # best-effort result.
                if thinking:
                    return thinking
                # Fallback to the legacy /api/generate-style "response" field.
                return _strip_think(result.get("response") or "")
        except (urllib.error.URLError, TimeoutError, OSError) as e:
            last_error = e
            if attempt < max_retries - 1:
                wait = 2 ** attempt
                log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds",
                            attempt + 1, max_retries, e, wait)
                time.sleep(wait)
    raise OllamaUnavailableError(f"Ollama unavailable after {max_retries} attempts: {last_error}")