92 lines
3.2 KiB
Python
92 lines
3.2 KiB
Python
"""Ollama LLM client with retry and response cleaning."""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import time
|
|
import urllib.request
|
|
import urllib.error
|
|
|
|
# Module-level logger, named after this module per stdlib convention.
log = logging.getLogger(__name__)

# Default Ollama server endpoint.  NOTE(review): this is a hard-coded LAN
# address — callers on other networks must pass `url=` explicitly.
DEFAULT_URL = "http://192.168.0.145:31434"

# Model tag sent to Ollama when the caller does not specify one.
DEFAULT_MODEL = "qwen3-coder:latest"
|
|
|
|
|
|
class OllamaUnavailableError(Exception):
    """Raised when the Ollama server cannot be reached after all retries.

    Carries a human-readable message summarizing the last underlying error.
    """
|
|
|
|
|
|
def ollama_available(url: str = DEFAULT_URL) -> bool:
    """Return True if the Ollama server answers a GET on /api/tags.

    Best-effort probe: any failure (DNS, refused connection, timeout,
    HTTP error) simply yields False rather than raising.
    """
    tags_endpoint = f"{url.rstrip('/')}/api/tags"
    try:
        probe = urllib.request.Request(tags_endpoint)
        with urllib.request.urlopen(probe, timeout=5):
            pass
    except Exception:
        return False
    return True
|
|
|
|
|
|
# Timestamp (time.time()) of the most recent ollama_generate() request;
# state for the simple client-side rate limiter below.  NOTE(review): not
# thread-safe — assumes single-threaded callers.
_last_call_time = 0.0

MIN_CALL_INTERVAL = 2.0  # seconds enforced between calls to avoid overwhelming Ollama
|
|
|
|
|
|
def _strip_think(text: str) -> str:
    """Remove any leaked <think>...</think> spans and trim whitespace."""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()


def ollama_generate(
    prompt: str,
    model: str = DEFAULT_MODEL,
    url: str = DEFAULT_URL,
    max_retries: int = 3,
    timeout: int = 120,
    temperature: float = 0.3,
    num_predict: int = 4000,  # Needs headroom for thinking + response with qwen3:32b
) -> str:
    """Generate text from Ollama with retry + backoff. Returns cleaned response.

    Args:
        prompt: User message sent as a single-turn chat.
        model: Ollama model tag.
        url: Base URL of the Ollama server.
        max_retries: Total attempts before giving up.
        timeout: Per-request timeout in seconds.
        temperature: Sampling temperature forwarded in ``options``.
        num_predict: Token budget for thinking + answer.

    Returns:
        The assistant's content with any leaked ``<think>`` tags stripped;
        falls back to the raw thinking text, then to the legacy
        ``response`` field.

    Raises:
        OllamaUnavailableError: if every attempt fails (network error,
            timeout, or unparseable response body).
    """
    global _last_call_time
    # Client-side rate limit: enforce MIN_CALL_INTERVAL seconds between calls.
    elapsed = time.time() - _last_call_time
    if elapsed < MIN_CALL_INTERVAL:
        time.sleep(MIN_CALL_INTERVAL - elapsed)
    _last_call_time = time.time()

    # Use /api/chat which properly separates thinking from content.
    data = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": num_predict},
    }).encode()
    req = urllib.request.Request(
        f"{url.rstrip('/')}/api/chat",
        data=data,
        headers={"Content-Type": "application/json"},
    )

    last_error = None
    for attempt in range(max_retries):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                result = json.loads(resp.read())
            # Use `or` fallbacks, not .get() defaults: the server may send
            # keys with explicit JSON nulls, which .get() would return as-is
            # and crash the .strip() call.
            msg = result.get("message") or {}
            content = (msg.get("content") or "").strip()
            thinking = (msg.get("thinking") or "").strip()
            # Content has the actual answer; thinking has the reasoning.
            if content:
                return _strip_think(content)
            # Only thinking present: the model ran out of tokens before
            # answering — return the reasoning so the caller sees something.
            if thinking:
                return thinking
            # Fallback to the legacy /api/generate-style "response" field.
            return _strip_think(result.get("response") or "")
        except (urllib.error.URLError, TimeoutError, OSError,
                json.JSONDecodeError) as e:
            # JSONDecodeError covers truncated/garbage bodies; retrying it
            # here keeps the promise that failures surface as
            # OllamaUnavailableError rather than an uncaught ValueError.
            last_error = e
            if attempt < max_retries - 1:
                wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds",
                            attempt + 1, max_retries, e, wait)
                time.sleep(wait)

    raise OllamaUnavailableError(f"Ollama unavailable after {max_retries} attempts: {last_error}")
|