Sanitized mirror from private repository - 2026-04-18 10:57:41 UTC
This commit is contained in:
91
scripts/lib/ollama.py
Normal file
91
scripts/lib/ollama.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Ollama LLM client with retry and response cleaning."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_URL = "http://192.168.0.145:31434"
|
||||
DEFAULT_MODEL = "qwen3-coder:latest"
|
||||
|
||||
|
||||
class OllamaUnavailableError(Exception):
    """Raised when the Ollama server cannot be reached after all retries."""
|
||||
|
||||
|
||||
def ollama_available(url: str = DEFAULT_URL) -> bool:
    """Quick health check — GET /api/tags; True iff the server answers."""
    endpoint = f"{url.rstrip('/')}/api/tags"
    try:
        with urllib.request.urlopen(urllib.request.Request(endpoint), timeout=5):
            pass
    except Exception:
        # Any failure (connection refused, DNS, timeout, HTTP error)
        # simply means "not available" — callers only need a boolean.
        return False
    return True
|
||||
|
||||
|
||||
# Module-level rate-limiter state shared by ollama_generate (via `global`):
# timestamp (time.time()) of the most recent call, 0.0 before the first one.
_last_call_time = 0.0
MIN_CALL_INTERVAL = 2.0  # seconds between calls to avoid overwhelming Ollama
|
||||
|
||||
|
||||
def ollama_generate(
    prompt: str,
    model: str = DEFAULT_MODEL,
    url: str = DEFAULT_URL,
    max_retries: int = 3,
    timeout: int = 120,
    temperature: float = 0.3,
    num_predict: int = 4000,  # Needs headroom for thinking + response with qwen3:32b
) -> str:
    """Generate text from Ollama with retry + backoff.

    Sends *prompt* as a single-turn chat to POST /api/chat and returns the
    model's reply with any leaked ``<think>...</think>`` spans removed.

    Args:
        prompt: User message content.
        model: Ollama model tag.
        url: Base URL of the Ollama server (trailing slash tolerated).
        max_retries: Total attempts before giving up.
        timeout: Per-request socket timeout in seconds.
        temperature: Sampling temperature forwarded in ``options``.
        num_predict: Token budget forwarded in ``options``.

    Returns:
        Cleaned response text. If the model spent its whole token budget
        "thinking", the raw thinking text is returned as a best effort.

    Raises:
        OllamaUnavailableError: if every attempt fails with a network error.
    """
    global _last_call_time
    # Client-side rate limit: keep calls at least MIN_CALL_INTERVAL apart.
    elapsed = time.time() - _last_call_time
    if elapsed < MIN_CALL_INTERVAL:
        time.sleep(MIN_CALL_INTERVAL - elapsed)
    _last_call_time = time.time()

    # Use /api/chat which properly separates thinking from content
    data = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": num_predict},
    }).encode()
    req = urllib.request.Request(
        f"{url.rstrip('/')}/api/chat",
        data=data,
        headers={"Content-Type": "application/json"},
    )

    last_error = None
    for attempt in range(max_retries):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                result = json.loads(resp.read())
            msg = result.get("message", {})
            # `or ""` guards against explicit JSON nulls: .get(key, "")
            # returns None (and .strip() would crash) when the server
            # sends {"content": null}.
            content = (msg.get("content") or "").strip()
            thinking = (msg.get("thinking") or "").strip()
            # Content has the actual answer; thinking has the reasoning
            # If content exists, use it (strip any leaked think tags)
            if content:
                return re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
            # If only thinking exists, the model ran out of tokens before
            # answering — return the reasoning text as a best effort.
            if thinking:
                return thinking
            # Fallback to legacy /api/generate-style "response" field
            raw = (result.get("response") or "").strip()
            return re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
        except (urllib.error.URLError, TimeoutError, OSError) as e:
            last_error = e
            if attempt < max_retries - 1:
                wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds",
                            attempt + 1, max_retries, e, wait)
                time.sleep(wait)

    raise OllamaUnavailableError(f"Ollama unavailable after {max_retries} attempts: {last_error}")
|
||||
Reference in New Issue
Block a user