"""Ollama LLM client with retry and response cleaning.""" import json import logging import re import time import urllib.request import urllib.error log = logging.getLogger(__name__) DEFAULT_URL = "http://192.168.0.145:31434" DEFAULT_MODEL = "qwen3-coder:latest" class OllamaUnavailableError(Exception): pass def ollama_available(url: str = DEFAULT_URL) -> bool: """Quick health check — GET /api/tags.""" try: req = urllib.request.Request(f"{url.rstrip('/')}/api/tags") with urllib.request.urlopen(req, timeout=5): return True except Exception: return False _last_call_time = 0.0 MIN_CALL_INTERVAL = 2.0 # seconds between calls to avoid overwhelming Ollama def ollama_generate( prompt: str, model: str = DEFAULT_MODEL, url: str = DEFAULT_URL, max_retries: int = 3, timeout: int = 120, temperature: float = 0.3, num_predict: int = 4000, # Needs headroom for thinking + response with qwen3:32b ) -> str: """Generate text from Ollama with retry + backoff. Returns cleaned response.""" global _last_call_time elapsed = time.time() - _last_call_time if elapsed < MIN_CALL_INTERVAL: time.sleep(MIN_CALL_INTERVAL - elapsed) _last_call_time = time.time() # Use /api/chat which properly separates thinking from content data = json.dumps({ "model": model, "messages": [{"role": "user", "content": prompt}], "stream": False, "options": {"temperature": temperature, "num_predict": num_predict}, }).encode() req = urllib.request.Request( f"{url.rstrip('/')}/api/chat", data=data, headers={"Content-Type": "application/json"}, ) last_error = None for attempt in range(max_retries): try: with urllib.request.urlopen(req, timeout=timeout) as resp: result = json.loads(resp.read()) msg = result.get("message", {}) content = msg.get("content", "").strip() thinking = msg.get("thinking", "").strip() # Content has the actual answer; thinking has the reasoning # If content exists, use it (strip any leaked think tags) if content: return re.sub(r".*?", "", content, flags=re.DOTALL).strip() # If only thinking exists, the model ran out of tokens before answering # Try to extract the answer from the end of the thinking text if thinking: # Look for category/keyword answers in the last 200 chars of thinking return thinking # Fallback to legacy response field raw = result.get("response", "").strip() return re.sub(r".*?", "", raw, flags=re.DOTALL).strip() except (urllib.error.URLError, TimeoutError, OSError) as e: last_error = e if attempt < max_retries - 1: wait = 2 ** attempt log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds", attempt + 1, max_retries, e, wait) time.sleep(wait) raise OllamaUnavailableError(f"Ollama unavailable after {max_retries} attempts: {last_error}")