Sanitized mirror from private repository - 2026-04-18 10:57:41 UTC
This commit is contained in:
91
scripts/lib/ollama.py
Normal file
91
scripts/lib/ollama.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Ollama LLM client with retry and response cleaning."""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_URL = "http://192.168.0.145:31434"
|
||||
DEFAULT_MODEL = "qwen3-coder:latest"
|
||||
|
||||
|
||||
class OllamaUnavailableError(Exception):
    """Raised when the Ollama server cannot be reached after all retries."""
|
||||
|
||||
|
||||
def ollama_available(url: str = DEFAULT_URL) -> bool:
    """Quick health check — GET /api/tags; True iff the server answers."""
    endpoint = f"{url.rstrip('/')}/api/tags"
    try:
        with urllib.request.urlopen(urllib.request.Request(endpoint), timeout=5):
            pass
    except Exception:
        # Any failure (connection refused, DNS, timeout, HTTP error)
        # simply means "not available" — callers only need a boolean.
        return False
    return True
|
||||
|
||||
|
||||
# Module-level rate-limiter state shared by ollama_generate (via `global`):
# timestamp (time.time()) of the most recent call, 0.0 before the first one.
_last_call_time = 0.0
MIN_CALL_INTERVAL = 2.0  # seconds between calls to avoid overwhelming Ollama
|
||||
|
||||
|
||||
def ollama_generate(
    prompt: str,
    model: str = DEFAULT_MODEL,
    url: str = DEFAULT_URL,
    max_retries: int = 3,
    timeout: int = 120,
    temperature: float = 0.3,
    num_predict: int = 4000,  # Needs headroom for thinking + response with qwen3:32b
) -> str:
    """Generate text from Ollama with retry + backoff.

    Sends *prompt* as a single-turn chat to POST /api/chat and returns the
    model's reply with any leaked ``<think>...</think>`` spans removed.

    Args:
        prompt: User message content.
        model: Ollama model tag.
        url: Base URL of the Ollama server (trailing slash tolerated).
        max_retries: Total attempts before giving up.
        timeout: Per-request socket timeout in seconds.
        temperature: Sampling temperature forwarded in ``options``.
        num_predict: Token budget forwarded in ``options``.

    Returns:
        Cleaned response text. If the model spent its whole token budget
        "thinking", the raw thinking text is returned as a best effort.

    Raises:
        OllamaUnavailableError: if every attempt fails with a network error.
    """
    global _last_call_time
    # Client-side rate limit: keep calls at least MIN_CALL_INTERVAL apart.
    elapsed = time.time() - _last_call_time
    if elapsed < MIN_CALL_INTERVAL:
        time.sleep(MIN_CALL_INTERVAL - elapsed)
    _last_call_time = time.time()

    # Use /api/chat which properly separates thinking from content
    data = json.dumps({
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": num_predict},
    }).encode()
    req = urllib.request.Request(
        f"{url.rstrip('/')}/api/chat",
        data=data,
        headers={"Content-Type": "application/json"},
    )

    last_error = None
    for attempt in range(max_retries):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                result = json.loads(resp.read())
            msg = result.get("message", {})
            # `or ""` guards against explicit JSON nulls: .get(key, "")
            # returns None (and .strip() would crash) when the server
            # sends {"content": null}.
            content = (msg.get("content") or "").strip()
            thinking = (msg.get("thinking") or "").strip()
            # Content has the actual answer; thinking has the reasoning
            # If content exists, use it (strip any leaked think tags)
            if content:
                return re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
            # If only thinking exists, the model ran out of tokens before
            # answering — return the reasoning text as a best effort.
            if thinking:
                return thinking
            # Fallback to legacy /api/generate-style "response" field
            raw = (result.get("response") or "").strip()
            return re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
        except (urllib.error.URLError, TimeoutError, OSError) as e:
            last_error = e
            if attempt < max_retries - 1:
                wait = 2 ** attempt  # exponential backoff: 1s, 2s, 4s, ...
                log.warning("Ollama attempt %d/%d failed: %s — retrying in %ds",
                            attempt + 1, max_retries, e, wait)
                time.sleep(wait)

    raise OllamaUnavailableError(f"Ollama unavailable after {max_retries} attempts: {last_error}")
|
||||
Reference in New Issue
Block a user