Sanitized mirror from private repository - 2026-04-05 11:10:41 UTC
This commit is contained in:
328
scripts/stack-restart.py
Normal file
328
scripts/stack-restart.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Auto-restart unhealthy containers with LLM safety analysis.
|
||||
|
||||
Monitors all Portainer endpoints for unhealthy containers. If a container
|
||||
has been unhealthy for >5 minutes and passes safety checks (blacklist,
|
||||
rate limit, LLM analysis), it gets restarted automatically.
|
||||
|
||||
SQLite DB tracks unhealthy state durations and restart counts.
|
||||
|
||||
Cron: */5 * * * * /usr/bin/python3 /home/homelab/organized/repos/homelab/scripts/stack-restart.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sqlite3
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
from lib.ollama import ollama_generate, ollama_available, OllamaUnavailableError
|
||||
from lib.notify import send_email, send_ntfy
|
||||
from lib.portainer import list_containers, get_container_logs, restart_container, ENDPOINTS
|
||||
|
||||
# ── config ───────────────────────────────────────────────────────────────────

# SQLite state file stored next to this script; tracks per-container
# unhealthy-since timestamps and restart counts across cron runs.
DB_PATH = Path(__file__).parent / "stack-restart.db"
# All timestamps are generated and compared in this zone.
TIMEZONE = ZoneInfo("America/Los_Angeles")
# ntfy topic used for high-priority "no auto-restart" push notifications.
NTFY_TOPIC = "REDACTED_NTFY_TOPIC"
UNHEALTHY_GRACE_SECONDS = 300  # 5 minutes
# Hard cap on automatic restarts per container per hour (restart-loop guard).
MAX_RESTARTS_PER_HOUR = 2
# How many trailing log lines to feed the LLM / include in notifications.
LOG_TAIL_LINES = 50

# Containers whose (lowercased) name contains any of these substrings are
# never restarted automatically — they only trigger an email notification.
RESTART_BLACKLIST = [
    "portainer",
    "headscale",
    "adguard",
    "postgres",
    "mariadb",
    "mongodb",
    "redis",
]

log = logging.getLogger("stack-restart")
|
||||
|
||||
|
||||
# ── database ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def init_db(db_path: Path) -> sqlite3.Connection:
    """Open (creating if necessary) the tracking database at *db_path*.

    Ensures the ``unhealthy_tracking`` table exists and configures the
    connection so rows come back as :class:`sqlite3.Row` mappings.
    """
    schema = (
        "CREATE TABLE IF NOT EXISTS unhealthy_tracking ("
        " container_id TEXT,"
        " endpoint TEXT,"
        " first_seen TEXT,"
        " last_checked TEXT,"
        " restart_count INTEGER DEFAULT 0,"
        " last_restart TEXT,"
        " PRIMARY KEY (container_id, endpoint)"
        ")"
    )
    connection = sqlite3.connect(str(db_path))
    connection.row_factory = sqlite3.Row
    connection.execute(schema)
    connection.commit()
    return connection
|
||||
|
||||
|
||||
def get_tracking(conn: sqlite3.Connection, container_id: str, endpoint: str) -> dict | None:
|
||||
"""Get tracking record for a container."""
|
||||
row = conn.execute(
|
||||
"SELECT * FROM unhealthy_tracking WHERE container_id = ? AND endpoint = ?",
|
||||
(container_id, endpoint),
|
||||
).fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
|
||||
def upsert_tracking(conn: sqlite3.Connection, container_id: str, endpoint: str,
|
||||
first_seen: str, last_checked: str, restart_count: int = 0,
|
||||
last_restart: str | None = None):
|
||||
"""Insert or update tracking record."""
|
||||
conn.execute("""
|
||||
INSERT INTO unhealthy_tracking (container_id, endpoint, first_seen, last_checked, restart_count, last_restart)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(container_id, endpoint)
|
||||
DO UPDATE SET last_checked = ?, restart_count = ?, last_restart = COALESCE(?, last_restart)
|
||||
""", (container_id, endpoint, first_seen, last_checked, restart_count, last_restart,
|
||||
last_checked, restart_count, last_restart))
|
||||
conn.commit()
|
||||
|
||||
|
||||
def remove_tracking(conn: sqlite3.Connection, container_id: str, endpoint: str):
    """Delete the tracking row for a container that has returned to healthy."""
    key = (container_id, endpoint)
    conn.execute(
        "DELETE FROM unhealthy_tracking WHERE container_id = ? AND endpoint = ?",
        key,
    )
    conn.commit()
|
||||
|
||||
|
||||
def cleanup_stale(conn: sqlite3.Connection, active_ids: set[tuple[str, str]]):
    """Drop tracking rows for containers that are no longer unhealthy.

    Any (container_id, endpoint) pair present in the database but absent
    from *active_ids* is assumed to have recovered and is removed.
    """
    tracked = conn.execute("SELECT container_id, endpoint FROM unhealthy_tracking").fetchall()
    for entry in tracked:
        cid, ep = entry["container_id"], entry["endpoint"]
        if (cid, ep) in active_ids:
            continue
        log.info("Cleaning up healthy container: %s on %s", cid[:12], ep)
        remove_tracking(conn, cid, ep)
|
||||
|
||||
|
||||
# ── container helpers ────────────────────────────────────────────────────────
|
||||
|
||||
def get_container_name(container: dict) -> str:
    """Return a human-readable name for *container*.

    Prefers the first entry of the Docker ``Names`` list (leading slashes
    stripped); falls back to the first 12 characters of the container Id.
    """
    for raw_name in container.get("Names", []):
        return raw_name.lstrip("/")
    return container.get("Id", "unknown")[:12]
|
||||
|
||||
|
||||
def is_blacklisted(name: str) -> bool:
|
||||
"""Check if container name matches any blacklist entry."""
|
||||
name_lower = name.lower()
|
||||
return any(bl in name_lower for bl in RESTART_BLACKLIST)
|
||||
|
||||
|
||||
def get_health_status(container: dict) -> str | None:
|
||||
"""Extract health status from container state."""
|
||||
state = container.get("State", "")
|
||||
# Docker API JSON list format: State is a string, Status has health info
|
||||
status = container.get("Status", "")
|
||||
if "(unhealthy)" in status.lower():
|
||||
return "unhealthy"
|
||||
# Some API versions nest it
|
||||
if isinstance(state, dict):
|
||||
health = state.get("Health", {})
|
||||
if isinstance(health, dict):
|
||||
return health.get("Status")
|
||||
return None
|
||||
|
||||
|
||||
def restarts_in_last_hour(tracking: dict) -> int:
|
||||
"""Count how many restarts happened in the last hour based on tracking."""
|
||||
if not tracking or not tracking.get("last_restart"):
|
||||
return 0
|
||||
last = datetime.fromisoformat(tracking["last_restart"])
|
||||
now = datetime.now(TIMEZONE)
|
||||
if last.tzinfo is None:
|
||||
last = last.replace(tzinfo=TIMEZONE)
|
||||
if (now - last) < timedelta(hours=1):
|
||||
return tracking.get("restart_count", 0)
|
||||
return 0
|
||||
|
||||
|
||||
# ── main logic ───────────────────────────────────────────────────────────────
|
||||
|
||||
def process_unhealthy(conn: sqlite3.Connection, endpoint: str, container: dict,
                      dry_run: bool, use_ollama: bool) -> None:
    """Handle one unhealthy container: track it, apply gates, maybe restart.

    Pipeline: (1) start or refresh the tracking record, (2) wait out the
    grace period, (3) stop with a notification if the name is blacklisted
    or the hourly restart limit is reached, (4) ask the LLM whether a
    restart looks safe, and only then restart.  When ``use_ollama`` is
    False this never restarts anything — it only sends a push notification.

    Args:
        conn: Open tracking database (see init_db).
        endpoint: Portainer endpoint the container runs on.
        container: Container record as returned by the Portainer list API.
        dry_run: If True, log intended actions instead of performing them.
        use_ollama: Whether the LLM is available for safety analysis.
    """
    cid = container.get("Id", "")
    name = get_container_name(container)
    now_str = datetime.now(TIMEZONE).isoformat()

    log.info("Unhealthy container: %s (%s) on %s", name, cid[:12], endpoint)

    # First sighting: record it and start the grace period — no action yet.
    tracking = get_tracking(conn, cid, endpoint)
    if not tracking:
        log.info("First seen, starting grace period: %s", name)
        upsert_tracking(conn, cid, endpoint, first_seen=now_str, last_checked=now_str)
        return

    # Compute how long the container has been unhealthy.  Stored timestamps
    # may be naive (older rows); assume they were written in TIMEZONE.
    first_seen = datetime.fromisoformat(tracking["first_seen"])
    if first_seen.tzinfo is None:
        first_seen = first_seen.replace(tzinfo=TIMEZONE)
    now = datetime.now(TIMEZONE)
    duration = (now - first_seen).total_seconds()

    # Refresh last_checked while preserving the existing restart history.
    upsert_tracking(conn, cid, endpoint,
                    first_seen=tracking["first_seen"],
                    last_checked=now_str,
                    restart_count=tracking.get("restart_count", 0),
                    last_restart=tracking.get("last_restart"))

    if duration < UNHEALTHY_GRACE_SECONDS:
        remaining = UNHEALTHY_GRACE_SECONDS - duration
        log.info("Still in grace period (%ds remaining): %s", remaining, name)
        return

    log.info("Container %s unhealthy for %ds (past grace period)", name, duration)

    # Blacklisted containers are never auto-restarted — email only.
    if is_blacklisted(name):
        log.warning("Container %s is blacklisted, notifying only", name)
        msg = f"Unhealthy (blacklisted, no auto-restart): {name} on {endpoint} ({duration:.0f}s)"
        if not dry_run:
            send_email(subject=f"[Homelab] Unhealthy: {name}", text_body=msg)
        else:
            log.info("[DRY-RUN] Would notify: %s", msg)
        return

    # Rate limit: avoid restart loops on a container that keeps failing.
    recent_restarts = restarts_in_last_hour(tracking)
    if recent_restarts >= MAX_RESTARTS_PER_HOUR:
        log.warning("Container %s hit restart limit (%d/%d in last hour)", name, recent_restarts, MAX_RESTARTS_PER_HOUR)
        msg = f"Unhealthy (restart limit reached): {name} on {endpoint} — {recent_restarts} restarts in last hour"
        if not dry_run:
            send_email(subject=f"[Homelab] Restart limit: {name}", text_body=msg)
        else:
            log.info("[DRY-RUN] Would notify: %s", msg)
        return

    # Fetch recent logs for the LLM prompt / notification body.  A fetch
    # failure is not fatal — a placeholder string is used instead.
    try:
        logs = get_container_logs(endpoint, cid, tail=LOG_TAIL_LINES)
    except Exception as e:
        log.error("Failed to get logs for %s: %s", name, e)
        logs = f"(failed to fetch logs: {e})"

    # LLM analysis or raw notification
    if use_ollama:
        prompt = (
            "Analyze these container logs. Is it safe to restart this container? "
            "Consider: data corruption risk, dependency chains, active operations. "
            "Reply with SAFE or UNSAFE followed by your reasoning.\n\n"
            f"Container: {name}\n"
            f"Endpoint: {endpoint}\n"
            f"Unhealthy duration: {duration:.0f}s\n\n"
            f"Logs:\n```\n{logs}\n```"
        )
        try:
            llm_response = ollama_generate(prompt)
            log.info("LLM response for %s: %s", name, llm_response[:200])

            # The verdict is expected on the first line.  "unsafe" must be
            # ruled out explicitly because it contains the substring "safe".
            first_line = llm_response.strip().split("\n")[0].lower()
            is_safe = "safe" in first_line and "unsafe" not in first_line

            if is_safe:
                log.info("LLM says SAFE to restart %s", name)
                if not dry_run:
                    success = restart_container(endpoint, cid)
                    # Record the attempt — even a failed restart counts
                    # toward the hourly limit.
                    new_count = tracking.get("restart_count", 0) + 1
                    upsert_tracking(conn, cid, endpoint,
                                    first_seen=tracking["first_seen"],
                                    last_checked=now_str,
                                    restart_count=new_count,
                                    last_restart=now_str)
                    status = "restarted" if success else "restart FAILED"
                    msg = f"Container {name} on {endpoint} {status} (LLM: SAFE)\n\n{llm_response}"
                    send_email(subject=f"[Homelab] Restarted: {name}", text_body=msg)
                else:
                    log.info("[DRY-RUN] Would restart: %s", name)
            else:
                log.warning("LLM says UNSAFE for %s", name)
                msg = f"Unhealthy but LLM says UNSAFE: {name} on {endpoint}\n\n{llm_response}"
                if not dry_run:
                    send_email(subject=f"[Homelab] Unsafe to restart: {name}", text_body=msg)
                else:
                    log.info("[DRY-RUN] Would notify (UNSAFE): %s", name)

        except OllamaUnavailableError as e:
            # Ollama died mid-run: degrade to a high-priority push, no restart.
            log.warning("Ollama failed during analysis: %s", e)
            msg = f"Unhealthy (LLM unavailable, no auto-restart): {name} on {endpoint}\n\nLast {LOG_TAIL_LINES} log lines:\n{logs}"
            if not dry_run:
                send_ntfy(topic=NTFY_TOPIC, title=f"Unhealthy: {name}", message=msg[:512], priority="high")
    else:
        # No Ollama — notify only, never restart
        log.info("Ollama unavailable, notifying only for %s", name)
        msg = f"Unhealthy (no LLM, no auto-restart): {name} on {endpoint} ({duration:.0f}s)\n\nLast {LOG_TAIL_LINES} log lines:\n{logs}"
        if not dry_run:
            send_ntfy(topic=NTFY_TOPIC, title=f"Unhealthy: {name}", message=msg[:512], priority="high")
        else:
            log.info("[DRY-RUN] Would notify: %s", msg[:200])
|
||||
|
||||
|
||||
def main() -> None:
    """Entry point: sweep every Portainer endpoint once and process
    any unhealthy containers found.

    Intended to run from cron every 5 minutes.  A failure on one endpoint
    or container never aborts the whole sweep; recovered containers have
    their tracking rows cleaned up at the end.
    """
    parser = argparse.ArgumentParser(description="Auto-restart unhealthy containers with LLM safety check")
    parser.add_argument("--dry-run", action="store_true", help="Log actions without restarting or notifying")
    parser.add_argument("--verbose", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )

    log.info("Starting stack-restart check")
    conn = init_db(DB_PATH)

    # Probe Ollama once per run; if it is down, the whole sweep degrades
    # to notify-only (process_unhealthy never restarts without the LLM).
    use_ollama = ollama_available()
    if use_ollama:
        log.info("Ollama is available for safety analysis")
    else:
        log.info("Ollama is NOT available — will notify only, no auto-restarts")

    # Track all currently unhealthy containers for cleanup
    active_unhealthy: set[tuple[str, str]] = set()

    for endpoint in ENDPOINTS:
        log.info("Checking endpoint: %s", endpoint)
        try:
            containers = list_containers(endpoint, all_containers=True)
        except Exception as e:
            # One unreachable endpoint must not abort the other endpoints.
            log.error("Failed to list containers on %s: %s", endpoint, e)
            continue

        for container in containers:
            health = get_health_status(container)
            cid = container.get("Id", "")

            if health == "unhealthy":
                active_unhealthy.add((cid, endpoint))
                try:
                    process_unhealthy(conn, endpoint, container, args.dry_run, use_ollama)
                except Exception as e:
                    # Isolate per-container failures so the sweep continues.
                    name = get_container_name(container)
                    log.error("Error processing %s on %s: %s", name, endpoint, e)

    # Clean up containers that recovered
    cleanup_stale(conn, active_unhealthy)

    conn.close()
    log.info("Stack-restart check complete")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user