#!/usr/bin/env bash
# fix-derp-connectivity.sh — Diagnose and fix Tailscale DERP relay issues
#
# When Kuma monitors start failing across an entire host group (especially
# Calypso), it's usually because the DERP relay on headscale (Calypso) has
# become stuck. Restarting headscale forces all peers to re-negotiate paths.
#
# Usage:
#   ./fix-derp-connectivity.sh          # diagnose + fix
#   ./fix-derp-connectivity.sh --check  # diagnose only, no restart
#
# Runs from: homelab-vm (where Claude Code runs)

# Shell options: -u makes unset variables an error, pipefail makes a pipeline
# fail if any stage fails. -e is not enabled: the checks below treat non-zero
# exits (unreachable hosts, grep finding nothing) as data, not fatal errors.
set -uo pipefail

# usage — print the command-line synopsis to stdout.
usage() {
    printf 'Usage: %s [--check]\n' "${0##*/}"
    printf '  (no arguments)  diagnose and, if issues are found, restart headscale\n'
    printf '  --check         diagnose only, no restart\n'
    printf '  -h, --help      show this help\n'
}

# parse_args ARGS... — set the global CHECK_ONLY from the command line.
#   --check     -> CHECK_ONLY=true
#   -h, --help  -> print usage, exit 0
#   other       -> print an error and usage to stderr, exit 2
# Unknown arguments are rejected rather than ignored: a mistyped --check must
# never fall through to the restart path.
parse_args() {
    local arg
    CHECK_ONLY=false
    for arg in "$@"; do
        case "$arg" in
            --check)
                CHECK_ONLY=true
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                printf 'Unknown argument: %s\n' "$arg" >&2
                usage >&2
                exit 2
                ;;
        esac
    done
}

CHECK_ONLY=false
parse_args "$@"

# ANSI colour sequences used by the log/warn/err helpers. They are stored in
# backslash-escaped form (single-quoted), so they only become real escape
# characters when the helpers print them.
NC='\033[0m'           # reset attributes
GREEN='\033[0;32m'     # [OK]
YELLOW='\033[1;33m'    # [WARN]
RED='\033[0;31m'       # [FAIL]

# log MESSAGE... — print a green "[OK]"-tagged line to stdout.
# Only the colour variables are escape-expanded (%b); the message goes through
# %s so any backslashes in it are printed literally instead of being
# interpreted as escape sequences.
log()  { printf '%b[OK]%b %s\n' "${GREEN}" "${NC}" "$*"; }
# warn MESSAGE... — print a yellow "[WARN]"-tagged line to stdout.
# Only the colour variables are escape-expanded (%b); the message goes through
# %s so any backslashes in it are printed literally instead of being
# interpreted as escape sequences.
warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$*"; }
# err MESSAGE... — print a red "[FAIL]"-tagged line to stdout.
# Only the colour variables are escape-expanded (%b); the message goes through
# %s so any backslashes in it are printed literally instead of being
# interpreted as escape sequences.
err()  { printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*"; }

# DERP servers to probe. The two arrays are index-aligned: DERP_NAMES[i] is the
# label printed for DERP_URLS[i], so keep them the same length and order.
DERP_NAMES=(
    "home-cal"
    "home-atl"
    "sea"
)
DERP_URLS=(
    "https://headscale.vish.gg:8443"
    "https://derp-atl.vish.gg:8445"
    "https://derp-sea.vish.gg:8444"
)

# SSH host on which the uptime-kuma container logs are inspected.
KUMA_HOST="pi-5"

# Report header: title, the current time as printed by `date -Iseconds`, and a
# blank separator line.
printf '%s\n' "=== DERP Connectivity Check ==="
printf 'Time: %s\n' "$(date -Iseconds)"
printf '\n'

# 1. Check DERP server reachability
# Probes every DERP_URLS entry with curl and tallies the results in derp_ok /
# derp_fail. derp_fail is read again by the fix step at the end of the script.
# curl is run without -f, so any HTTP response counts as "reachable"; only
# connection/TLS-level failures and timeouts count as unreachable.
echo "--- DERP Server Reachability ---"
derp_ok=0
derp_fail=0
for i in "${!DERP_NAMES[@]}"; do
    name="${DERP_NAMES[$i]}"
    url="${DERP_URLS[$i]}"
    # --connect-timeout only bounds the connect phase. --max-time bounds the
    # whole transfer, so a relay that accepts the connection but never answers
    # cannot hang the script.
    if curl -sk --connect-timeout 5 --max-time 10 -o /dev/null "$url" 2>/dev/null; then
        log "$name ($url): reachable"
        derp_ok=$((derp_ok + 1))
    else
        err "$name ($url): UNREACHABLE"
        derp_fail=$((derp_fail + 1))
    fi
done
echo

# 2. Run netcheck from local machine
# stdout and stderr of `tailscale netcheck` are captured together and a failing
# exit status is ignored. Only the "Nearest DERP" / "DERP latency" lines plus
# up to five lines of trailing context are shown, capped at ten lines in total.
printf '%s\n' "--- Local Tailscale Netcheck ---"
netcheck="$(tailscale netcheck 2>&1 || true)"
printf '%s\n' "$netcheck" \
    | grep -A5 -E 'Nearest DERP|DERP latency' \
    | head -10
printf '\n'

# 3. Check peer connection types
# Each `tailscale status` row is split into its first field (ip), second field
# (name) and the remainder of the line (rest). Rows whose remainder mentions
# "relay" are reported as warnings, rows mentioning "direct" as OK; all other
# rows are skipped.
echo "--- Peer Connection Types ---"
relay_re='relay "[^"]+"'
# A direct endpoint is either IPv4 "a.b.c.d:port" or bracketed IPv6
# "[addr]:port"; an IPv4-only pattern would leave IPv6 peers with a blank value.
direct_re='direct (\[[0-9a-fA-F:.]+\]|[0-9.]+):[0-9]+'
tailscale status 2>/dev/null | while read -r ip name rest; do
    if [[ "$rest" == *relay* ]]; then
        relay=""
        [[ "$rest" =~ $relay_re ]] && relay="${BASH_REMATCH[0]}"
        warn "$name ($ip): $relay"
    elif [[ "$rest" == *direct* ]]; then
        direct=""
        [[ "$rest" =~ $direct_re ]] && direct="${BASH_REMATCH[0]}"
        log "$name ($ip): $direct"
    fi
done
echo

# 4. Check Kuma for failing monitors (if reachable)
# Counts 'WARN.*Failing' lines in the last 5 minutes of uptime-kuma container
# logs on $KUMA_HOST. Afterwards kuma_fails is either a non-negative integer or
# "?" (count unavailable); the fix step at the end of the script reads it.
echo "--- Kuma Monitor Status ---"
# grep -c always prints the count (including "0") but exits 1 when nothing
# matched. `|| true` only neutralises that exit status. Do not use
# `|| echo 0` here: it would append a second "0" and produce the two-line
# value "0<newline>0", which breaks the numeric comparison below.
kuma_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
    "docker logs uptime-kuma --since=5m 2>&1 | grep -c 'WARN.*Failing' || true" 2>/dev/null)
# Empty output (ssh failed) or anything that is not a plain integer is unknown.
[[ "$kuma_fails" =~ ^[0-9]+$ ]] || kuma_fails="?"
if [[ "$kuma_fails" == "?" ]]; then
    warn "Could not reach Kuma on $KUMA_HOST"
elif [[ "$kuma_fails" -gt 5 ]]; then
    err "Kuma has $kuma_fails failing monitors in last 5 minutes"
else
    log "Kuma: $kuma_fails failures in last 5 minutes"
fi
echo

# 5. Check headscale container health
# Prints the "<name> <status>" row(s) from `docker ps` on calypso that mention
# headscale. Two failure modes are reported separately:
#   - ssh exit status 255: ssh itself failed (host unreachable, auth error)
#   - empty output otherwise: the host answered but no headscale container
#     was listed by docker ps
echo "--- Headscale Status ---"
hs_status=$(ssh -o ConnectTimeout=5 calypso \
    "sudo /usr/local/bin/docker ps --format '{{.Names}} {{.Status}}' | grep headscale" 2>/dev/null)
hs_rc=$?
if [[ "$hs_rc" -eq 255 ]]; then
    hs_status="UNREACHABLE"
elif [[ -z "$hs_status" ]]; then
    hs_status="NOT RUNNING (no headscale container listed by docker ps)"
fi
echo "  $hs_status"
echo

# 6. Fix if needed
# A fix is attempted when any DERP probe failed, or when Kuma reported more
# than 5 failures. kuma_fails is compared numerically only after confirming it
# is a plain integer, so "?" (Kuma unreachable) or malformed output never
# reaches the arithmetic test.
# Exit status: 1 if issues were found in --check mode, if the restart failed,
# or if Kuma still shows more than 3 failures afterwards; otherwise the script
# falls off the end normally.
if [[ "$derp_fail" -gt 0 ]] || { [[ "$kuma_fails" =~ ^[0-9]+$ ]] && [[ "$kuma_fails" -gt 5 ]]; }; then
    echo "=== Issues Detected ==="
    if $CHECK_ONLY; then
        warn "Run without --check to apply fixes"
        exit 1
    fi

    echo "Restarting headscale (embedded DERP relay)..."
    # Only claim success if the remote restart actually succeeded; otherwise
    # the waits and re-checks below would be meaningless.
    if ! ssh -o ConnectTimeout=5 calypso "sudo /usr/local/bin/docker restart headscale" 2>/dev/null; then
        err "Headscale restart failed — could not reach calypso or docker restart returned an error"
        exit 1
    fi
    log "Headscale restarted"

    echo "Waiting 15s for DERP to come back..."
    sleep 15

    # Re-check
    echo
    echo "--- Post-fix Netcheck ---"
    tailscale netcheck 2>&1 | grep -E 'DERP latency' -A5 | head -8

    echo
    echo "--- Post-fix Kuma (waiting 60s for monitor cycle) ---"
    sleep 60
    # grep -c prints "0" itself when nothing matches (and exits 1); `|| true`
    # only clears the exit status so the output stays a single number.
    post_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
        "docker logs uptime-kuma --since=1m 2>&1 | grep -c 'WARN.*Failing' || true" 2>/dev/null)
    # Empty output (ssh failed) or any non-integer value is treated as unknown.
    [[ "$post_fails" =~ ^[0-9]+$ ]] || post_fails="?"
    if [[ "$post_fails" == "?" ]]; then
        warn "Could not check Kuma"
    elif [[ "$post_fails" -gt 3 ]]; then
        err "Still $post_fails failures — may need manual investigation"
        exit 1
    else
        log "Kuma: $post_fails failures — looks healthy"
    fi
elif [[ "$kuma_fails" == "?" ]]; then
    # Kuma could not be queried, so its health must not be reported as good.
    log "No issues detected — all DERPs reachable (Kuma status unknown)"
else
    log "No issues detected — all DERPs reachable, Kuma healthy"
fi