#!/usr/bin/env bash
# fix-derp-connectivity.sh — Diagnose and fix Tailscale DERP relay issues
#
# When Kuma monitors start failing across an entire host group (especially
# Calypso), it's usually because the DERP relay on headscale (Calypso) has
# become stuck. Restarting headscale forces all peers to re-negotiate paths.
#
# Usage:
#   ./fix-derp-connectivity.sh          # diagnose + fix
#   ./fix-derp-connectivity.sh --check  # diagnose only, no restart
#
# Exit status:
#   0  no issues detected, or the fix was applied and Kuma looks healthy
#      (also returned when Kuma could not be queried after the fix)
#   1  issues detected in --check mode, the headscale restart failed,
#      or Kuma still reports failures after the restart
#
# Runs from: homelab-vm (where Claude Code runs)

# No `-e` on purpose: every diagnostic step is best-effort and the script
# should keep reporting even when an individual probe fails.
set -uo pipefail

CHECK_ONLY=false
[[ "${1:-}" == "--check" ]] && CHECK_ONLY=true

readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

# Status-line helpers. Colours go through %b, the message through %s so that
# backslashes or '%' in host names / URLs are printed literally.
log()  { printf '%b[OK]%b %s\n'   "$GREEN"  "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
err()  { printf '%b[FAIL]%b %s\n' "$RED"    "$NC" "$*"; }

# DERP_NAMES[i] is the label for DERP_URLS[i]; keep both arrays in step.
readonly DERP_NAMES=("home-cal" "home-atl" "sea")
readonly DERP_URLS=(
  "https://headscale.vish.gg:8443"
  "https://derp-atl.vish.gg:8445"
  "https://derp-sea.vish.gg:8444"
)
readonly KUMA_HOST="pi-5"
readonly HEADSCALE_HOST="calypso"

#######################################
# Count 'WARN.*Failing' lines in the uptime-kuma container log on KUMA_HOST.
# Globals:   KUMA_HOST (read)
# Arguments: $1 - look-back window passed to `docker logs --since` (e.g. 5m)
# Outputs:   the count on stdout, or "?" when no numeric count was obtained
#            (SSH failure, unexpected output)
# Returns:   0 always
#######################################
kuma_failure_count() {
  local since=$1
  local out
  # `grep -c` prints "0" AND exits 1 when nothing matches, so the remote side
  # uses `|| true` (not `|| echo 0`, which would yield two lines "0\n0").
  out=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
    "docker logs uptime-kuma --since=${since} 2>&1 | grep -c 'WARN.*Failing' || true" \
    2>/dev/null) || out=""
  # Only hand back something that is safe to use in a numeric comparison.
  if [[ "$out" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$out"
  else
    printf '?\n'
  fi
}

echo "=== DERP Connectivity Check ==="
echo "Time: $(date -Iseconds)"
echo

# 1. Check DERP server reachability
echo "--- DERP Server Reachability ---"
derp_fail=0
for i in "${!DERP_NAMES[@]}"; do
  name="${DERP_NAMES[$i]}"
  url="${DERP_URLS[$i]}"
  # Any HTTP response counts as reachable (no -f). --max-time bounds the whole
  # transfer so a relay that accepts the connection but never answers cannot
  # hang the script.
  if curl -sk --connect-timeout 5 --max-time 10 -o /dev/null "$url" 2>/dev/null; then
    log "$name ($url): reachable"
  else
    err "$name ($url): UNREACHABLE"
    derp_fail=$((derp_fail + 1))
  fi
done
echo

# 2. Run netcheck from local machine
echo "--- Local Tailscale Netcheck ---"
netcheck=$(tailscale netcheck 2>&1 || true)
grep -E -A5 'Nearest DERP|DERP latency' <<<"$netcheck" | head -10
echo

# 3. Check peer connection types
echo "--- Peer Connection Types ---"
relay_re='relay "[^"]+"'
direct_re='direct [0-9.]+:[0-9]+'
tailscale status 2>/dev/null | while read -r ip name rest; do
  if [[ "$rest" == *relay* ]]; then
    detail=""
    [[ "$rest" =~ $relay_re ]] && detail="${BASH_REMATCH[0]}"
    warn "$name ($ip): $detail"
  elif [[ "$rest" == *direct* ]]; then
    detail=""
    [[ "$rest" =~ $direct_re ]] && detail="${BASH_REMATCH[0]}"
    log "$name ($ip): $detail"
  fi
done
echo

# 4. Check Kuma for failing monitors (if reachable)
echo "--- Kuma Monitor Status ---"
kuma_fails=$(kuma_failure_count 5m)
if [[ "$kuma_fails" == "?" ]]; then
  warn "Could not reach Kuma on $KUMA_HOST"
elif [[ "$kuma_fails" -gt 5 ]]; then
  err "Kuma has $kuma_fails failing monitors in last 5 minutes"
else
  log "Kuma: $kuma_fails failures in last 5 minutes"
fi
echo

# 5. Check headscale container health
echo "--- Headscale Status ---"
hs_status=$(ssh -o ConnectTimeout=5 "$HEADSCALE_HOST" \
  "sudo /usr/local/bin/docker ps --format '{{.Names}} {{.Status}}' | grep headscale" \
  2>/dev/null)
hs_rc=$?
# ssh itself exits 255 on connection errors; any other non-zero status comes
# from the remote pipeline (typically grep finding no headscale container).
if [[ "$hs_rc" -eq 255 ]]; then
  hs_status="UNREACHABLE"
elif [[ "$hs_rc" -ne 0 || -z "$hs_status" ]]; then
  hs_status="NOT RUNNING (no headscale container listed on $HEADSCALE_HOST)"
fi
echo "  $hs_status"
echo

# 6. Fix if needed
if [[ "$derp_fail" -gt 0 ]] || [[ "$kuma_fails" != "?" && "$kuma_fails" -gt 5 ]]; then
  echo "=== Issues Detected ==="

  if [[ "$CHECK_ONLY" == true ]]; then
    warn "Run without --check to apply fixes"
    exit 1
  fi

  echo "Restarting headscale (embedded DERP relay)..."
  # Only claim success when the restart actually succeeded; otherwise the
  # waits below would be pointless and the report misleading.
  if ssh -o ConnectTimeout=5 "$HEADSCALE_HOST" \
    "sudo /usr/local/bin/docker restart headscale" 2>/dev/null; then
    log "Headscale restarted"
  else
    err "Headscale restart failed on $HEADSCALE_HOST — may need manual investigation"
    exit 1
  fi

  echo "Waiting 15s for DERP to come back..."
  sleep 15

  # Re-check
  echo
  echo "--- Post-fix Netcheck ---"
  tailscale netcheck 2>&1 | grep -E -A5 'DERP latency' | head -8
  echo

  echo "--- Post-fix Kuma (waiting 60s for monitor cycle) ---"
  sleep 60
  post_fails=$(kuma_failure_count 1m)
  if [[ "$post_fails" == "?" ]]; then
    warn "Could not check Kuma"
  elif [[ "$post_fails" -gt 3 ]]; then
    err "Still $post_fails failures — may need manual investigation"
    exit 1
  else
    log "Kuma: $post_fails failures — looks healthy"
  fi
else
  log "No issues detected — all DERPs reachable, Kuma healthy"
fi