#!/usr/bin/env bash
# fix-derp-connectivity.sh — Diagnose and fix Tailscale DERP relay issues
#
# When Kuma monitors start failing across an entire host group (especially
# Calypso), it's usually because the DERP relay on headscale (Calypso) has
# become stuck. Restarting headscale forces all peers to re-negotiate paths.
#
# Usage:
#   ./fix-derp-connectivity.sh          # diagnose + fix
#   ./fix-derp-connectivity.sh --check  # diagnose only, no restart
#
# Runs from: homelab-vm (where Claude Code runs)

# errexit (-e) is not enabled: the diagnostic steps below keep running even
# when an individual probe returns non-zero.
set -uo pipefail

# Only the first argument is inspected. Anything other than "--check" (or no
# argument at all) is rejected, so a mistyped flag cannot fall through to the
# restart path.
CHECK_ONLY=false
case "${1:-}" in
  "") ;;
  --check) CHECK_ONLY=true ;;
  *)
    printf 'Unknown argument: %s\n' "$1" >&2
    printf 'Usage: %s [--check]\n' "${0##*/}" >&2
    exit 2
    ;;
esac

# ANSI colour escapes, stored as literal backslash sequences; they are
# expanded at print time by the logging helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Status-line helpers. Each joins all of its arguments into one message and
# writes it to stdout behind a coloured tag. Only the colour variables are
# escape-expanded (%b); the message itself is printed verbatim (%s), so
# backslashes in hostnames or command output are not reinterpreted.

# Print a green "[OK]" line.
log() {
  printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"
}

# Print a yellow "[WARN]" line.
warn() {
  printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"
}

# Print a red "[FAIL]" line.
err() {
  printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"
}

# DERP endpoints to probe. DERP_NAMES[i] is the display label for
# DERP_URLS[i], so the two arrays must stay the same length and order.
DERP_NAMES=("home-cal" "home-atl" "sea")
DERP_URLS=("https://headscale.vish.gg:8443" "https://derp-atl.vish.gg:8445" "https://derp-sea.vish.gg:8444")

# SSH host whose "uptime-kuma" container logs are inspected for failures.
KUMA_HOST="pi-5"

echo "=== DERP Connectivity Check ==="
echo "Time: $(date -Iseconds)"
echo

# 1. Check DERP server reachability
#
# Each URL gets a single HTTPS request. curl runs without --fail and with -k,
# so any HTTP response (whatever its status code or certificate) counts as
# "reachable"; only transport-level failures count as unreachable.
# --connect-timeout bounds the TCP/TLS setup and --max-time bounds the whole
# transfer, so a server that accepts the connection and then stalls cannot
# hang the script.
echo "--- DERP Server Reachability ---"
derp_ok=0
derp_fail=0
for i in "${!DERP_NAMES[@]}"; do
  name="${DERP_NAMES[$i]}"
  url="${DERP_URLS[$i]}"
  if curl -sk --connect-timeout 5 --max-time 15 -o /dev/null "$url" 2>/dev/null; then
    log "$name ($url): reachable"
    # Plain assignment rather than ((derp_ok++)), which returns a non-zero
    # status whenever the pre-increment value is 0.
    derp_ok=$((derp_ok + 1))
  else
    err "$name ($url): UNREACHABLE"
    derp_fail=$((derp_fail + 1))
  fi
done
echo

# 2. Run netcheck from local machine
#
# Output (stdout and stderr) is captured in full; `|| true` keeps a failing
# netcheck from producing a non-zero assignment. Only the "Nearest DERP" /
# "DERP latency" lines and the five lines following each are shown, capped at
# ten lines in total.
echo "--- Local Tailscale Netcheck ---"
netcheck=$(tailscale netcheck 2>&1 || true)
printf '%s\n' "$netcheck" \
  | grep -E -A5 'Nearest DERP|DERP latency' \
  | head -10
echo

# 3. Check peer connection types
#
# Each `tailscale status` line is split into the first field (ip), the second
# field (name) and everything else (rest). A peer whose remainder contains
# `relay "<region>"` is reported as a warning; otherwise one containing
# `direct <endpoint>` is reported as OK. Lines matching neither are skipped.
echo "--- Peer Connection Types ---"
# Patterns live in variables so the quotes and brackets reach =~ unmodified.
relay_re='relay "[^"]+"'
# The endpoint runs up to the next comma or space, which covers both
# IPv4 (1.2.3.4:41641) and bracketed IPv6 ([2001:db8::1]:41641) forms.
direct_re='direct [^, ]+'
while read -r ip name rest; do
  if [[ "$rest" =~ $relay_re ]]; then
    warn "$name ($ip): ${BASH_REMATCH[0]}"
  elif [[ "$rest" =~ $direct_re ]]; then
    log "$name ($ip): ${BASH_REMATCH[0]}"
  fi
done < <(tailscale status 2>/dev/null)
echo

# 4. Check Kuma for failing monitors (if reachable)
#
# Counts 'WARN.*Failing' lines in the last 5 minutes of the uptime-kuma
# container log on $KUMA_HOST.
echo "--- Kuma Monitor Status ---"
# grep -c always prints the count, and exits 1 when that count is 0. The
# remote `|| true` only neutralises that exit status; it must not print
# anything, otherwise a zero count would arrive as two lines ("0" and "0").
kuma_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
  "docker logs uptime-kuma --since=5m 2>&1 | grep -c 'WARN.*Failing' || true" 2>/dev/null)
# Anything that is not a bare non-negative integer (empty output from a failed
# ssh, unexpected text) is treated as "unknown".
[[ "$kuma_fails" =~ ^[0-9]+$ ]] || kuma_fails="?"
if [[ "$kuma_fails" == "?" ]]; then
  warn "Could not reach Kuma on $KUMA_HOST"
elif (( kuma_fails > 5 )); then
  err "Kuma has $kuma_fails failing monitors in last 5 minutes"
else
  log "Kuma: $kuma_fails failures in last 5 minutes"
fi
echo

# 5. Check headscale container health
#
# Lists running containers on calypso and keeps the line(s) mentioning
# "headscale". The ssh exit status is the remote pipeline's status (grep's),
# which lets "host unreachable" be told apart from "no matching container".
echo "--- Headscale Status ---"
hs_status=$(ssh -o ConnectTimeout=5 calypso \
  "sudo /usr/local/bin/docker ps --format '{{.Names}} {{.Status}}' | grep headscale" 2>/dev/null)
hs_rc=$?
if (( hs_rc == 0 )) && [[ -n "$hs_status" ]]; then
  : # container line captured; print it as-is below
elif (( hs_rc == 1 )); then
  # grep ran on the remote side but matched nothing.
  hs_status="NOT RUNNING (no container matching 'headscale' in docker ps)"
else
  # 255 is ssh's own failure code; any other status is also treated as a
  # failure to query the host.
  hs_status="UNREACHABLE"
fi
echo " $hs_status"
echo

# 6. Fix if needed
#
# A fix is attempted when at least one DERP endpoint was unreachable or when
# Kuma reported more than 5 failures. With --check the script stops here and
# exits 1 instead of restarting anything.

# Validate before comparing numerically: the count may be "?" (unknown) or
# otherwise non-numeric, which must not be fed to an arithmetic test.
kuma_unhealthy=false
if [[ "$kuma_fails" =~ ^[0-9]+$ ]] && (( kuma_fails > 5 )); then
  kuma_unhealthy=true
fi

if (( derp_fail > 0 )) || $kuma_unhealthy; then
  echo "=== Issues Detected ==="
  if $CHECK_ONLY; then
    warn "Run without --check to apply fixes"
    exit 1
  fi

  echo "Restarting headscale (embedded DERP relay)..."
  # Only claim success if the restart command actually succeeded; otherwise
  # the waits and re-checks below would be measuring nothing.
  if ! ssh -o ConnectTimeout=5 calypso "sudo /usr/local/bin/docker restart headscale" 2>/dev/null; then
    err "Could not restart headscale on calypso — may need manual investigation"
    exit 1
  fi
  log "Headscale restarted"

  echo "Waiting 15s for DERP to come back..."
  sleep 15

  # Re-check
  echo
  echo "--- Post-fix Netcheck ---"
  tailscale netcheck 2>&1 | grep -E -A5 'DERP latency' | head -8

  echo
  echo "--- Post-fix Kuma (waiting 60s for monitor cycle) ---"
  sleep 60
  # Same counting rule as the pre-fix check, over the last minute only. The
  # remote `|| true` hides grep's exit 1 on a zero count without printing a
  # second "0".
  post_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
    "docker logs uptime-kuma --since=1m 2>&1 | grep -c 'WARN.*Failing' || true" 2>/dev/null)
  [[ "$post_fails" =~ ^[0-9]+$ ]] || post_fails="?"
  if [[ "$post_fails" == "?" ]]; then
    warn "Could not check Kuma"
  elif (( post_fails > 3 )); then
    err "Still $post_fails failures — may need manual investigation"
    exit 1
  else
    log "Kuma: $post_fails failures — looks healthy"
  fi
else
  log "No issues detected — all DERPs reachable, Kuma healthy"
fi