Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC
This commit is contained in:
129
scripts/fix-derp-connectivity.sh
Executable file
129
scripts/fix-derp-connectivity.sh
Executable file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env bash
|
||||
# fix-derp-connectivity.sh — Diagnose and fix Tailscale DERP relay issues
|
||||
#
|
||||
# When Kuma monitors start failing across an entire host group (especially
|
||||
# Calypso), it's usually because the DERP relay on headscale (Calypso) has
|
||||
# become stuck. Restarting headscale forces all peers to re-negotiate paths.
|
||||
#
|
||||
# Usage:
|
||||
# ./fix-derp-connectivity.sh # diagnose + fix
|
||||
# ./fix-derp-connectivity.sh --check # diagnose only, no restart
|
||||
#
|
||||
# Runs from: homelab-vm (where Claude Code runs)
|
||||
|
||||
# -u: referencing an unset variable is an error.
# pipefail: a pipeline fails when any of its stages fails.
# -e is not enabled; the checks in this script inspect command failures
# themselves instead of aborting on them.
set -uo pipefail

CHECK_ONLY=false

#######################################
# Parse the command line.
# Globals:   CHECK_ONLY (written)
# Arguments: $1 - optional mode flag
#              (none)       diagnose and, if needed, apply the fix
#              --check      diagnose only
#              -h | --help  print usage and exit 0
# Returns:   exits with status 2 on any other argument, so that a mistyped
#            flag can never fall through to the restart path.
#######################################
parse_args() {
  case "${1:-}" in
    '') ;;
    --check)
      CHECK_ONLY=true
      ;;
    -h | --help)
      printf 'Usage: %s [--check]\n' "${0##*/}"
      exit 0
      ;;
    *)
      printf 'Unknown argument: %s\nUsage: %s [--check]\n' "$1" "${0##*/}" >&2
      exit 2
      ;;
  esac
}
parse_args "$@"
|
||||
|
||||
# ANSI colour sequences used by the status helpers, stored as real ESC bytes
# so they can be printed with a plain %s.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
NC=$'\033[0m' # reset attributes

# Status helpers: print "<coloured tag> <message>" on stdout.
#   log  -> [OK]     warn -> [WARN]     err -> [FAIL]
# The message goes through printf '%s', so backslashes or a leading "-n" in
# it (parts of it come from tool output) are printed verbatim instead of
# being interpreted as they would be by `echo -e`.
log() { printf '%s[OK]%s %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%s[WARN]%s %s\n' "$YELLOW" "$NC" "$*"; }
err() { printf '%s[FAIL]%s %s\n' "$RED" "$NC" "$*"; }
|
||||
|
||||
# DERP relays to probe. The two arrays are parallel: entry i of DERP_NAMES
# is the display label for entry i of DERP_URLS.
DERP_NAMES=(
  "home-cal"
  "home-atl"
  "sea"
)
DERP_URLS=(
  "https://headscale.vish.gg:8443"
  "https://derp-atl.vish.gg:8445"
  "https://derp-sea.vish.gg:8444"
)

# SSH host whose uptime-kuma container logs are inspected.
KUMA_HOST="pi-5"
|
||||
|
||||
# Report header: title, ISO-8601 timestamp of this run, then a blank line.
printf '%s\n' \
  "=== DERP Connectivity Check ===" \
  "Time: $(date -Iseconds)" \
  ""
|
||||
|
||||
# 1. Check DERP server reachability
# Sets derp_ok / derp_fail to the number of relays that did / did not answer.
echo "--- DERP Server Reachability ---"
derp_ok=0
derp_fail=0
for i in "${!DERP_NAMES[@]}"; do
  name="${DERP_NAMES[$i]}"
  url="${DERP_URLS[$i]}"
  # -s: no progress output; -k: do not verify the TLS certificate.
  # --connect-timeout only bounds the connection phase; --max-time bounds
  # the whole request, so a relay that accepts the connection and then never
  # answers cannot hang the script.
  if curl -sk --connect-timeout 5 --max-time 10 -o /dev/null "$url" 2>/dev/null; then
    log "$name ($url): reachable"
    # Plain assignment instead of ((derp_ok++)): the latter returns status 1
    # whenever the old value is 0.
    derp_ok=$((derp_ok + 1))
  else
    err "$name ($url): UNREACHABLE"
    derp_fail=$((derp_fail + 1))
  fi
done
echo
|
||||
|
||||
# 2. Run netcheck from local machine
echo "--- Local Tailscale Netcheck ---"
# Capture stdout and stderr together; a failing netcheck must not produce a
# non-zero status here.
netcheck=$(tailscale netcheck 2>&1 || true)
# Show the nearest-DERP / DERP-latency lines with up to five lines of
# trailing context each, capped at ten lines overall.
grep -E -A5 'Nearest DERP|DERP latency' <<<"$netcheck" | head -10
echo
|
||||
|
||||
# 3. Check peer connection types
echo "--- Peer Connection Types ---"

#######################################
# Report how a single peer is currently reached.
# Arguments: $1 - peer IP (first column of `tailscale status`)
#            $2 - peer name (second column)
#            $3 - remainder of the status line
# Outputs:   a [WARN] line for a relayed peer, an [OK] line for a direct
#            peer, nothing when the line mentions neither.
#######################################
report_peer() {
  local ip=$1 name=$2 rest=$3
  local detail=""
  local relay_re='relay "[^"]+"'
  # Everything up to the next comma or space, so that bracketed IPv6
  # endpoints such as [2001:db8::1]:41641 are captured as well as IPv4 ones.
  # NOTE(review): assumes the endpoint is followed by "," or end of line in
  # `tailscale status` output — confirm against the installed version.
  local direct_re='direct [^, ]+'

  if [[ "$rest" == *relay* ]]; then
    [[ "$rest" =~ $relay_re ]] && detail=${BASH_REMATCH[0]}
    warn "$name ($ip): $detail"
  elif [[ "$rest" == *direct* ]]; then
    [[ "$rest" =~ $direct_re ]] && detail=${BASH_REMATCH[0]}
    log "$name ($ip): $detail"
  fi
}

tailscale status 2>/dev/null | while read -r ip name rest; do
  report_peer "$ip" "$name" "$rest"
done
echo
|
||||
|
||||
# 4. Check Kuma for failing monitors (if reachable)
# Sets kuma_fails to the number of 'WARN.*Failing' lines in the last five
# minutes of uptime-kuma logs, or to "?" when no usable count was obtained.
echo "--- Kuma Monitor Status ---"

# Print $1 when it is a plain non-negative integer, otherwise "?".
as_count() {
  if [[ "${1:-}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$1"
  else
    printf '?\n'
  fi
}

# grep -c prints the count itself and exits 1 when that count is 0, so the
# fallback must not print anything: `|| echo 0` would produce "0<newline>0",
# which is not a valid operand for the -gt tests. `|| true` only normalises
# the remote exit status.
kuma_raw=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
  "docker logs uptime-kuma --since=5m 2>&1 | grep -c 'WARN.*Failing' || true" 2>/dev/null)
# Empty output (ssh failed) or anything else non-numeric becomes "?".
kuma_fails=$(as_count "$kuma_raw")

if [[ "$kuma_fails" == "?" ]]; then
  warn "Could not reach Kuma on $KUMA_HOST"
elif [[ "$kuma_fails" -gt 5 ]]; then
  err "Kuma has $kuma_fails failing monitors in last 5 minutes"
else
  log "Kuma: $kuma_fails failures in last 5 minutes"
fi
echo
|
||||
|
||||
# 5. Check headscale container health
echo "--- Headscale Status ---"
hs_status=$(ssh -o ConnectTimeout=5 calypso \
  "sudo /usr/local/bin/docker ps --format '{{.Names}} {{.Status}}' | grep headscale" 2>/dev/null)
hs_rc=$?
# The ssh exit status is the remote pipeline's status (grep's) unless ssh
# itself failed. Status 1 is grep's "no line matched": the host answered
# but no headscale line was produced. Any other failure (ssh reports its
# own errors as 255) keeps the UNREACHABLE label.
case "$hs_rc" in
  0) ;;
  1) hs_status="NOT LISTED (docker ps shows no headscale container)" ;;
  *) hs_status="UNREACHABLE" ;;
esac
echo "  $hs_status"
echo
|
||||
|
||||
# 6. Fix if needed
# Triggered when any DERP probe failed or Kuma logged more than five
# failures. With --check the script only reports and exits 1.
#
# A count that is not a plain integer ("?", empty, multi-line output) is
# treated as unknown, so the arithmetic tests below never see a bad operand.
[[ "${kuma_fails:-}" =~ ^[0-9]+$ ]] || kuma_fails="?"

if [[ "$derp_fail" -gt 0 ]] || [[ "$kuma_fails" != "?" && "$kuma_fails" -gt 5 ]]; then
  echo "=== Issues Detected ==="
  if [[ "$CHECK_ONLY" == true ]]; then
    warn "Run without --check to apply fixes"
    exit 1
  fi

  echo "Restarting headscale (embedded DERP relay)..."
  # Only claim success when the remote restart actually succeeded; otherwise
  # stop here instead of waiting 75s and re-checking an unchanged system.
  if ! ssh -o ConnectTimeout=5 calypso "sudo /usr/local/bin/docker restart headscale" 2>/dev/null; then
    err "Could not restart headscale on calypso"
    exit 1
  fi
  log "Headscale restarted"

  echo "Waiting 15s for DERP to come back..."
  sleep 15

  # Re-check
  echo
  echo "--- Post-fix Netcheck ---"
  tailscale netcheck 2>&1 | grep -E 'DERP latency' -A5 | head -8

  echo
  echo "--- Post-fix Kuma (waiting 60s for monitor cycle) ---"
  sleep 60
  # grep -c already prints 0 (and exits 1) when nothing matches, so the
  # fallback is `|| true`; `|| echo 0` would print a second 0.
  post_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
    "docker logs uptime-kuma --since=1m 2>&1 | grep -c 'WARN.*Failing' || true" 2>/dev/null)
  # Empty output (ssh failed) or anything non-numeric counts as unknown.
  [[ "$post_fails" =~ ^[0-9]+$ ]] || post_fails="?"
  if [[ "$post_fails" == "?" ]]; then
    warn "Could not check Kuma"
  elif [[ "$post_fails" -gt 3 ]]; then
    err "Still $post_fails failures — may need manual investigation"
    exit 1
  else
    log "Kuma: $post_fails failures — looks healthy"
  fi
else
  log "No issues detected — all DERPs reachable, Kuma healthy"
fi
|
||||
Reference in New Issue
Block a user