Files
homelab-optimized/scripts/fix-derp-connectivity.sh
Gitea Mirror Bot 5735cfcb2c
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m0s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-08 00:57:50 UTC
2026-04-08 00:57:50 +00:00

130 lines
4.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# fix-derp-connectivity.sh — Diagnose and fix Tailscale DERP relay issues
#
# When Kuma monitors start failing across an entire host group (especially
# Calypso), it's usually because the DERP relay on headscale (Calypso) has
# become stuck. Restarting headscale forces all peers to re-negotiate paths.
#
# Usage:
# ./fix-derp-connectivity.sh # diagnose + fix
# ./fix-derp-connectivity.sh --check # diagnose only, no restart
#
# Runs from: homelab-vm (where Claude Code runs)
# -u: expanding an unset variable is an error; pipefail: a pipeline fails if
# any stage fails. -e is not enabled, so a failed probe does not abort the
# remaining diagnostic sections.
set -uo pipefail
CHECK_ONLY=false

# Print the supported invocations to stdout.
usage() {
  printf 'Usage: %s [--check]\n' "${0##*/}"
  printf '  (no option)  diagnose + fix\n'
  printf '  --check      diagnose only, no restart\n'
}

# Parse command-line options into the global CHECK_ONLY ("true"/"false").
# Arguments: the script's positional parameters.
# Exits 0 after -h/--help and 2 on any unrecognised argument, so a mistyped
# --check can never fall through to the restart path.
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --check)
        CHECK_ONLY=true
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        printf 'Unknown argument: %s\n' "$arg" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# ANSI colour escapes for the status tags. They are stored as literal
# backslash sequences and expanded by printf's %b when a line is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Status printers: each writes one line to stdout, "<coloured tag> <message>".
# Only the colour variables go through %b; the message is printed with %s so
# backslashes in hostnames, URLs or captured command output are shown verbatim
# instead of being interpreted as escape sequences.
# Arguments: message words, joined by the first character of IFS.
log() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
err() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }
# DERP relays to probe. The two arrays are parallel: entry i of DERP_NAMES is
# the display label for entry i of DERP_URLS.
DERP_NAMES=(
  home-cal
  home-atl
  sea
)
DERP_URLS=(
  'https://headscale.vish.gg:8443'
  'https://derp-atl.vish.gg:8445'
  'https://derp-sea.vish.gg:8444'
)
# SSH destination used when reading the uptime-kuma container logs.
KUMA_HOST='pi-5'

# Report banner with an ISO-8601 timestamp, followed by a blank line.
printf '%s\n' '=== DERP Connectivity Check ==='
printf 'Time: %s\n\n' "$(date -Iseconds)"
# 1. Check DERP server reachability
# Each URL gets one HTTPS request. Without -f, curl exits 0 on any HTTP
# response, so this verifies only that the endpoint accepts a connection and
# answers at all. --connect-timeout limits the connection phase; --max-time
# limits the whole transfer, so a relay that accepts the connection but never
# replies is reported as unreachable instead of hanging the script.
echo "--- DERP Server Reachability ---"
derp_ok=0
derp_fail=0
for i in "${!DERP_NAMES[@]}"; do
  name="${DERP_NAMES[$i]}"
  url="${DERP_URLS[$i]}"
  # -k skips certificate verification; -o /dev/null discards the body.
  if curl -sk --connect-timeout 5 --max-time 10 -o /dev/null "$url" 2>/dev/null; then
    log "$name ($url): reachable"
    # Plain assignment rather than ((derp_ok++)): the post-increment form
    # returns a non-zero status when the old value is 0.
    derp_ok=$((derp_ok + 1))
  else
    err "$name ($url): UNREACHABLE"
    derp_fail=$((derp_fail + 1))
  fi
done
echo
# 2. Run netcheck from local machine
# netcheck's combined stdout/stderr is captured even when the command fails.
# Only the DERP summary lines (each with five lines of trailing context) are
# shown, capped at ten lines in total.
printf '%s\n' '--- Local Tailscale Netcheck ---'
netcheck=$(tailscale netcheck 2>&1 || true)
grep -E -A5 'Nearest DERP|DERP latency' <<<"$netcheck" | head -10
printf '\n'
# 3. Check peer connection types
echo "--- Peer Connection Types ---"

# Extract how a peer is connected from the trailing columns of one
# `tailscale status` row.
# Arguments: $1 - the row text after the IP and name columns
# Outputs:   `relay "<name>"`, or `direct <endpoint>`; a bare `relay` or
#            `direct` when the word is present without a recognisable value;
#            nothing when neither word appears
# Returns:   0 when something was printed, 1 otherwise
# A row mentioning "relay" is always classified as relayed, even if it also
# contains "direct".
peer_path() {
  local row=$1
  local relay_re='relay "[^"]+"'
  # IPv4 host:port, or a bracketed IPv6 literal followed by :port.
  local direct_re='direct (\[[^]]+\]|[0-9.]+):[0-9]+'
  if [[ $row =~ $relay_re ]]; then
    printf '%s\n' "${BASH_REMATCH[0]}"
  elif [[ $row == *relay* ]]; then
    printf 'relay\n'
  elif [[ $row =~ $direct_re ]]; then
    printf '%s\n' "${BASH_REMATCH[0]}"
  elif [[ $row == *direct* ]]; then
    printf 'direct\n'
  else
    return 1
  fi
}

# Relayed peers are warnings, direct peers are OK; rows that mention neither
# are skipped. The loop runs in a pipeline subshell, which is fine because it
# only prints.
tailscale status 2>/dev/null | while read -r ip name rest; do
  conn=$(peer_path "$rest") || continue
  case "$conn" in
    relay*) warn "$name ($ip): $conn" ;;
    *) log "$name ($ip): $conn" ;;
  esac
done
echo
# 4. Check Kuma for failing monitors (if reachable)
echo "--- Kuma Monitor Status ---"
# Count 'WARN ... Failing' lines in the last five minutes of the uptime-kuma
# container log. `grep -c` prints 0 by itself when nothing matches, so no
# remote fallback is needed: appending `|| echo 0` would print a second 0 and
# turn the captured value into "0<newline>0", which is not a valid integer
# for the comparison below.
kuma_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
  "docker logs uptime-kuma --since=5m 2>&1 | grep -c 'WARN.*Failing'" 2>/dev/null)
# Empty output (SSH failed) or anything that is not a plain non-negative
# integer is reported as unknown ("?").
[[ "$kuma_fails" =~ ^[0-9]+$ ]] || kuma_fails="?"
if [[ "$kuma_fails" == "?" ]]; then
  warn "Could not reach Kuma on $KUMA_HOST"
elif [[ "$kuma_fails" -gt 5 ]]; then
  err "Kuma has $kuma_fails failing monitors in last 5 minutes"
else
  log "Kuma: $kuma_fails failures in last 5 minutes"
fi
echo
# 5. Check headscale container health
echo "--- Headscale Status ---"
# `docker ps` (without -a) lists running containers only; grep keeps the
# headscale row. The SSH exit status separates the failure modes: ssh itself
# exits 255 when it cannot connect, while any other non-zero status is the
# remote pipeline's, i.e. grep did not find a headscale row.
hs_status=$(ssh -o ConnectTimeout=5 calypso \
  "sudo /usr/local/bin/docker ps --format '{{.Names}} {{.Status}}' | grep headscale" 2>/dev/null)
hs_rc=$?
if [[ "$hs_rc" -eq 255 ]]; then
  hs_status="UNREACHABLE"
elif [[ "$hs_rc" -ne 0 || -z "$hs_status" ]]; then
  hs_status="NOT FOUND (docker ps listed no running headscale container)"
fi
echo " $hs_status"
echo
# 6. Fix if needed
# A restart is warranted when any DERP endpoint was unreachable or Kuma logged
# more than five failures. The Kuma value is normalised first: only its first
# line is used, and anything that is not a plain non-negative integer counts
# as unknown ("?") rather than being fed to an arithmetic comparison.
kuma_count="${kuma_fails:-?}"
kuma_count="${kuma_count%%$'\n'*}"
[[ "$kuma_count" =~ ^[0-9]+$ ]] || kuma_count="?"

if [[ "${derp_fail:-0}" -gt 0 ]] || [[ "$kuma_count" != "?" && "$kuma_count" -gt 5 ]]; then
  echo "=== Issues Detected ==="
  if [[ "$CHECK_ONLY" == true ]]; then
    warn "Run without --check to apply fixes"
    exit 1
  fi
  echo "Restarting headscale (embedded DERP relay)..."
  # Only claim success when the remote restart really succeeded; stderr is
  # left visible so the reason for a failure is shown. There is no point in
  # waiting ~75s for a recovery after a restart that never happened.
  if ! ssh -o ConnectTimeout=5 calypso "sudo /usr/local/bin/docker restart headscale"; then
    err "Could not restart headscale on calypso"
    exit 1
  fi
  log "Headscale restarted"
  echo "Waiting 15s for DERP to come back..."
  sleep 15
  # Re-check
  echo
  echo "--- Post-fix Netcheck ---"
  tailscale netcheck 2>&1 | grep -E 'DERP latency' -A5 | head -8
  echo
  echo "--- Post-fix Kuma (waiting 60s for monitor cycle) ---"
  sleep 60
  # `grep -c` prints 0 on its own when nothing matches, so no `|| echo 0`
  # fallback is used (it would yield "0<newline>0").
  post_fails=$(ssh -o ConnectTimeout=5 "$KUMA_HOST" \
    "docker logs uptime-kuma --since=1m 2>&1 | grep -c 'WARN.*Failing'" 2>/dev/null)
  [[ "$post_fails" =~ ^[0-9]+$ ]] || post_fails="?"
  if [[ "$post_fails" == "?" ]]; then
    warn "Could not check Kuma"
  elif [[ "$post_fails" -gt 3 ]]; then
    err "Still $post_fails failures — may need manual investigation"
    exit 1
  else
    log "Kuma: $post_fails failures — looks healthy"
  fi
elif [[ "$kuma_count" == "?" ]]; then
  # Kuma could not be queried, so do not report it as healthy.
  log "No DERP issues detected — Kuma status unknown"
else
  log "No issues detected — all DERPs reachable, Kuma healthy"
fi