#!/bin/bash
# Portainer watchdog — recovers from chisel panic crashes that leave
# orphaned docker-proxy processes blocking port re-allocation.
#
# Deploy to atlantis: /usr/local/bin/watchdog-portainer.sh
# Cron (every 5 min): */5 * * * * /usr/local/bin/watchdog-portainer.sh
#
# Flow:
#   1. If the container is already running, do nothing.
#   2. Otherwise try `docker start`.
#   3. If that fails with "port is already allocated", kill docker-proxy
#      processes listening on the known ports and retry once.
#   4. If it is still down, log the error and send an urgent notification.
#
# Exit status: 0 if Portainer is running or was recovered, 1 otherwise.
#
# NOTE: `set -o pipefail` is deliberately not enabled. The running-check
# pipes `docker ps` into `grep -q`, and only grep's status should decide
# the outcome.

readonly DOCKER=/usr/local/bin/docker
readonly CONTAINER=portainer
readonly -a PORTS=(8000 9443 10000)
readonly NTFY_URL="http://localhost:48978/watchdog"
readonly LOG_TAG="watchdog-portainer"
# Upper bound (seconds) for one notification request, so an unresponsive
# notification endpoint cannot stall the cron job.
readonly NOTIFY_TIMEOUT=10

# Combined stdout/stderr of the most recent `docker start` attempt.
# Written by try_start, read by main.
start_output=""

# Write a message to syslog under $LOG_TAG.
# Arguments: $* - message text
log() { logger -t "$LOG_TAG" "$*"; }

# POST a notification to $NTFY_URL. Best effort: a delivery failure never
# changes the outcome of the watchdog run.
# Arguments: $1 - title
#            $2 - message body
#            $3 - priority (optional, defaults to "default")
notify() {
  local title="$1" msg="$2" priority="${3:-default}"
  curl -s -o /dev/null --max-time "$NOTIFY_TIMEOUT" \
    -H "Title: $title" \
    -H "Priority: $priority" \
    -d "$msg" \
    "$NTFY_URL" || true
}

# Returns 0 when `docker ps` lists a running container named exactly
# $CONTAINER, non-zero otherwise.
container_running() {
  sudo "$DOCKER" ps --filter "name=^/${CONTAINER}$" --format '{{.Names}}' \
    | grep -q "^${CONTAINER}$"
}

# Run `docker start` on $CONTAINER, storing its output in start_output.
# Returns: the exit status of `docker start`.
try_start() {
  start_output=$(sudo "$DOCKER" start "$CONTAINER" 2>&1)
}

# Send SIGTERM to every docker-proxy process that netstat reports as
# listening on one of $PORTS.
# Returns: 0 if at least one process was signalled successfully, 1 otherwise.
kill_orphaned_proxies() {
  local port pid pids
  local status=1

  for port in "${PORTS[@]}"; do
    # Find docker-proxy PIDs holding these specific TCP ports
    pids=$(sudo netstat -tulpn 2>/dev/null \
      | awk -v p="$port" \
          '$4 ~ ":"p"$" && $7 ~ /docker-proxy/ {split($7,a,"/"); print a[1]}')

    # Intentional word splitting: $pids is a whitespace-separated PID list.
    # shellcheck disable=SC2086
    for pid in $pids; do
      log "Killing orphaned docker-proxy PID $pid (port $port)"
      if sudo kill "$pid"; then
        status=0
      fi
    done
  done

  return "$status"
}

# Entry point: check, recover, escalate.
main() {
  # Is portainer already running?
  if container_running; then
    return 0
  fi

  # Not running — try a plain start first (assumes the container exists).
  log "Portainer not running — attempting start"
  if try_start; then
    log "Portainer started successfully"
    notify "Portainer recovered" "Started successfully on atlantis" "default"
    return 0
  fi

  # Start failed — check if it's a port conflict from orphaned docker-proxy
  # processes, and retry once if any of them could be killed.
  if [[ "$start_output" == *"port is already allocated"* ]]; then
    log "Port conflict detected — cleaning up orphaned docker-proxy processes"
    if kill_orphaned_proxies; then
      # Give the killed proxies a moment to release their sockets.
      sleep 2
      if try_start; then
        log "Portainer started after port cleanup"
        notify "Portainer recovered" "Cleared orphaned docker-proxy processes and started successfully on atlantis" "default"
        return 0
      fi
    fi
  fi

  # Still failed — escalate
  log "ERROR: Could not recover Portainer: $start_output"
  # $'\n\n' yields real newlines; a "\n" inside double quotes would be sent
  # to the notification service as a literal backslash followed by 'n'.
  notify "Portainer recovery FAILED" \
    "Could not start on atlantis — manual intervention needed."$'\n\n'"$start_output" \
    "urgent"
  return 1
}

main "$@"