---
# Deep Service Health Check Playbook
# Comprehensive health monitoring for all homelab services
#
# Usage: ansible-playbook playbooks/service_health_deep.yml
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "include_performance=true"
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "alert_on_issues=true"
#
# Optional extra vars (every one has a default):
#   host_target           inventory pattern to run against (default: all)
#   include_performance   run the system performance task (default: true)
#   alert_on_issues       print the alert message when issues are found (default: false)
#   health_check_timeout  curl --max-time for endpoint probes, in seconds (default: 30)
#
# Notes for maintainers:
#   - The shell snippets are rendered by Jinja2 before bash sees them. Docker's
#     Go-template placeholders and bash array-length expansions collide with
#     Jinja2 delimiters, so they are wrapped in raw/endraw blocks.
#   - The snippets use bash arrays, so every shell task pins /bin/bash instead
#     of relying on the module default of /bin/sh.
#   - Reports are written on the control node (delegate_to: localhost).

- name: Deep Service Health Check
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true

  vars:
    # The public extra vars are read into differently named play vars. A play
    # var that templates itself (x: "{{ x | default(...) }}") fails with a
    # recursive-template error whenever x is not supplied on the command line.
    perf_checks_enabled: "{{ include_performance | default(true) | bool }}"
    alerting_enabled: "{{ alert_on_issues | default(false) | bool }}"
    endpoint_timeout: "{{ health_check_timeout | default(30) | int }}"
    report_dir: "/tmp/health_reports"

    # Service health check configurations, keyed by inventory hostname.
    # Hosts without an entry get an empty check list.
    service_health_checks:
      atlantis:
        - name: "plex"
          container: "plex"
          health_url: "http://localhost:32400/web"
          expected_status: 200
          critical: true
        - name: "immich-server"
          container: "immich-server"
          health_url: "http://localhost:2283/api/server-info/ping"
          expected_status: 200
          critical: true
        - name: "vaultwarden"
          container: "vaultwarden"
          health_url: "http://localhost:80/alive"
          expected_status: 200
          critical: true
        - name: "sonarr"
          container: "sonarr"
          health_url: "http://localhost:8989/api/v3/system/status"
          expected_status: 200
          critical: false
        - name: "radarr"
          container: "radarr"
          health_url: "http://localhost:7878/api/v3/system/status"
          expected_status: 200
          critical: false
      calypso:
        - name: "authentik-server"
          container: "authentik-server"
          health_url: "http://localhost:9000/-/health/live/"
          expected_status: 200
          critical: true
        - name: "paperless-webserver"
          container: "paperless-webserver"
          health_url: "http://localhost:8000"
          expected_status: 200
          critical: false
      homelab_vm:
        - name: "grafana"
          container: "grafana"
          health_url: "http://localhost:3000/api/health"
          expected_status: 200
          critical: true
        - name: "prometheus"
          container: "prometheus"
          health_url: "http://localhost:9090/-/healthy"
          expected_status: 200
          critical: true

  tasks:
    - name: Create health report directory
      file:
        path: "{{ report_dir }}/{{ ansible_date_time.date }}"
        state: directory
        mode: "0755"
      delegate_to: localhost

    - name: Get current service health checks for this host
      set_fact:
        current_health_checks: "{{ service_health_checks[inventory_hostname] | default([]) }}"

    - name: Display health check plan
      debug:
        msg: |
          🏥 DEEP HEALTH CHECK PLAN
          =========================
          🖥️ Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔍 Services to check: {{ current_health_checks | length }}
          📊 Include Performance: {{ perf_checks_enabled }}
          🚨 Alert on Issues: {{ alerting_enabled }}
          ⏱️ Timeout: {{ endpoint_timeout }}s

          📋 Services:
          {% for service in current_health_checks %}
          - {{ service.name }} ({{ 'Critical' if service.critical else 'Non-critical' }})
          {% endfor %}

    # Disk usage is only queried once the API has answered, so an unresponsive
    # daemon cannot stall the task on "docker system df".
    - name: Check Docker daemon health
      shell: |
        echo "=== DOCKER DAEMON HEALTH ==="

        # Check Docker daemon status
        if systemctl is-active --quiet docker; then
          echo "✅ Docker daemon: Running"

          # Check Docker daemon responsiveness
          if timeout 10 docker version >/dev/null 2>&1; then
            echo "✅ Docker API: Responsive"

            # Check Docker disk usage
            docker_usage=$(docker system df --format {% raw %}"table {{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}"{% endraw %})
            echo "📊 Docker Usage:"
            echo "$docker_usage"
          else
            echo "❌ Docker API: Unresponsive"
          fi
        else
          echo "❌ Docker daemon: Not running"
        fi
      args:
        executable: /bin/bash
      register: docker_health
      changed_when: false

    # Container names are compared as whole lines (grep -Fx) against a single
    # "docker ps" snapshot, so "plex" is not satisfied by "plex-exporter".
    - name: Check container health status
      shell: |
        echo "=== CONTAINER HEALTH STATUS ==="

        health_issues=()
        total_containers=0
        healthy_containers=0

        running_containers=$(docker ps --format {% raw %}'{{.Names}}'{% endraw %} 2>/dev/null)

        {% for service in current_health_checks %}
        echo "🔍 Checking {{ service.name }}..."
        total_containers=$((total_containers + 1))

        # Check if container exists and is running
        if printf '%s\n' "$running_containers" | grep -Fxq -- "{{ service.container }}"; then
          echo " ✅ Container running: {{ service.container }}"

          # Check container health if health check is configured
          health_status=$(docker inspect --format {% raw %}'{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}'{% endraw %} "{{ service.container }}" 2>/dev/null || echo "none")
          if [ -z "$health_status" ]; then
            health_status="none"
          fi

          if [ "$health_status" != "none" ]; then
            if [ "$health_status" = "healthy" ]; then
              echo " ✅ Health check: $health_status"
              healthy_containers=$((healthy_containers + 1))
            else
              echo " ❌ Health check: $health_status"
              health_issues+=("{{ service.name }}:health_check_failed")
            fi
          else
            echo " ℹ️ No health check configured"
            # Assume healthy if no health check
            healthy_containers=$((healthy_containers + 1))
          fi

          # Check container resource usage
          container_stats=$(docker stats "{{ service.container }}" --no-stream --format {% raw %}"CPU: {{.CPUPerc}}, Memory: {{.MemUsage}}"{% endraw %} 2>/dev/null || echo "Stats unavailable")
          echo " 📊 Resources: $container_stats"
        else
          echo " ❌ Container not running: {{ service.container }}"
          health_issues+=("{{ service.name }}:container_down")
        fi
        echo ""
        {% endfor %}

        {% raw %}
        echo "📊 CONTAINER SUMMARY:"
        echo "Total containers checked: $total_containers"
        echo "Healthy containers: $healthy_containers"
        echo "Issues found: ${#health_issues[@]}"

        if [ "${#health_issues[@]}" -gt 0 ]; then
          echo "🚨 ISSUES:"
          for issue in "${health_issues[@]}"; do
            echo " - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: container_health
      changed_when: false

    # Each endpoint is requested once; the status code and total time come from
    # the same response. curl prints "000" itself when the connection fails.
    - name: Test service endpoints
      shell: |
        echo "=== SERVICE ENDPOINT HEALTH ==="

        endpoint_issues=()
        total_endpoints=0
        healthy_endpoints=0

        {% for service in current_health_checks %}
        {% if service.health_url is defined %}
        echo "🌐 Testing {{ service.name }} endpoint..."
        total_endpoints=$((total_endpoints + 1))

        # Test HTTP endpoint
        probe=$(curl -s -o /dev/null -w "%{http_code} %{time_total}" --max-time {{ endpoint_timeout }} "{{ service.health_url }}" 2>/dev/null) || true
        read -r response_code response_time <<< "$probe"
        response_code="${response_code:-000}"
        response_time="${response_time:-timeout}"

        if [ "$response_code" = "{{ service.expected_status }}" ]; then
          echo " ✅ HTTP $response_code (${response_time}s): {{ service.health_url }}"
          healthy_endpoints=$((healthy_endpoints + 1))
        else
          echo " ❌ HTTP $response_code (expected {{ service.expected_status }}): {{ service.health_url }}"
          endpoint_issues+=("{{ service.name }}:http_$response_code")
        fi
        {% endif %}
        {% endfor %}

        {% raw %}
        echo ""
        echo "📊 ENDPOINT SUMMARY:"
        echo "Total endpoints tested: $total_endpoints"
        echo "Healthy endpoints: $healthy_endpoints"
        echo "Issues found: ${#endpoint_issues[@]}"

        if [ "${#endpoint_issues[@]}" -gt 0 ]; then
          echo "🚨 ENDPOINT ISSUES:"
          for issue in "${endpoint_issues[@]}"; do
            echo " - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: endpoint_health
      changed_when: false

    # The memory percentage is computed from unit-less "free" output; the
    # human-readable figures from "free -h" carry suffixes (Mi, Gi) and cannot
    # be divided reliably.
    - name: Check system resources and performance
      shell: |
        echo "=== SYSTEM PERFORMANCE ==="

        # CPU usage
        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
        echo "🖥️ CPU Usage: ${cpu_usage}%"

        # Memory usage
        mem_human=$(free -h | awk 'NR==2{printf "%s/%s", $3, $2}')
        mem_pct=$(free | awk 'NR==2 && $2 > 0 {printf "%.1f", $3 * 100 / $2}')
        echo "💾 Memory: Used: $mem_human (${mem_pct:-n/a}%)"

        # Disk usage for critical paths
        echo "💿 Disk Usage:"
        df -h / | tail -1 | awk '{printf " Root: %s used (%s)\n", $5, $4}'

        {% if inventory_hostname in ['atlantis', 'calypso'] %}
        # Synology specific checks
        if [ -d "/volume1" ]; then
          df -h /volume1 | tail -1 | awk '{printf " Volume1: %s used (%s)\n", $5, $4}'
        fi
        {% endif %}

        # Load average
        load_avg=$(uptime | awk -F'load average:' '{print $2}')
        echo "⚖️ Load Average:$load_avg"

        # Network connectivity
        echo "🌐 Network:"
        if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
          echo " ✅ Internet connectivity"
        else
          echo " ❌ Internet connectivity failed"
        fi

        # Tailscale status
        if command -v tailscale >/dev/null 2>&1; then
          tailscale_status=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null || echo "unknown")
          if [ "$tailscale_status" = "true" ]; then
            echo " ✅ Tailscale connected"
          else
            echo " ❌ Tailscale status: $tailscale_status"
          fi
        fi
      args:
        executable: /bin/bash
      register: system_performance
      when: perf_checks_enabled | bool
      changed_when: false

    # The service -> dependency container names live in Jinja2 maps below. The
    # database probe uses pg_isready, so it assumes each database container is
    # PostgreSQL -- NOTE(review): confirm this holds for every listed service.
    - name: Check critical service dependencies
      shell: |
        echo "=== SERVICE DEPENDENCIES ==="

        dependency_issues=()
        running_containers=$(docker ps --format {% raw %}'{{.Names}}'{% endraw %} 2>/dev/null)

        {% set db_dependencies = {
             'immich-server': 'immich-db',
             'vaultwarden': 'vaultwarden-db',
             'authentik-server': 'authentik-db',
             'paperless-webserver': 'paperless-db'
           } %}
        {% set redis_dependencies = {'immich-server': 'immich-redis'} %}

        # Check database connections for services that need them
        {% for service in current_health_checks if service.name in db_dependencies %}
        echo "🔍 Checking {{ service.name }} database dependency..."
        db_container="{{ db_dependencies[service.name] }}"

        if printf '%s\n' "$running_containers" | grep -Fxq -- "$db_container"; then
          echo " ✅ Database container running: $db_container"

          # Test database connection
          if docker exec "$db_container" pg_isready >/dev/null 2>&1; then
            echo " ✅ Database accepting connections"
          else
            echo " ❌ Database not accepting connections"
            dependency_issues+=("{{ service.name }}:database_connection")
          fi
        else
          echo " ❌ Database container not running: $db_container"
          dependency_issues+=("{{ service.name }}:database_down")
        fi
        {% endfor %}

        # Check Redis dependencies
        {% for service in current_health_checks if service.name in redis_dependencies %}
        echo "🔍 Checking {{ service.name }} Redis dependency..."
        redis_container="{{ redis_dependencies[service.name] }}"

        if printf '%s\n' "$running_containers" | grep -Fxq -- "$redis_container"; then
          echo " ✅ Redis container running: $redis_container"

          # Test Redis connection
          if docker exec "$redis_container" redis-cli ping | grep -q "PONG"; then
            echo " ✅ Redis responding to ping"
          else
            echo " ❌ Redis not responding"
            dependency_issues+=("{{ service.name }}:redis_connection")
          fi
        else
          echo " ❌ Redis container not running: $redis_container"
          dependency_issues+=("{{ service.name }}:redis_down")
        fi
        {% endfor %}

        {% raw %}
        echo ""
        echo "📊 DEPENDENCY SUMMARY:"
        echo "Issues found: ${#dependency_issues[@]}"

        if [ "${#dependency_issues[@]}" -gt 0 ]; then
          echo "🚨 DEPENDENCY ISSUES:"
          for issue in "${dependency_issues[@]}"; do
            echo " - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: dependency_health
      changed_when: false

    # Logs are fetched once per service so the error count, warning count and
    # the quoted sample lines all describe the same one-hour window.
    - name: Analyze service logs for errors
      shell: |
        echo "=== SERVICE LOG ANALYSIS ==="

        log_issues=()
        running_containers=$(docker ps --format {% raw %}'{{.Names}}'{% endraw %} 2>/dev/null)

        {% for service in current_health_checks %}
        echo "📝 Analyzing {{ service.name }} logs..."

        if printf '%s\n' "$running_containers" | grep -Fxq -- "{{ service.container }}"; then
          # Get recent logs and check for errors
          recent_logs=$(docker logs "{{ service.container }}" --since=1h 2>&1)
          error_count=$(printf '%s\n' "$recent_logs" | grep -c -i -E "(error|exception|failed|fatal|panic)")
          warn_count=$(printf '%s\n' "$recent_logs" | grep -c -i -E "(warn|warning)")

          echo " Errors (1h): $error_count"
          echo " Warnings (1h): $warn_count"

          if [ "$error_count" -gt 10 ]; then
            echo " ⚠️ High error count detected"
            log_issues+=("{{ service.name }}:high_error_count:$error_count")
          elif [ "$error_count" -gt 0 ]; then
            echo " ℹ️ Some errors detected"
          else
            echo " ✅ No errors in recent logs"
          fi

          # Show recent critical errors
          if [ "$error_count" -gt 0 ]; then
            echo " Recent errors:"
            printf '%s\n' "$recent_logs" | grep -i -E "(error|exception|failed|fatal|panic)" | tail -3 | sed 's/^/ /'
          fi
        else
          echo " ❌ Container not running"
        fi
        echo ""
        {% endfor %}

        {% raw %}
        echo "📊 LOG ANALYSIS SUMMARY:"
        echo "Issues found: ${#log_issues[@]}"

        if [ "${#log_issues[@]}" -gt 0 ]; then
          echo "🚨 LOG ISSUES:"
          for issue in "${log_issues[@]}"; do
            echo " - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: log_analysis
      changed_when: false

    # A bare substring test for the container name matches the "not running"
    # line as well, so the names are taken from "Container running:" lines only.
    # The overall status also covers dependency failures.
    - name: Summarize check results for reporting
      set_fact:
        health_running_containers: >-
          {{ container_health.stdout_lines
             | select('search', 'Container running: ')
             | map('regex_replace', '^.*Container running: ', '')
             | list }}
        health_overall_status: >-
          {{ 'HEALTHY'
             if ('Issues found: 0' in container_health.stdout
                 and 'Issues found: 0' in endpoint_health.stdout
                 and 'Issues found: 0' in dependency_health.stdout)
             else 'ISSUES_DETECTED' }}

    - name: Generate comprehensive health report
      copy:
        content: |
          🏥 DEEP SERVICE HEALTH REPORT - {{ inventory_hostname }}
          =====================================================

          📅 Health Check Date: {{ ansible_date_time.iso8601 }}
          🖥️ Host: {{ inventory_hostname }}
          📊 Services Checked: {{ current_health_checks | length }}
          ⏱️ Check Timeout: {{ endpoint_timeout }}s

          🐳 DOCKER DAEMON HEALTH:
          {{ docker_health.stdout }}

          📦 CONTAINER HEALTH:
          {{ container_health.stdout }}

          🌐 ENDPOINT HEALTH:
          {{ endpoint_health.stdout }}

          {% if perf_checks_enabled | bool %}
          📊 SYSTEM PERFORMANCE:
          {{ system_performance.stdout | default('') }}

          {% endif %}
          🔗 SERVICE DEPENDENCIES:
          {{ dependency_health.stdout }}

          📝 LOG ANALYSIS:
          {{ log_analysis.stdout }}

          🎯 CRITICAL SERVICES STATUS:
          {% for service in current_health_checks %}
          {% if service.critical %}
          - {{ service.name }}: {% if service.container in health_running_containers %}✅ Running{% else %}❌ Issues{% endif %}

          {% endif %}
          {% endfor %}

          💡 RECOMMENDATIONS:
          {% if 'Issues found: 0' not in container_health.stdout %}
          - 🚨 Address container issues immediately
          {% endif %}
          {% if 'Issues found: 0' not in endpoint_health.stdout %}
          - 🌐 Check service endpoint connectivity
          {% endif %}
          {% if 'Issues found: 0' not in dependency_health.stdout %}
          - 🔗 Resolve service dependency issues
          {% endif %}
          - 📊 Monitor resource usage trends
          - 🔄 Schedule regular health checks
          - 📝 Set up log monitoring alerts

          ✅ HEALTH CHECK COMPLETE
        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt"
      delegate_to: localhost

    - name: Create health status JSON for automation
      copy:
        content: |
          {
            "timestamp": "{{ ansible_date_time.iso8601 }}",
            "hostname": "{{ inventory_hostname }}",
            "health_check_summary": {
              "total_services": {{ current_health_checks | length }},
              "critical_services": {{ current_health_checks | selectattr('critical') | list | length }},
              "docker_healthy": {{ 'true' if 'Docker daemon: Running' in docker_health.stdout else 'false' }},
              "overall_status": "{{ health_overall_status }}"
            },
            "services": [
          {% for service in current_health_checks %}
              {
                "name": {{ service.name | to_json }},
                "container": {{ service.container | to_json }},
                "critical": {{ service.critical | bool | to_json }},
                "status": "{{ 'running' if service.container in health_running_containers else 'down' }}"
              }{{ ',' if not loop.last else '' }}
          {% endfor %}
            ]
          }
        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
      delegate_to: localhost

    - name: Display health check summary
      debug:
        msg: |
          🏥 DEEP HEALTH CHECK COMPLETE - {{ inventory_hostname }}
          ===============================================

          📅 Date: {{ ansible_date_time.date }}
          📊 Services: {{ current_health_checks | length }}

          🎯 CRITICAL SERVICES:
          {% for service in current_health_checks %}
          {% if service.critical %}
          - {{ service.name }}: {% if service.container in health_running_containers %}✅ OK{% else %}❌ ISSUES{% endif %}

          {% endif %}
          {% endfor %}

          📊 SUMMARY:
          - Docker: {{ '✅ Healthy' if 'Docker daemon: Running' in docker_health.stdout else '❌ Issues' }}
          - Containers: {{ '✅ All OK' if 'Issues found: 0' in container_health.stdout else '⚠️ Issues Found' }}
          - Endpoints: {{ '✅ All OK' if 'Issues found: 0' in endpoint_health.stdout else '⚠️ Issues Found' }}
          - Dependencies: {{ '✅ All OK' if 'Issues found: 0' in dependency_health.stdout else '⚠️ Issues Found' }}

          📄 Reports:
          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt
          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json

          🔍 Next Steps:
          - Review detailed report for specific issues
          - Address any critical service problems
          - Schedule regular health monitoring

          ===============================================

    # Uses the status fact computed above rather than re-reading the JSON
    # report from disk.
    - name: Send health alerts (if issues detected)
      debug:
        msg: |
          🚨 HEALTH ALERT - {{ inventory_hostname }}

          Critical issues detected in service health check!
          Check the detailed report immediately.
      when:
        - alerting_enabled | bool
        - health_overall_status == 'ISSUES_DETECTED'