# homelab-optimized/ansible/automation/playbooks/service_health_deep.yml

---
# Deep Service Health Check Playbook
# Comprehensive health monitoring for all homelab services
# Usage: ansible-playbook playbooks/service_health_deep.yml
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "include_performance=true"
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "alert_on_issues=true"

- name: Deep Service Health Check
  # Limit the run with -e "host_target=<host-or-group>"; without it every
  # inventory host is targeted.
  hosts: "{{ host_target | default('all') }}"
  # Facts are required: the tasks below read ansible_date_time.
  gather_facts: true
  vars:
    # Tunable defaults. Values passed with -e/--extra-vars take precedence over
    # play vars, so the documented "-e include_performance=..." usage still
    # overrides them.
    # Do not rewrite these as "{{ name | default(...) }}" under the same name:
    # a variable that references itself fails with "recursive loop detected in
    # template string" whenever no override is supplied.
    include_performance: true
    alert_on_issues: false
    health_check_timeout: 30
    # Reports are written below this directory on the control node
    # (the writing tasks are delegated to localhost).
    report_dir: "/tmp/health_reports"

    # Service health check configurations, keyed by inventory hostname.
    # Each entry provides:
    #   name            - label used in output and issue identifiers
    #   container       - Docker container name to inspect
    #   health_url      - HTTP endpoint probed with curl from the host itself
    #   expected_status - HTTP status code that counts as healthy
    #   critical        - whether the service is listed under critical services
    service_health_checks:
      atlantis:
        - name: "plex"
          container: "plex"
          health_url: "http://localhost:32400/web"
          expected_status: 200
          critical: true
        - name: "immich-server"
          container: "immich-server"
          health_url: "http://localhost:2283/api/server-info/ping"
          expected_status: 200
          critical: true
        - name: "vaultwarden"
          container: "vaultwarden"
          health_url: "http://localhost:80/alive"
          expected_status: 200
          critical: true
        - name: "sonarr"
          container: "sonarr"
          health_url: "http://localhost:8989/api/v3/system/status"
          expected_status: 200
          critical: false
        - name: "radarr"
          container: "radarr"
          health_url: "http://localhost:7878/api/v3/system/status"
          expected_status: 200
          critical: false
      calypso:
        - name: "authentik-server"
          container: "authentik-server"
          health_url: "http://localhost:9000/-/health/live/"
          expected_status: 200
          critical: true
        - name: "paperless-webserver"
          container: "paperless-webserver"
          health_url: "http://localhost:8000"
          expected_status: 200
          critical: false
      homelab_vm:
        - name: "grafana"
          container: "grafana"
          health_url: "http://localhost:3000/api/health"
          expected_status: 200
          critical: true
        - name: "prometheus"
          container: "prometheus"
          health_url: "http://localhost:9090/-/healthy"
          expected_status: 200
          critical: true

  tasks:
    # Ensure the dated report folder exists before any report is written.
    # Delegated to localhost, so the directory is created on the control node.
    - name: Create health report directory
      delegate_to: localhost
      file:
        state: directory
        path: "{{ report_dir }}/{{ ansible_date_time.date }}"
        mode: "0755"

    # Select this host's entries from service_health_checks; hosts without an
    # entry get an empty list, so the later loops simply check nothing.
    - name: Get current service health checks for this host
      set_fact:
        current_health_checks: "{{ service_health_checks[inventory_hostname] | default([]) }}"

    # Print what is about to be checked on this host and with which settings.
    - name: Display health check plan
      debug:
        msg: |
          🏥 DEEP HEALTH CHECK PLAN
          =========================
          🖥️  Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔍 Services to check: {{ current_health_checks | length }}
          📊 Include Performance: {{ include_performance }}
          🚨 Alert on Issues: {{ alert_on_issues }}
          ⏱️  Timeout: {{ health_check_timeout }}s

          📋 Services:
          {% for svc in current_health_checks %}
          - {{ svc.name }} ({% if svc.critical %}Critical{% else %}Non-critical{% endif %})
          {% endfor %}

    # Reports whether the Docker daemon is up, whether its API answers within
    # 10 seconds, and the output of `docker system df`.
    #
    # The Go-template passed to --format is wrapped in raw/endraw: Ansible
    # renders the script through Jinja2 first, and an unescaped Docker
    # placeholder is a Jinja2 syntax error that fails the task.
    #
    # Later tasks look for the literal text "Docker daemon: Running" in the
    # registered stdout, so that marker must not be reworded.
    - name: Check Docker daemon health
      shell: |
        echo "=== DOCKER DAEMON HEALTH ==="

        # Check Docker daemon status. The systemd unit is tried first; if it is
        # missing or named differently (NOTE(review): presumably the case on
        # the Synology hosts - confirm), a reachable daemon also counts.
        if systemctl is-active --quiet docker 2>/dev/null || timeout 10 docker info >/dev/null 2>&1; then
          echo "✅ Docker daemon: Running"

          # Check Docker daemon responsiveness
          if timeout 10 docker version >/dev/null 2>&1; then
            echo "✅ Docker API: Responsive"
          else
            echo "❌ Docker API: Unresponsive"
          fi

          # Check Docker disk usage
          docker_usage=$(docker system df --format "table {% raw %}{{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}{% endraw %}")
          echo "📊 Docker Usage:"
          echo "$docker_usage"

        else
          echo "❌ Docker daemon: Not running"
        fi
      register: docker_health
      changed_when: false

    # For every service configured for this host: verify the container is
    # running, read its Docker health-check state (containers without a health
    # check are counted as healthy) and print a one-shot resource sample.
    #
    # Templating notes:
    #   * Docker Go-template placeholders are wrapped in raw/endraw so Jinja2
    #     does not try to evaluate them.
    #   * The summary section is wrapped in raw/endraw because the bash
    #     array-length expansion contains the Jinja2 comment opener, which
    #     would otherwise abort templating with "Missing end of comment tag".
    #   * The script uses bash arrays, so it runs under /bin/bash rather than
    #     the module's default /bin/sh.
    #
    # Later tasks match the literal texts "✅ Container running: <name>" and
    # "Issues found: 0" in the registered stdout; keep both unchanged.
    - name: Check container health status
      shell: |
        echo "=== CONTAINER HEALTH STATUS ==="

        health_issues=()
        total_containers=0
        healthy_containers=0

        {% for service in current_health_checks %}
        echo "🔍 Checking {{ service.name }}..."
        total_containers=$((total_containers + 1))

        # Check if container exists and is running. The docker name filter is a
        # substring match, so grep -Fx enforces an exact whole-line name match.
        if docker ps --filter "name={{ service.container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -Fxq -- "{{ service.container }}"; then
          echo "  ✅ Container running: {{ service.container }}"

          # Check container health if health check is configured; the template
          # prints "none" itself when the container defines no health check.
          health_status=$(docker inspect "{{ service.container }}" --format='{% raw %}{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}{% endraw %}' 2>/dev/null || echo "none")
          if [ "$health_status" != "none" ]; then
            if [ "$health_status" = "healthy" ]; then
              echo "  ✅ Health check: $health_status"
              healthy_containers=$((healthy_containers + 1))
            else
              echo "  ❌ Health check: $health_status"
              health_issues+=("{{ service.name }}:health_check_failed")
            fi
          else
            echo "  ℹ️  No health check configured"
            healthy_containers=$((healthy_containers + 1))  # Assume healthy if no health check
          fi

          # Check container resource usage
          container_stats=$(docker stats "{{ service.container }}" --no-stream --format "{% raw %}CPU: {{.CPUPerc}}, Memory: {{.MemUsage}}{% endraw %}" 2>/dev/null || echo "Stats unavailable")
          echo "  📊 Resources: $container_stats"

        else
          echo "  ❌ Container not running: {{ service.container }}"
          health_issues+=("{{ service.name }}:container_down")
        fi
        echo ""
        {% endfor %}

        {% raw %}
        echo "📊 CONTAINER SUMMARY:"
        echo "Total containers checked: $total_containers"
        echo "Healthy containers: $healthy_containers"
        echo "Issues found: ${#health_issues[@]}"

        if [ "${#health_issues[@]}" -gt 0 ]; then
          echo "🚨 ISSUES:"
          for issue in "${health_issues[@]}"; do
            echo "  - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: container_health
      changed_when: false

    # Probes each configured health_url with curl from the managed host and
    # compares the HTTP status code with expected_status.
    #
    # Each endpoint is requested once; the status code and total time come from
    # the same response. curl prints "000" as the status code when a request
    # fails, so no extra fallback is appended to its output (appending one
    # produced "000000").
    #
    # The summary section is wrapped in raw/endraw because the bash
    # array-length expansion contains the Jinja2 comment opener. The script
    # uses bash arrays, so it runs under /bin/bash.
    #
    # Later tasks match the literal text "Issues found: 0" in the registered
    # stdout; keep it unchanged.
    - name: Test service endpoints
      shell: |
        echo "=== SERVICE ENDPOINT HEALTH ==="

        endpoint_issues=()
        total_endpoints=0
        healthy_endpoints=0

        {% for service in current_health_checks %}
        {% if service.health_url is defined %}
        echo "🌐 Testing {{ service.name }} endpoint..."
        total_endpoints=$((total_endpoints + 1))

        # Test HTTP endpoint: output is "<status code> <seconds>"
        curl_result=$(curl -s -o /dev/null -w "%{http_code} %{time_total}" --max-time {{ health_check_timeout }} "{{ service.health_url }}" 2>/dev/null)
        response_code="${curl_result%% *}"
        response_time="${curl_result#* }"
        # Empty output means curl itself could not run; report that as 000 too.
        response_code="${response_code:-000}"

        if [ "$response_code" = "{{ service.expected_status }}" ]; then
          echo "  ✅ HTTP $response_code (${response_time}s): {{ service.health_url }}"
          healthy_endpoints=$((healthy_endpoints + 1))
        else
          echo "  ❌ HTTP $response_code (expected {{ service.expected_status }}): {{ service.health_url }}"
          endpoint_issues+=("{{ service.name }}:http_$response_code")
        fi
        {% endif %}
        {% endfor %}

        {% raw %}
        echo ""
        echo "📊 ENDPOINT SUMMARY:"
        echo "Total endpoints tested: $total_endpoints"
        echo "Healthy endpoints: $healthy_endpoints"
        echo "Issues found: ${#endpoint_issues[@]}"

        if [ "${#endpoint_issues[@]}" -gt 0 ]; then
          echo "🚨 ENDPOINT ISSUES:"
          for issue in "${endpoint_issues[@]}"; do
            echo "  - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: endpoint_health
      changed_when: false

    # Collects a point-in-time snapshot of CPU, memory, disk, load average,
    # internet reachability (one ping to 8.8.8.8) and Tailscale state.
    # Skipped entirely when include_performance evaluates to false; in that
    # case the registered result has no stdout.
    - name: Check system resources and performance
      shell: |
        echo "=== SYSTEM PERFORMANCE ==="

        # CPU usage
        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
        echo "🖥️  CPU Usage: ${cpu_usage}%"

        # Memory usage. Human-readable sizes are only displayed; the percentage
        # is computed from the raw numbers, because the -h columns can carry
        # different unit suffixes and cannot be divided meaningfully.
        memory_human=$(free -h | awk 'NR==2{print $3 "/" $2}')
        memory_pct=$(free | awk 'NR==2{printf "%.1f", ($2 > 0 ? $3*100/$2 : 0)}')
        memory_info="Used: ${memory_human} (${memory_pct}%)"
        echo "💾 Memory: $memory_info"

        # Disk usage for critical paths
        echo "💿 Disk Usage:"
        df -h / | tail -1 | awk '{printf "  Root: %s used (%s)\n", $5, $4}'

        {% if inventory_hostname in ['atlantis', 'calypso'] %}
        # Synology specific checks
        if [ -d "/volume1" ]; then
          df -h /volume1 | tail -1 | awk '{printf "  Volume1: %s used (%s)\n", $5, $4}'
        fi
        {% endif %}

        # Load average
        load_avg=$(uptime | awk -F'load average:' '{print $2}')
        echo "⚖️  Load Average:$load_avg"

        # Network connectivity
        echo "🌐 Network:"
        if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
          echo "  ✅ Internet connectivity"
        else
          echo "  ❌ Internet connectivity failed"
        fi

        # Tailscale status (only when the tailscale CLI is installed); any
        # failure of tailscale or jq is reported as "unknown".
        if command -v tailscale >/dev/null 2>&1; then
          tailscale_status=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null || echo "unknown")
          if [ "$tailscale_status" = "true" ]; then
            echo "  ✅ Tailscale connected"
          else
            echo "  ❌ Tailscale status: $tailscale_status"
          fi
        fi
      register: system_performance
      when: include_performance | bool
      changed_when: false

    # Verifies the backing stores of services that have one: a database
    # container checked with pg_isready, and for immich-server a Redis
    # container checked with redis-cli ping. The dependency container names are
    # hard-coded in the case statements below.
    #
    # Templating notes:
    #   * The Docker Go-template placeholder is wrapped in raw/endraw so Jinja2
    #     does not try to evaluate it.
    #   * The summary section is wrapped in raw/endraw because the bash
    #     array-length expansion contains the Jinja2 comment opener.
    #   * The script uses bash arrays, so it runs under /bin/bash.
    #
    # Later tasks match the literal text "Issues found: 0" in the registered
    # stdout; keep it unchanged.
    - name: Check critical service dependencies
      shell: |
        echo "=== SERVICE DEPENDENCIES ==="

        dependency_issues=()

        # Check database connections for services that need them
        {% for service in current_health_checks %}
        {% if service.name in ['immich-server', 'vaultwarden', 'authentik-server', 'paperless-webserver'] %}
        echo "🔍 Checking {{ service.name }} database dependency..."

        # Try to find associated database container
        db_container=""
        case "{{ service.name }}" in
          "immich-server") db_container="immich-db" ;;
          "vaultwarden") db_container="vaultwarden-db" ;;
          "authentik-server") db_container="authentik-db" ;;
          "paperless-webserver") db_container="paperless-db" ;;
        esac

        if [ -n "$db_container" ]; then
          # grep -Fx enforces an exact name match (the docker filter is a substring match)
          if docker ps --filter "name=$db_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -Fxq -- "$db_container"; then
            echo "  ✅ Database container running: $db_container"

            # Test database connection
            if docker exec "$db_container" pg_isready >/dev/null 2>&1; then
              echo "  ✅ Database accepting connections"
            else
              echo "  ❌ Database not accepting connections"
              dependency_issues+=("{{ service.name }}:database_connection")
            fi
          else
            echo "  ❌ Database container not running: $db_container"
            dependency_issues+=("{{ service.name }}:database_down")
          fi
        fi
        {% endif %}
        {% endfor %}

        # Check Redis dependencies
        {% for service in current_health_checks %}
        {% if service.name in ['immich-server'] %}
        echo "🔍 Checking {{ service.name }} Redis dependency..."

        redis_container=""
        case "{{ service.name }}" in
          "immich-server") redis_container="immich-redis" ;;
        esac

        if [ -n "$redis_container" ]; then
          if docker ps --filter "name=$redis_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -Fxq -- "$redis_container"; then
            echo "  ✅ Redis container running: $redis_container"

            # Test Redis connection
            if docker exec "$redis_container" redis-cli ping 2>/dev/null | grep -q "PONG"; then
              echo "  ✅ Redis responding to ping"
            else
              echo "  ❌ Redis not responding"
              dependency_issues+=("{{ service.name }}:redis_connection")
            fi
          else
            echo "  ❌ Redis container not running: $redis_container"
            dependency_issues+=("{{ service.name }}:redis_down")
          fi
        fi
        {% endif %}
        {% endfor %}

        {% raw %}
        echo ""
        echo "📊 DEPENDENCY SUMMARY:"
        echo "Issues found: ${#dependency_issues[@]}"

        if [ "${#dependency_issues[@]}" -gt 0 ]; then
          echo "🚨 DEPENDENCY ISSUES:"
          for issue in "${dependency_issues[@]}"; do
            echo "  - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: dependency_health
      changed_when: false

    # Scans the last hour of each running container's logs for error and
    # warning keywords. More than 10 error lines is recorded as an issue; the
    # last three matching lines are echoed for context.
    #
    # Each container's logs are captured once into a temporary file so the
    # error count, the warning count and the excerpt all describe the same
    # snapshot. The file is removed when the script exits.
    #
    # Templating notes:
    #   * The Docker Go-template placeholder is wrapped in raw/endraw so Jinja2
    #     does not try to evaluate it.
    #   * The summary section is wrapped in raw/endraw because the bash
    #     array-length expansion contains the Jinja2 comment opener.
    #   * The script uses bash arrays, so it runs under /bin/bash.
    - name: Analyze service logs for errors
      shell: |
        echo "=== SERVICE LOG ANALYSIS ==="

        log_issues=()
        error_pattern="(error|exception|failed|fatal|panic)"
        log_file=$(mktemp)
        trap 'rm -f "$log_file"' EXIT

        {% for service in current_health_checks %}
        echo "📝 Analyzing {{ service.name }} logs..."

        # grep -Fx enforces an exact name match (the docker filter is a substring match)
        if docker ps --filter "name={{ service.container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -Fxq -- "{{ service.container }}"; then
          # Get recent logs (stdout and stderr) and check for errors
          docker logs "{{ service.container }}" --since=1h > "$log_file" 2>&1
          error_count=$(grep -c -i -E "$error_pattern" "$log_file")
          warn_count=$(grep -c -i -E "(warn|warning)" "$log_file")
          # Guard the numeric tests below against an empty count
          error_count="${error_count:-0}"
          warn_count="${warn_count:-0}"

          echo "  Errors (1h): $error_count"
          echo "  Warnings (1h): $warn_count"

          if [ "$error_count" -gt 10 ]; then
            echo "  ⚠️  High error count detected"
            log_issues+=("{{ service.name }}:high_error_count:$error_count")
          elif [ "$error_count" -gt 0 ]; then
            echo "  ℹ️  Some errors detected"
          else
            echo "  ✅ No errors in recent logs"
          fi

          # Show recent critical errors
          if [ "$error_count" -gt 0 ]; then
            echo "  Recent errors:"
            grep -i -E "$error_pattern" "$log_file" | tail -3 | sed 's/^/    /'
          fi
        else
          echo "  ❌ Container not running"
        fi
        echo ""
        {% endfor %}

        {% raw %}
        echo "📊 LOG ANALYSIS SUMMARY:"
        echo "Issues found: ${#log_issues[@]}"

        if [ "${#log_issues[@]}" -gt 0 ]; then
          echo "🚨 LOG ISSUES:"
          for issue in "${log_issues[@]}"; do
            echo "  - $issue"
          done
        fi
        {% endraw %}
      args:
        executable: /bin/bash
      register: log_analysis
      changed_when: false

    # Writes the plain-text report for this host on the control node
    # (delegated to localhost) from the stdout of the preceding checks.
    #
    # A critical service is shown as running only when the container check
    # printed its exact "✅ Container running: <name>" line. Searching the
    # output for the bare container name is not sufficient: the name is also
    # echoed in the "not running" message, so that test is always true.
    #
    # include_performance is cast with "| bool" because a value passed as
    # -e "include_performance=false" arrives as a non-empty (truthy) string,
    # while the performance task - skipped in that case - registers no stdout.
    - name: Generate comprehensive health report
      vars:
        # Output lines of the container check with surrounding whitespace removed
        container_lines: "{{ container_health.stdout_lines | default([]) | map('trim') | list }}"
      copy:
        content: |
          🏥 DEEP SERVICE HEALTH REPORT - {{ inventory_hostname }}
          =====================================================

          📅 Health Check Date: {{ ansible_date_time.iso8601 }}
          🖥️  Host: {{ inventory_hostname }}
          📊 Services Checked: {{ current_health_checks | length }}
          ⏱️  Check Timeout: {{ health_check_timeout }}s

          🐳 DOCKER DAEMON HEALTH:
          {{ docker_health.stdout }}

          📦 CONTAINER HEALTH:
          {{ container_health.stdout }}

          🌐 ENDPOINT HEALTH:
          {{ endpoint_health.stdout }}

          {% if include_performance | bool %}
          📊 SYSTEM PERFORMANCE:
          {{ system_performance.stdout | default('') }}
          {% endif %}

          🔗 SERVICE DEPENDENCIES:
          {{ dependency_health.stdout }}

          📝 LOG ANALYSIS:
          {{ log_analysis.stdout }}

          🎯 CRITICAL SERVICES STATUS:
          {% for service in current_health_checks %}
          {% if service.critical %}
          - {{ service.name }}: {% if ('✅ Container running: ' ~ service.container) in container_lines %}✅ Running{% else %}❌ Issues{% endif %}

          {% endif %}
          {% endfor %}

          💡 RECOMMENDATIONS:
          {% if 'Issues found: 0' not in container_health.stdout %}
          - 🚨 Address container issues immediately
          {% endif %}
          {% if 'Issues found: 0' not in endpoint_health.stdout %}
          - 🌐 Check service endpoint connectivity
          {% endif %}
          {% if 'Issues found: 0' not in dependency_health.stdout %}
          - 🔗 Resolve service dependency issues
          {% endif %}
          - 📊 Monitor resource usage trends
          - 🔄 Schedule regular health checks
          - 📝 Set up log monitoring alerts

          ✅ HEALTH CHECK COMPLETE

        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt"
      delegate_to: localhost

    # Writes a machine-readable status file for this host on the control node
    # (delegated to localhost). overall_status is HEALTHY only when both the
    # container check and the endpoint check reported "Issues found: 0".
    #
    # A service is "running" only when the container check printed its exact
    # "✅ Container running: <name>" line. Searching the output for the bare
    # container name is not sufficient: the name is also echoed in the
    # "not running" message, so every service was reported as running.
    #
    # The alert task reads this file back and looks for ISSUES_DETECTED.
    - name: Create health status JSON for automation
      vars:
        # Output lines of the container check with surrounding whitespace removed
        container_lines: "{{ container_health.stdout_lines | default([]) | map('trim') | list }}"
      copy:
        content: |
          {
            "timestamp": "{{ ansible_date_time.iso8601 }}",
            "hostname": "{{ inventory_hostname }}",
            "health_check_summary": {
              "total_services": {{ current_health_checks | length }},
              "critical_services": {{ current_health_checks | selectattr('critical', 'equalto', true) | list | length }},
              "docker_healthy": {{ 'true' if 'Docker daemon: Running' in docker_health.stdout else 'false' }},
              "overall_status": "{% if 'Issues found: 0' in container_health.stdout and 'Issues found: 0' in endpoint_health.stdout %}HEALTHY{% else %}ISSUES_DETECTED{% endif %}"
            },
            "services": [
              {% for service in current_health_checks %}
              {
                "name": "{{ service.name }}",
                "container": "{{ service.container }}",
                "critical": {{ service.critical | lower }},
                "status": "{% if ('✅ Container running: ' ~ service.container) in container_lines %}running{% else %}down{% endif %}"
              }{% if not loop.last %},{% endif %}

              {% endfor %}
            ]
          }
        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
      delegate_to: localhost

    # Prints the per-host summary and the locations of the two report files.
    #
    # A critical service is shown as OK only when the container check printed
    # its exact "✅ Container running: <name>" line. Searching the output for
    # the bare container name is not sufficient: the name is also echoed in the
    # "not running" message, so every service was shown as OK.
    - name: Display health check summary
      vars:
        # Output lines of the container check with surrounding whitespace removed
        container_lines: "{{ container_health.stdout_lines | default([]) | map('trim') | list }}"
      debug:
        msg: |

          🏥 DEEP HEALTH CHECK COMPLETE - {{ inventory_hostname }}
          ===============================================

          📅 Date: {{ ansible_date_time.date }}
          📊 Services: {{ current_health_checks | length }}

          🎯 CRITICAL SERVICES:
          {% for service in current_health_checks %}
          {% if service.critical %}
          - {{ service.name }}: {% if ('✅ Container running: ' ~ service.container) in container_lines %}✅ OK{% else %}❌ ISSUES{% endif %}

          {% endif %}
          {% endfor %}

          📊 SUMMARY:
          - Docker: {{ '✅ Healthy' if 'Docker daemon: Running' in docker_health.stdout else '❌ Issues' }}
          - Containers: {{ '✅ All OK' if 'Issues found: 0' in container_health.stdout else '⚠️ Issues Found' }}
          - Endpoints: {{ '✅ All OK' if 'Issues found: 0' in endpoint_health.stdout else '⚠️ Issues Found' }}
          - Dependencies: {{ '✅ All OK' if 'Issues found: 0' in dependency_health.stdout else '⚠️ Issues Found' }}

          📄 Reports:
          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt
          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json

          🔍 Next Steps:
          - Review detailed report for specific issues
          - Address any critical service problems
          - Schedule regular health monitoring

          ===============================================

    # Emits an alert message (debug output only) when alerting is enabled and
    # this host's status JSON on the control node contains ISSUES_DETECTED.
    - name: Send health alerts (if issues detected)
      vars:
        status_file: "{{ report_dir ~ '/' ~ ansible_date_time.date ~ '/' ~ inventory_hostname ~ '_health_status.json' }}"
      when:
        - alert_on_issues | bool
        - "'ISSUES_DETECTED' in lookup('file', status_file)"
      debug:
        msg: |
          🚨 HEALTH ALERT - {{ inventory_hostname }}
          Critical issues detected in service health check!
          Check the detailed report immediately.