---
# Comprehensive Health Check
#
# Collects basic health metrics (uptime, CPU, memory, root-disk usage, load
# average), the state of a list of critical services, Docker container counts,
# internet reachability and Tailscale status from every host in the inventory.
# For each host it then:
#   * classifies the host as HEALTHY / WARNING / CRITICAL using
#     `health_thresholds`,
#   * prints a human-readable report,
#   * writes a JSON report to /tmp/health_reports on the control node,
#   * posts an ntfy notification when the status is CRITICAL.
#
# Overridable variables:
#   critical_services  - service names to check
#   health_thresholds  - warning/critical percentages for cpu, memory, disk
#   ntfy_url           - notification endpoint used for CRITICAL alerts
- name: Comprehensive Health Check
  hosts: all
  gather_facts: true

  vars:
    health_check_timestamp: "{{ ansible_date_time.iso8601 }}"
    critical_services:
      - docker
      - ssh
      - tailscaled
    health_thresholds:
      cpu_warning: 80
      cpu_critical: 95
      memory_warning: 85
      memory_critical: 95
      disk_warning: 85
      disk_critical: 95

  tasks:
    # Reports are written on the control node, so the directory only has to be
    # created once, no matter how many hosts are in the play.
    - name: Create health check report directory
      file:
        path: "/tmp/health_reports"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true

    - name: Check system uptime
      command: uptime -p
      register: system_uptime
      changed_when: false

    # Total CPU usage is derived as (100 - idle) from top's "Cpu(s)" summary
    # line. Splitting on commas and picking the field that mentions "id" makes
    # the parse independent of column spacing; LC_ALL=C keeps '.' as the
    # decimal separator so commas only ever separate fields.
    - name: Check CPU usage
      shell: |
        LC_ALL=C top -bn1 | awk -F',' '/Cpu\(s\)/ {
          for (i = 1; i <= NF; i++) {
            if ($i ~ /id/) {
              gsub(/[^0-9.]/, "", $i)
              printf "%.1f", 100 - $i
              exit
            }
          }
        }'
      register: cpu_usage
      changed_when: false

    # Used memory as a percentage of total, taken from the second line of
    # `free` output.
    - name: Check memory usage
      shell: |
        free | awk 'NR==2{printf "%.1f", $3*100/$2}'
      register: memory_usage
      changed_when: false

    # `df -P` forces the POSIX one-line-per-filesystem layout, so the data is
    # always on line 2 even when the device name is long.
    - name: Check disk usage
      shell: |
        df -P / | awk 'NR==2{print $5}' | sed 's/%//'
      register: disk_usage
      changed_when: false

    - name: Check load average
      shell: |
        uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//'
      register: load_average
      changed_when: false

    # Without `state`, the systemd module only queries the unit and returns
    # its properties under `status`.
    - name: Check critical services (systemd hosts only)
      systemd:
        name: "{{ item }}"
      register: service_status
      loop: "{{ critical_services }}"
      ignore_errors: true
      when: ansible_service_mgr == "systemd"

    # NOTE(review): `pgrep -x` matches the exact process name. The daemons for
    # the default `ssh` and `docker` entries are presumably `sshd` and
    # `dockerd`, so they may always be reported as inactive here - confirm on
    # a non-systemd host.
    - name: Check critical services via pgrep (non-systemd hosts — Synology DSM etc.)
      shell: "pgrep -x {{ item }} >/dev/null 2>&1 && echo 'active' || echo 'inactive'"
      register: service_status_pgrep
      loop: "{{ critical_services }}"
      changed_when: false
      ignore_errors: true
      when: ansible_service_mgr != "systemd"

    - name: Check Docker containers (if Docker is running)
      shell: |
        if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
          echo "Running: $(docker ps -q | wc -l)"
          echo "Total: $(docker ps -aq | wc -l)"
          echo "Unhealthy: $(docker ps --filter health=unhealthy -q | wc -l)"
        else
          echo "Docker not available"
        fi
      register: docker_status
      changed_when: false
      ignore_errors: true

    # A single ICMP echo to 8.8.8.8; prints OK or FAILED and never fails the
    # task itself.
    - name: Check network connectivity
      shell: |
        ping -c 1 8.8.8.8 >/dev/null 2>&1 && echo "OK" || echo "FAILED"
      register: internet_check
      changed_when: false

    - name: Check Tailscale status
      shell: |
        if command -v tailscale >/dev/null 2>&1; then
          tailscale status --json | jq -r '.Self.Online' 2>/dev/null || echo "unknown"
        else
          echo "not_installed"
        fi
      register: tailscale_status
      changed_when: false
      ignore_errors: true

    # CRITICAL when any metric exceeds its critical threshold or the internet
    # check failed; WARNING when any metric exceeds its warning threshold;
    # HEALTHY otherwise. Empty or unparsable metric output is coerced to 0 by
    # the float/int filters.
    - name: Evaluate health status
      set_fact:
        health_status:
          overall: >-
            {{
              'CRITICAL' if (
                (cpu_usage.stdout | float > health_thresholds.cpu_critical) or
                (memory_usage.stdout | float > health_thresholds.memory_critical) or
                (disk_usage.stdout | int > health_thresholds.disk_critical) or
                (internet_check.stdout == "FAILED")
              ) else 'WARNING' if (
                (cpu_usage.stdout | float > health_thresholds.cpu_warning) or
                (memory_usage.stdout | float > health_thresholds.memory_warning) or
                (disk_usage.stdout | int > health_thresholds.disk_warning)
              ) else 'HEALTHY'
            }}
          cpu: "{{ cpu_usage.stdout | float }}"
          memory: "{{ memory_usage.stdout | float }}"
          disk: "{{ disk_usage.stdout | int }}"
          uptime: "{{ system_uptime.stdout }}"
          load: "{{ load_average.stdout }}"
          internet: "{{ internet_check.stdout }}"
          # The jq pipeline can exit 0 with no output when `tailscale status`
          # prints nothing; report that as "unknown" rather than an empty string.
          tailscale: "{{ tailscale_status.stdout | default('unknown', true) }}"

    - name: Display health report
      debug:
        msg: |
          ==========================================
          🏥 HEALTH CHECK REPORT - {{ inventory_hostname }}
          ==========================================

          📊 OVERALL STATUS: {{ health_status.overall }}

          🖥️ SYSTEM METRICS:
          - Uptime: {{ health_status.uptime }}
          - CPU Usage: {{ health_status.cpu }}%
          - Memory Usage: {{ health_status.memory }}%
          - Disk Usage: {{ health_status.disk }}%
          - Load Average: {{ health_status.load }}

          🌐 CONNECTIVITY:
          - Internet: {{ health_status.internet }}
          - Tailscale: {{ health_status.tailscale }}

          🐳 DOCKER STATUS:
          {{ docker_status.stdout }}

          🔧 CRITICAL SERVICES:
          {% if ansible_service_mgr == "systemd" and service_status is defined %}
          {% for result in service_status.results %}
          {% if result.status is defined and result.status.ActiveState is defined %}
          - {{ result.item }}: {{ 'RUNNING' if result.status.ActiveState == 'active' else 'STOPPED' }}
          {% elif not result.skipped | default(false) %}
          - {{ result.item }}: UNKNOWN
          {% endif %}
          {% endfor %}
          {% elif service_status_pgrep is defined %}
          {% for result in service_status_pgrep.results %}
          - {{ result.item }}: {{ 'RUNNING' if result.stdout == 'active' else 'STOPPED' }}
          {% endfor %}
          {% else %}
          - Service status not available
          {% endif %}

          ==========================================

    # Every string and boolean is emitted through `to_json` so the file stays
    # valid JSON: strings are quoted and escaped, and booleans render as
    # true/false instead of Python's True/False. Numeric metrics are emitted
    # bare. The `ns` namespace tracks whether a comma separator is needed
    # between service entries.
    - name: Generate JSON health report
      copy:
        content: |
          {
            "timestamp": {{ health_check_timestamp | to_json }},
            "hostname": {{ inventory_hostname | to_json }},
            "overall_status": {{ health_status.overall | to_json }},
            "system": {
              "uptime": {{ health_status.uptime | to_json }},
              "cpu_usage": {{ health_status.cpu }},
              "memory_usage": {{ health_status.memory }},
              "disk_usage": {{ health_status.disk }},
              "load_average": {{ health_status.load | to_json }}
            },
            "connectivity": {
              "internet": {{ health_status.internet | to_json }},
              "tailscale": {{ health_status.tailscale | to_json }}
            },
            "docker": {{ docker_status.stdout | default('') | replace('\n', ' ') | to_json }},
            "services": [
          {% if ansible_service_mgr == "systemd" and service_status is defined %}
          {% set ns = namespace(first=true) %}
          {% for result in service_status.results %}
          {% if result.status is defined and result.status.ActiveState is defined %}
              {% if not ns.first %},{% endif %}

              {
                "name": {{ result.item | to_json }},
                "status": {{ result.status.ActiveState | to_json }},
                "enabled": {{ ((result.status.UnitFileState | default('unknown')) == "enabled") | to_json }}
              }
          {% set ns.first = false %}
          {% endif %}
          {% endfor %}
          {% elif service_status_pgrep is defined %}
          {% set ns = namespace(first=true) %}
          {% for result in service_status_pgrep.results %}
              {% if not ns.first %},{% endif %}

              {
                "name": {{ result.item | to_json }},
                "status": {{ result.stdout | default('unknown') | to_json }},
                "enabled": null
              }
          {% set ns.first = false %}
          {% endfor %}
          {% endif %}
            ]
          }
        dest: "/tmp/health_reports/{{ inventory_hostname }}_health_{{ ansible_date_time.epoch }}.json"
        mode: '0644'
      delegate_to: localhost

    # Best-effort notification: skipped silently when curl is missing, and a
    # failed POST never fails the play (`|| true` plus ignore_errors).
    - name: Send alert for critical status
      shell: |
        if command -v curl >/dev/null 2>&1; then
          curl -d "🚨 CRITICAL: {{ inventory_hostname }} health check failed - {{ health_status.overall }}" \
            -H "Title: Homelab Health Alert" \
            -H "Priority: urgent" \
            -H "Tags: warning,health" \
            "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}" || true
        fi
      when: health_status.overall == "CRITICAL"
      ignore_errors: true

    - name: Summary message
      debug:
        msg: |
          📋 Health check complete for {{ inventory_hostname }}
          📊 Status: {{ health_status.overall }}
          📄 Report saved to: /tmp/health_reports/{{ inventory_hostname }}_health_{{ ansible_date_time.epoch }}.json

          {% if health_status.overall == "CRITICAL" %}
          🚨 CRITICAL issues detected - immediate attention required!
          {% elif health_status.overall == "WARNING" %}
          ⚠️ WARNING conditions detected - monitoring recommended
          {% else %}
          ✅ System is healthy
          {% endif %}