# 247 lines · 8.7 KiB · YAML
---
# Comprehensive Health Check
#
# For every host in the inventory this play:
#   1. samples uptime, CPU, memory, root-filesystem usage and load average;
#   2. queries each entry of `critical_services` (systemd module on systemd
#      hosts, `pgrep` everywhere else);
#   3. summarises Docker containers, outbound connectivity and Tailscale state;
#   4. classifies the host as HEALTHY / WARNING / CRITICAL against
#      `health_thresholds`;
#   5. prints a report, writes a JSON copy on the controller, and posts an
#      ntfy notification when the result is CRITICAL.
#
# Overridable variables: `critical_services`, `health_thresholds`,
# `health_report_dir`, `ntfy_url`.
- name: Comprehensive Health Check
  hosts: all
  gather_facts: true

  vars:
    health_check_timestamp: "{{ ansible_date_time.iso8601 }}"
    # Directory on the controller (the writing tasks delegate to localhost)
    # that receives one JSON report per host.
    health_report_dir: /tmp/health_reports
    health_report_file: "{{ health_report_dir }}/{{ inventory_hostname }}_health_{{ ansible_date_time.epoch }}.json"
    critical_services:
      - docker
      - ssh
      - tailscaled
    # Percentages; a metric strictly above the limit trips that level.
    health_thresholds:
      cpu_warning: 80
      cpu_critical: 95
      memory_warning: 85
      memory_critical: 95
      disk_warning: 85
      disk_critical: 95

  tasks:
    - name: Create health check report directory
      file:
        path: "{{ health_report_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true

    # No pipes or redirection are involved, so `command` is sufficient.
    - name: Check system uptime
      command: uptime -p
      register: system_uptime
      changed_when: false

    # Takes the second whitespace-separated field of top's "Cpu(s)" summary
    # line and strips any trailing "%" or "," suffix.
    # NOTE(review): presumably the user-space percentage; the exact field
    # layout depends on the top implementation - confirm on target hosts.
    - name: Check CPU usage
      shell: |
        top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1 | cut -d',' -f1
      register: cpu_usage
      changed_when: false

    # used * 100 / total from the second line of `free`, one decimal place.
    - name: Check memory usage
      shell: |
        free | awk 'NR==2{printf "%.1f", $3*100/$2}'
      register: memory_usage
      changed_when: false

    # `df -P` (POSIX format) keeps each filesystem on a single line, so the
    # NR==2 selector still hits the data row when the device name is long.
    - name: Check disk usage
      shell: |
        df -P / | awk 'NR==2{print $5}' | sed 's/%//'
      register: disk_usage
      changed_when: false

    - name: Check load average
      shell: |
        uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//'
      register: load_average
      changed_when: false

    # No `state` is passed, so the module only reports each unit's status.
    - name: Check critical services (systemd hosts only)
      systemd:
        name: "{{ item }}"
      register: service_status
      loop: "{{ critical_services }}"
      ignore_errors: true
      when: ansible_service_mgr == "systemd"

    # NOTE(review): `pgrep -x` requires an exact process-name match; confirm
    # the names in `critical_services` equal the process names on these hosts.
    - name: Check critical services via pgrep (non-systemd hosts — Synology DSM etc.)
      shell: "pgrep -x {{ item }} >/dev/null 2>&1 && echo 'active' || echo 'inactive'"
      register: service_status_pgrep
      loop: "{{ critical_services }}"
      changed_when: false
      ignore_errors: true
      when: ansible_service_mgr != "systemd"

    - name: Check Docker containers (if Docker is running)
      shell: |
        if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
          echo "Running: $(docker ps -q | wc -l)"
          echo "Total: $(docker ps -aq | wc -l)"
          echo "Unhealthy: $(docker ps --filter health=unhealthy -q | wc -l)"
        else
          echo "Docker not available"
        fi
      register: docker_status
      changed_when: false
      ignore_errors: true

    # Always exits 0; the outcome is carried in stdout ("OK" / "FAILED").
    - name: Check network connectivity
      shell: |
        ping -c 1 8.8.8.8 >/dev/null 2>&1 && echo "OK" || echo "FAILED"
      register: internet_check
      changed_when: false

    # The pipeline output is captured first so that an empty result (e.g.
    # `tailscale status` failing, or jq missing) is reported as "unknown"
    # rather than as an empty string.
    - name: Check Tailscale status
      shell: |
        if command -v tailscale >/dev/null 2>&1; then
          online=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null)
          echo "${online:-unknown}"
        else
          echo "not_installed"
        fi
      register: tailscale_status
      changed_when: false
      ignore_errors: true

    # CRITICAL: any metric above its *_critical limit, or no connectivity.
    # WARNING:  any metric above its *_warning limit.
    # Service, Docker and Tailscale results do not influence `overall`.
    - name: Evaluate health status
      set_fact:
        health_status:
          overall: >-
            {{
              'CRITICAL' if (
                (cpu_usage.stdout | float > health_thresholds.cpu_critical) or
                (memory_usage.stdout | float > health_thresholds.memory_critical) or
                (disk_usage.stdout | int > health_thresholds.disk_critical) or
                (internet_check.stdout == "FAILED")
              ) else 'WARNING' if (
                (cpu_usage.stdout | float > health_thresholds.cpu_warning) or
                (memory_usage.stdout | float > health_thresholds.memory_warning) or
                (disk_usage.stdout | int > health_thresholds.disk_warning)
              ) else 'HEALTHY'
            }}
          cpu: "{{ cpu_usage.stdout | float }}"
          memory: "{{ memory_usage.stdout | float }}"
          disk: "{{ disk_usage.stdout | int }}"
          uptime: "{{ system_uptime.stdout }}"
          load: "{{ load_average.stdout }}"
          internet: "{{ internet_check.stdout }}"
          tailscale: "{{ tailscale_status.stdout }}"

    - name: Display health report
      debug:
        msg: |

          ==========================================
          🏥 HEALTH CHECK REPORT - {{ inventory_hostname }}
          ==========================================

          📊 OVERALL STATUS: {{ health_status.overall }}

          🖥️ SYSTEM METRICS:
          - Uptime: {{ health_status.uptime }}
          - CPU Usage: {{ health_status.cpu }}%
          - Memory Usage: {{ health_status.memory }}%
          - Disk Usage: {{ health_status.disk }}%
          - Load Average: {{ health_status.load }}

          🌐 CONNECTIVITY:
          - Internet: {{ health_status.internet }}
          - Tailscale: {{ health_status.tailscale }}

          🐳 DOCKER STATUS:
          {{ docker_status.stdout }}

          🔧 CRITICAL SERVICES:
          {% if ansible_service_mgr == "systemd" and service_status is defined %}
          {% for result in service_status.results %}
          {% if result.status is defined and result.status.ActiveState is defined %}
          - {{ result.item }}: {{ 'RUNNING' if result.status.ActiveState == 'active' else 'STOPPED' }}
          {% elif not result.skipped | default(false) %}
          - {{ result.item }}: UNKNOWN
          {% endif %}
          {% endfor %}
          {% elif service_status_pgrep is defined %}
          {% for result in service_status_pgrep.results %}
          - {{ result.item }}: {{ 'RUNNING' if result.stdout == 'active' else 'STOPPED' }}
          {% endfor %}
          {% else %}
          - Service status not available
          {% endif %}

          ==========================================

    # Written on the controller, one file per host and run.
    # `ns.first` tracks whether a separating comma is needed, because entries
    # without status data are skipped inside the loop.
    - name: Generate JSON health report
      copy:
        content: |
          {
            "timestamp": "{{ health_check_timestamp }}",
            "hostname": "{{ inventory_hostname }}",
            "overall_status": "{{ health_status.overall }}",
            "system": {
              "uptime": "{{ health_status.uptime }}",
              "cpu_usage": {{ health_status.cpu }},
              "memory_usage": {{ health_status.memory }},
              "disk_usage": {{ health_status.disk }},
              "load_average": "{{ health_status.load }}"
            },
            "connectivity": {
              "internet": "{{ health_status.internet }}",
              "tailscale": "{{ health_status.tailscale }}"
            },
            "docker": "{{ docker_status.stdout | replace('\n', ' ') }}",
            "services": [
          {% if ansible_service_mgr == "systemd" and service_status is defined %}
          {% set ns = namespace(first=true) %}
          {% for result in service_status.results %}
          {% if result.status is defined and result.status.ActiveState is defined %}
              {% if not ns.first %},{% endif %}
              {
                "name": "{{ result.item }}",
                "status": "{{ result.status.ActiveState }}",
                {# to_json renders the boolean as JSON true/false rather than Python True/False #}
                "enabled": {{ ((result.status.UnitFileState | default('unknown')) == "enabled") | to_json }}
              }
          {% set ns.first = false %}
          {% endif %}
          {% endfor %}
          {% elif service_status_pgrep is defined %}
          {% set ns = namespace(first=true) %}
          {% for result in service_status_pgrep.results %}
              {% if not ns.first %},{% endif %}
              {
                "name": "{{ result.item }}",
                "status": "{{ result.stdout | default('unknown') }}",
                "enabled": null
              }
          {% set ns.first = false %}
          {% endfor %}
          {% endif %}
            ]
          }
        dest: "{{ health_report_file }}"
      delegate_to: localhost

    # Best effort: `|| true` plus ignore_errors keep a failed notification
    # from failing the play.
    # NOTE(review): this runs on the unhealthy host itself, so a host marked
    # CRITICAL because the connectivity check failed may be unable to reach
    # the ntfy endpoint - consider delegating to the controller.
    - name: Send alert for critical status
      shell: |
        if command -v curl >/dev/null 2>&1; then
          curl -d "🚨 CRITICAL: {{ inventory_hostname }} health check failed - {{ health_status.overall }}" \
            -H "Title: Homelab Health Alert" \
            -H "Priority: urgent" \
            -H "Tags: warning,health" \
            "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}" || true
        fi
      when: health_status.overall == "CRITICAL"
      ignore_errors: true

    - name: Summary message
      debug:
        msg: |

          📋 Health check complete for {{ inventory_hostname }}
          📊 Status: {{ health_status.overall }}
          📄 Report saved to: {{ health_report_file }}

          {% if health_status.overall == "CRITICAL" %}
          🚨 CRITICAL issues detected - immediate attention required!
          {% elif health_status.overall == "WARNING" %}
          ⚠️ WARNING conditions detected - monitoring recommended
          {% else %}
          ✅ System is healthy
          {% endif %}