Files
homelab-optimized/ansible/automation/playbooks/health_check.yml
Gitea Mirror Bot 24f1036b45
Some checks failed
Documentation / Deploy to GitHub Pages (push) Has been cancelled
Documentation / Build Docusaurus (push) Has been cancelled
Sanitized mirror from private repository - 2026-04-16 07:04:43 UTC
2026-04-16 07:04:43 +00:00

247 lines
8.7 KiB
YAML

---
# Comprehensive health check playbook.
#
# For every host in the inventory this play:
#   1. collects uptime, CPU, memory, root-filesystem and load figures,
#   2. checks the services listed in `critical_services` (systemd module on
#      systemd hosts, pgrep everywhere else),
#   3. probes Docker, internet reachability (ping 8.8.8.8) and Tailscale,
#   4. derives an overall HEALTHY / WARNING / CRITICAL status from
#      `health_thresholds` and the ping result,
#   5. writes a JSON report on the control node and posts an alert to
#      `ntfy_url` when the status is CRITICAL.
#
# The play-level vars below can be overridden with extra vars (-e).
- name: Comprehensive Health Check
  hosts: all
  gather_facts: true
  vars:
    health_check_timestamp: "{{ ansible_date_time.iso8601 }}"
    # Directory (on the control node) that receives the per-host reports.
    health_report_dir: /tmp/health_reports
    # Single source of truth for the report path; used by the writer task and
    # by the summary message so the two can never drift apart.
    health_report_file: "{{ health_report_dir }}/{{ inventory_hostname }}_health_{{ ansible_date_time.epoch }}.json"
    critical_services:
      - docker
      - ssh
      - tailscaled
    # Percent thresholds; a metric strictly above the value trips the level.
    health_thresholds:
      cpu_warning: 80
      cpu_critical: 95
      memory_warning: 85
      memory_critical: 95
      disk_warning: 85
      disk_critical: 95

  tasks:
    # Reports are written on the control node, so the directory is created
    # there, once for the whole run.
    - name: Create health check report directory
      file:
        path: "{{ health_report_dir }}"
        state: directory
        mode: "0755"
      delegate_to: localhost
      run_once: true

    # No pipes or redirects here, so `command` is sufficient.
    - name: Check system uptime
      command: uptime -p
      register: system_uptime
      changed_when: false

    # NOTE(review): this keeps only the first figure on top's "Cpu(s)" line
    # (presumably user time), so it is likely lower than total CPU
    # utilisation - confirm that this is the intended metric.
    - name: Check CPU usage
      shell: |
        top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1 | cut -d',' -f1
      register: cpu_usage
      changed_when: false

    # Used memory as a percentage of total, one decimal place.
    - name: Check memory usage
      shell: |
        free | awk 'NR==2{printf "%.1f", $3*100/$2}'
      register: memory_usage
      changed_when: false

    # `-P` forces the POSIX one-line-per-filesystem layout; without it a long
    # device name can wrap and line 2 would not contain the percentage.
    - name: Check disk usage
      shell: |
        df -P / | awk 'NR==2{print $5}' | sed 's/%//'
      register: disk_usage
      changed_when: false

    - name: Check load average
      shell: |
        uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//'
      register: load_average
      changed_when: false

    # No `state` is requested: the task exists only to register each unit's
    # properties (status.ActiveState / status.UnitFileState), which the
    # report tasks below read.
    - name: Check critical services (systemd hosts only)
      systemd:
        name: "{{ item }}"
      register: service_status
      loop: "{{ critical_services }}"
      ignore_errors: true
      when: ansible_service_mgr == "systemd"

    # Fallback for hosts without systemd: a process with the exact service
    # name counts as "active".
    - name: Check critical services via pgrep (non-systemd hosts — Synology DSM etc.)
      shell: "pgrep -x {{ item }} >/dev/null 2>&1 && echo 'active' || echo 'inactive'"
      register: service_status_pgrep
      loop: "{{ critical_services }}"
      changed_when: false
      ignore_errors: true
      when: ansible_service_mgr != "systemd"

    # Prints three "Label: count" lines, or a single "Docker not available"
    # line when the CLI is missing or the daemon does not answer.
    - name: Check Docker containers (if Docker is running)
      shell: |
        if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
          echo "Running: $(docker ps -q | wc -l)"
          echo "Total: $(docker ps -aq | wc -l)"
          echo "Unhealthy: $(docker ps --filter health=unhealthy -q | wc -l)"
        else
          echo "Docker not available"
        fi
      register: docker_status
      changed_when: false
      ignore_errors: true

    # stdout is exactly "OK" or "FAILED"; "FAILED" makes the host CRITICAL.
    - name: Check network connectivity
      shell: |
        ping -c 1 8.8.8.8 >/dev/null 2>&1 && echo "OK" || echo "FAILED"
      register: internet_check
      changed_when: false

    # stdout is jq's rendering of .Self.Online, "unknown" when the pipeline
    # fails (e.g. jq missing), or "not_installed" when there is no CLI.
    - name: Check Tailscale status
      shell: |
        if command -v tailscale >/dev/null 2>&1; then
          tailscale status --json | jq -r '.Self.Online' 2>/dev/null || echo "unknown"
        else
          echo "not_installed"
        fi
      register: tailscale_status
      changed_when: false
      ignore_errors: true

    # Collapse the raw command output into one fact. Unparsable output turns
    # into 0 via the float/int filters, i.e. it never trips a threshold.
    # Service state is reported but does not influence `overall`.
    - name: Evaluate health status
      set_fact:
        health_status:
          overall: >-
            {{
              'CRITICAL' if (
                (cpu_usage.stdout | float > health_thresholds.cpu_critical) or
                (memory_usage.stdout | float > health_thresholds.memory_critical) or
                (disk_usage.stdout | int > health_thresholds.disk_critical) or
                (internet_check.stdout == "FAILED")
              ) else 'WARNING' if (
                (cpu_usage.stdout | float > health_thresholds.cpu_warning) or
                (memory_usage.stdout | float > health_thresholds.memory_warning) or
                (disk_usage.stdout | int > health_thresholds.disk_warning)
              ) else 'HEALTHY'
            }}
          cpu: "{{ cpu_usage.stdout | float }}"
          memory: "{{ memory_usage.stdout | float }}"
          disk: "{{ disk_usage.stdout | int }}"
          uptime: "{{ system_uptime.stdout }}"
          load: "{{ load_average.stdout }}"
          internet: "{{ internet_check.stdout }}"
          tailscale: "{{ tailscale_status.stdout }}"

    # Human-readable report on the console. Service lines come from whichever
    # of the two service checks actually ran on this host.
    - name: Display health report
      debug:
        msg: |
          ==========================================
          🏥 HEALTH CHECK REPORT - {{ inventory_hostname }}
          ==========================================
          📊 OVERALL STATUS: {{ health_status.overall }}
          🖥️ SYSTEM METRICS:
          - Uptime: {{ health_status.uptime }}
          - CPU Usage: {{ health_status.cpu }}%
          - Memory Usage: {{ health_status.memory }}%
          - Disk Usage: {{ health_status.disk }}%
          - Load Average: {{ health_status.load }}
          🌐 CONNECTIVITY:
          - Internet: {{ health_status.internet }}
          - Tailscale: {{ health_status.tailscale }}
          🐳 DOCKER STATUS:
          {{ docker_status.stdout }}
          🔧 CRITICAL SERVICES:
          {% if ansible_service_mgr == "systemd" and service_status is defined %}
          {% for result in service_status.results %}
          {% if result.status is defined and result.status.ActiveState is defined %}
          - {{ result.item }}: {{ 'RUNNING' if result.status.ActiveState == 'active' else 'STOPPED' }}
          {% elif not result.skipped | default(false) %}
          - {{ result.item }}: UNKNOWN
          {% endif %}
          {% endfor %}
          {% elif service_status_pgrep is defined %}
          {% for result in service_status_pgrep.results %}
          - {{ result.item }}: {{ 'RUNNING' if (result.stdout | default('')) == 'active' else 'STOPPED' }}
          {% endfor %}
          {% else %}
          - Service status not available
          {% endif %}
          ==========================================

    # The report is assembled as a data structure and serialised with
    # to_nice_json instead of being concatenated by hand: that guarantees
    # valid JSON (escaped strings, true/false/null literals, no stray commas).
    # systemd services whose status could not be read are included with
    # status "unknown", matching the UNKNOWN line of the console report.
    - name: Generate JSON health report
      copy:
        content: |
          {% set ns = namespace(services=[]) %}
          {% if ansible_service_mgr == "systemd" and service_status is defined %}
          {% for result in service_status.results %}
          {% if result.status is defined and result.status.ActiveState is defined %}
          {% set ns.services = ns.services + [{
            'name': result.item,
            'status': result.status.ActiveState,
            'enabled': (result.status.UnitFileState | default('unknown')) == 'enabled'
          }] %}
          {% elif not result.skipped | default(false) %}
          {% set ns.services = ns.services + [{
            'name': result.item,
            'status': 'unknown',
            'enabled': none
          }] %}
          {% endif %}
          {% endfor %}
          {% elif service_status_pgrep is defined %}
          {% for result in service_status_pgrep.results %}
          {% set ns.services = ns.services + [{
            'name': result.item,
            'status': result.stdout | default('unknown'),
            'enabled': none
          }] %}
          {% endfor %}
          {% endif %}
          {{
            {
              'timestamp': health_check_timestamp,
              'hostname': inventory_hostname,
              'overall_status': health_status.overall,
              'system': {
                'uptime': health_status.uptime,
                'cpu_usage': health_status.cpu | float,
                'memory_usage': health_status.memory | float,
                'disk_usage': health_status.disk | int,
                'load_average': health_status.load
              },
              'connectivity': {
                'internet': health_status.internet,
                'tailscale': health_status.tailscale
              },
              'docker': docker_status.stdout | default('') | replace('\n', ' '),
              'services': ns.services
            } | to_nice_json
          }}
        dest: "{{ health_report_file }}"
        mode: "0644"
      delegate_to: localhost

    # Best effort: a missing curl or a failed POST must not fail the play.
    # NOTE(review): curl runs on the checked host itself, so a host that is
    # CRITICAL because its ping failed may be unable to deliver this alert -
    # consider delegating to the control node.
    - name: Send alert for critical status
      shell: |
        if command -v curl >/dev/null 2>&1; then
          curl -d "🚨 CRITICAL: {{ inventory_hostname }} health check failed - {{ health_status.overall }}" \
            -H "Title: Homelab Health Alert" \
            -H "Priority: urgent" \
            -H "Tags: warning,health" \
            "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}" || true
        fi
      when: health_status.overall == "CRITICAL"
      ignore_errors: true

    - name: Summary message
      debug:
        msg: |
          📋 Health check complete for {{ inventory_hostname }}
          📊 Status: {{ health_status.overall }}
          📄 Report saved to: {{ health_report_file }}
          {% if health_status.overall == "CRITICAL" %}
          🚨 CRITICAL issues detected - immediate attention required!
          {% elif health_status.overall == "WARNING" %}
          ⚠️ WARNING conditions detected - monitoring recommended
          {% else %}
          ✅ System is healthy
          {% endif %}