Sanitized mirror from private repository - 2026-04-04 03:48:45 UTC
This commit is contained in:
246
ansible/automation/playbooks/health_check.yml
Normal file
246
ansible/automation/playbooks/health_check.yml
Normal file
@@ -0,0 +1,246 @@
|
||||
---
# Play header: runs against every inventory host, gathers facts, and defines
# the service list and percentage thresholds that the tasks below consume.
- name: Comprehensive Health Check
  hosts: all
  # Facts are required: ansible_date_time and ansible_service_mgr are read below.
  gather_facts: true
  vars:
    # Percentages; a metric strictly above a value trips that severity.
    health_thresholds:
      cpu_warning: 80
      cpu_critical: 95
      memory_warning: 85
      memory_critical: 95
      disk_warning: 85
      disk_critical: 95

    # Names looped over by the service-status tasks.
    critical_services:
      - docker
      - ssh
      - tailscaled

    # ISO 8601 time stamp written into the JSON report.
    health_check_timestamp: "{{ ansible_date_time.iso8601 }}"

  tasks:
|
||||
# Executed a single time on the control node (delegate_to + run_once):
# ensures the directory that receives the per-host JSON reports exists.
- name: Create health check report directory
  run_once: true
  delegate_to: localhost
  ansible.builtin.file:
    state: directory
    path: /tmp/health_reports
    mode: "0755"
|
||||
|
||||
# Captures the output of `uptime -p` in system_uptime.stdout for the report.
# The command module is used instead of shell because the invocation has no
# pipes, redirects or other shell syntax.
- name: Check system uptime
  ansible.builtin.command: uptime -p
  register: system_uptime
  # Read-only query: never report the host as changed.
  changed_when: false
|
||||
|
||||
# Extracts the first number on top's "Cpu(s)" summary line into cpu_usage.stdout.
# The label (everything up to and including the first colon) is removed before
# splitting on whitespace, so the value is still found when top prints it
# directly after the colon with no separating space. The two `cut` stages drop
# a trailing "%..." or ",..." suffix that may be glued to the number.
- name: Check CPU usage
  ansible.builtin.shell: |
    top -bn1 | grep "Cpu(s)" | sed 's/^[^:]*://' | awk '{print $1}' | cut -d'%' -f1 | cut -d',' -f1
  register: cpu_usage
  # Read-only query: never report the host as changed.
  changed_when: false
|
||||
|
||||
# Computes column 3 as a percentage of column 2 on the second line of `free`
# output, formatted with one decimal place, into memory_usage.stdout.
- name: Check memory usage
  register: memory_usage
  changed_when: false
  ansible.builtin.shell: |
    free | awk 'NR==2{printf "%.1f", $3*100/$2}'
|
||||
|
||||
# Stores the use percentage of the root filesystem (without the "%" sign) in
# disk_usage.stdout.
# `df -P` selects the POSIX output format, which keeps each filesystem on a
# single line; the awk filter relies on the data being on line 2, column 5.
- name: Check disk usage
  ansible.builtin.shell: |
    df -P / | awk 'NR==2{print $5}' | sed 's/%//'
  register: disk_usage
  # Read-only query: never report the host as changed.
  changed_when: false
|
||||
|
||||
# Keeps whatever `uptime` prints after "load average:", with leading spaces
# trimmed, in load_average.stdout.
- name: Check load average
  register: load_average
  changed_when: false
  ansible.builtin.shell: |
    uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//'
|
||||
|
||||
# Queries each entry of critical_services through the systemd module. No
# state or enabled value is requested; the registered results are read later
# via result.status.ActiveState / UnitFileState. Failures for individual
# units are tolerated so the remaining checks still run.
- name: Check critical services (systemd hosts only)
  when: ansible_service_mgr == "systemd"
  ansible.builtin.systemd:
    name: "{{ item }}"
  loop: "{{ critical_services }}"
  register: service_status
  ignore_errors: true
|
||||
|
||||
# Fallback for hosts whose service manager is not systemd: prints 'active'
# when a process whose name exactly equals the item exists, else 'inactive'.
# NOTE(review): `pgrep -x ssh` only matches a process literally named "ssh";
# confirm whether the daemon on these hosts is named "sshd" instead.
- name: Check critical services via pgrep (non-systemd hosts — Synology DSM etc.)
  when: ansible_service_mgr != "systemd"
  ansible.builtin.shell: "pgrep -x {{ item }} >/dev/null 2>&1 && echo 'active' || echo 'inactive'"
  loop: "{{ critical_services }}"
  register: service_status_pgrep
  ignore_errors: true
  changed_when: false
|
||||
|
||||
# When the docker CLI exists and `docker info` succeeds, prints three lines:
# running, total and unhealthy container counts. Otherwise prints
# "Docker not available". The text lands in docker_status.stdout.
- name: Check Docker containers (if Docker is running)
  register: docker_status
  ignore_errors: true
  changed_when: false
  ansible.builtin.shell: |
    if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
      echo "Running: $(docker ps -q | wc -l)"
      echo "Total: $(docker ps -aq | wc -l)"
      echo "Unhealthy: $(docker ps --filter health=unhealthy -q | wc -l)"
    else
      echo "Docker not available"
    fi
|
||||
|
||||
# Sends one ping to 8.8.8.8 and records "OK" or "FAILED" in
# internet_check.stdout; "FAILED" makes the overall status CRITICAL later on.
- name: Check network connectivity
  register: internet_check
  changed_when: false
  ansible.builtin.shell: |
    ping -c 1 8.8.8.8 >/dev/null 2>&1 && echo "OK" || echo "FAILED"
|
||||
|
||||
# Prints the `.Self.Online` field of `tailscale status --json` (via jq),
# "unknown" when the pipeline exits non-zero, or "not_installed" when the
# tailscale CLI is not on PATH. Only jq's stderr is silenced.
- name: Check Tailscale status
  register: tailscale_status
  ignore_errors: true
  changed_when: false
  ansible.builtin.shell: |
    if command -v tailscale >/dev/null 2>&1; then
      tailscale status --json | jq -r '.Self.Online' 2>/dev/null || echo "unknown"
    else
      echo "not_installed"
    fi
|
||||
|
||||
# Collapses the registered command results into one health_status fact.
# `overall` is CRITICAL when any metric exceeds its *_critical threshold or the
# internet check printed FAILED, WARNING when any metric exceeds its *_warning
# threshold, and HEALTHY otherwise. Comparisons are strict (>).
- name: Evaluate health status
  ansible.builtin.set_fact:
    health_status:
      cpu: "{{ cpu_usage.stdout | float }}"
      memory: "{{ memory_usage.stdout | float }}"
      disk: "{{ disk_usage.stdout | int }}"
      uptime: "{{ system_uptime.stdout }}"
      load: "{{ load_average.stdout }}"
      internet: "{{ internet_check.stdout }}"
      tailscale: "{{ tailscale_status.stdout }}"
      overall: >-
        {{
        'CRITICAL' if (
        (cpu_usage.stdout | float > health_thresholds.cpu_critical) or
        (memory_usage.stdout | float > health_thresholds.memory_critical) or
        (disk_usage.stdout | int > health_thresholds.disk_critical) or
        (internet_check.stdout == "FAILED")
        ) else 'WARNING' if (
        (cpu_usage.stdout | float > health_thresholds.cpu_warning) or
        (memory_usage.stdout | float > health_thresholds.memory_warning) or
        (disk_usage.stdout | int > health_thresholds.disk_warning)
        ) else 'HEALTHY'
        }}
|
||||
|
||||
# Prints a human-readable report for the current host. The service section
# uses the systemd results when the host runs systemd, the pgrep results
# otherwise, and a placeholder line when neither variable is defined.
- name: Display health report
  ansible.builtin.debug:
    msg: |

      ==========================================
      🏥 HEALTH CHECK REPORT - {{ inventory_hostname }}
      ==========================================

      📊 OVERALL STATUS: {{ health_status.overall }}

      🖥️ SYSTEM METRICS:
      - Uptime: {{ health_status.uptime }}
      - CPU Usage: {{ health_status.cpu }}%
      - Memory Usage: {{ health_status.memory }}%
      - Disk Usage: {{ health_status.disk }}%
      - Load Average: {{ health_status.load }}

      🌐 CONNECTIVITY:
      - Internet: {{ health_status.internet }}
      - Tailscale: {{ health_status.tailscale }}

      🐳 DOCKER STATUS:
      {{ docker_status.stdout }}

      🔧 CRITICAL SERVICES:
      {% if ansible_service_mgr == "systemd" and service_status is defined %}
      {% for result in service_status.results %}
      {% if result.status is defined and result.status.ActiveState is defined %}
      - {{ result.item }}: {{ 'RUNNING' if result.status.ActiveState == 'active' else 'STOPPED' }}
      {% elif not result.skipped | default(false) %}
      - {{ result.item }}: UNKNOWN
      {% endif %}
      {% endfor %}
      {% elif service_status_pgrep is defined %}
      {% for result in service_status_pgrep.results %}
      - {{ result.item }}: {{ 'RUNNING' if result.stdout == 'active' else 'STOPPED' }}
      {% endfor %}
      {% else %}
      - Service status not available
      {% endif %}

      ==========================================
|
||||
|
||||
# Writes one JSON report per host on the control node (delegate_to: localhost).
# Every value is serialized explicitly so the file is valid JSON:
#   - strings go through `to_json`, which adds the quotes and escapes any
#     quotes, backslashes or control characters found in command output;
#   - the "enabled" comparison goes through `to_json` so it renders as
#     true/false rather than the template engine's True/False;
#   - numeric metrics are cast with `float` / `int`.
# The `ns.first` flag emits a comma before every service object except the
# first one, so skipped entries never leave a dangling comma.
- name: Generate JSON health report
  ansible.builtin.copy:
    dest: "/tmp/health_reports/{{ inventory_hostname }}_health_{{ ansible_date_time.epoch }}.json"
    content: |
      {
        "timestamp": {{ health_check_timestamp | to_json }},
        "hostname": {{ inventory_hostname | to_json }},
        "overall_status": {{ health_status.overall | to_json }},
        "system": {
          "uptime": {{ health_status.uptime | to_json }},
          "cpu_usage": {{ health_status.cpu | float }},
          "memory_usage": {{ health_status.memory | float }},
          "disk_usage": {{ health_status.disk | int }},
          "load_average": {{ health_status.load | to_json }}
        },
        "connectivity": {
          "internet": {{ health_status.internet | to_json }},
          "tailscale": {{ health_status.tailscale | to_json }}
        },
        "docker": {{ docker_status.stdout | replace('\n', ' ') | to_json }},
        "services": [
      {% if ansible_service_mgr == "systemd" and service_status is defined %}
      {% set ns = namespace(first=true) %}
      {% for result in service_status.results %}
      {% if result.status is defined and result.status.ActiveState is defined %}
      {% if not ns.first %},{% endif %}
          {
            "name": {{ result.item | to_json }},
            "status": {{ result.status.ActiveState | to_json }},
            "enabled": {{ ((result.status.UnitFileState | default('unknown')) == "enabled") | to_json }}
          }
      {% set ns.first = false %}
      {% endif %}
      {% endfor %}
      {% elif service_status_pgrep is defined %}
      {% set ns = namespace(first=true) %}
      {% for result in service_status_pgrep.results %}
      {% if not ns.first %},{% endif %}
          {
            "name": {{ result.item | to_json }},
            "status": {{ result.stdout | default('unknown') | to_json }},
            "enabled": null
          }
      {% set ns.first = false %}
      {% endfor %}
      {% endif %}
        ]
      }
  delegate_to: localhost
|
||||
|
||||
# Only for hosts whose overall status is CRITICAL: posts a notification with
# curl to ntfy_url (or the default topic URL when that variable is unset).
# The task has no delegate_to, so curl is looked up and run on the target host.
# Delivery is best effort: `|| true` and ignore_errors keep the play going.
- name: Send alert for critical status
  when: health_status.overall == "CRITICAL"
  ignore_errors: true
  ansible.builtin.shell: |
    if command -v curl >/dev/null 2>&1; then
      curl -d "🚨 CRITICAL: {{ inventory_hostname }} health check failed - {{ health_status.overall }}" \
      -H "Title: Homelab Health Alert" \
      -H "Priority: urgent" \
      -H "Tags: warning,health" \
      "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}" || true
    fi
|
||||
|
||||
# Final per-host recap: overall status, the path of the JSON report, and a
# one-line verdict chosen from the CRITICAL / WARNING / healthy branches.
- name: Summary message
  ansible.builtin.debug:
    msg: |

      📋 Health check complete for {{ inventory_hostname }}
      📊 Status: {{ health_status.overall }}
      📄 Report saved to: /tmp/health_reports/{{ inventory_hostname }}_health_{{ ansible_date_time.epoch }}.json

      {% if health_status.overall == "CRITICAL" %}
      🚨 CRITICAL issues detected - immediate attention required!
      {% elif health_status.overall == "WARNING" %}
      ⚠️ WARNING conditions detected - monitoring recommended
      {% else %}
      ✅ System is healthy
      {% endif %}
|
||||
Reference in New Issue
Block a user