Files
homelab-optimized/ansible/automation/playbooks/alert_check.yml
Gitea Mirror Bot ac5a4ca940
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m3s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-01 04:44:34 UTC
2026-04-01 04:44:34 +00:00

419 lines
15 KiB
YAML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
---
# Alert Check and Notification Playbook
# Monitors system conditions and sends alerts when thresholds are exceeded
# Usage: ansible-playbook playbooks/alert_check.yml
# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"
- name: Infrastructure Alert Monitoring
  hosts: all
  # Facts must be gathered: the tasks below read ansible_date_time.
  gather_facts: true
  vars:
    # Base directory for reports; each host gets its own sub-directory.
    alert_config_dir: "/tmp/alerts"
    # Mode used when alert_mode is not supplied: production, test, silent
    default_alert_mode: "production"

    # Alert thresholds
    thresholds:
      cpu:
        warning: 80
        critical: 95
      memory:
        warning: 85
        critical: 95
      disk:
        warning: 85
        critical: 95
      load:
        warning: 4.0
        critical: 8.0
      # Number of containers down to trigger critical
      container_down_critical: 1

    # Notification settings (each value can be overridden with -e)
    notifications:
      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
      email_enabled: "{{ email_enabled | default(false) }}"
      slack_webhook: "{{ slack_webhook | default('') }}"

  tasks:
# Make sure the per-host report directory exists before a report is written.
- name: Create alert configuration directory
  file:
    state: directory
    path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
    mode: "0755"
# Print the effective mode and the warning/critical thresholds for this run.
- name: Display alert monitoring plan
  vars:
    # alert_mode is optional; fall back to the play-level default.
    effective_alert_mode: "{{ alert_mode | default(default_alert_mode) }}"
  debug:
    msg: |
      🚨 ALERT MONITORING INITIATED
      =============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ effective_alert_mode }}
      📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
      💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
      💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
      ⚖️ Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}
# Measures total CPU utilisation and encodes the severity in the exit code:
# 0 = OK, 1 = WARNING, 2 = CRITICAL. The result is registered as cpu_alert
# and evaluated by the "Evaluate overall alert status" task.
- name: Check CPU usage with alerting
  shell: |
    # Derive usage from the idle figure (100 - idle) so that the top path
    # agrees with the vmstat fallback, which also computes 100 - idle.
    # The sed pattern accepts both "97.8 id" and "97.8%id" layouts and
    # prints nothing when the line does not match.
    cpu_idle=$(LC_ALL=C top -bn1 2>/dev/null | grep "Cpu(s)" | sed -n 's/.*, *\([0-9.]*\)%* *id.*/\1/p' | head -n 1)
    if [ -n "$cpu_idle" ]; then
      cpu_usage=$(awk -v idle="$cpu_idle" 'BEGIN {printf "%.1f", 100 - idle}')
    else
      cpu_usage=$(vmstat 1 2 2>/dev/null | tail -1 | awk '{print 100-$15}')
    fi
    cpu_int=$(echo "$cpu_usage" | cut -d'.' -f1)
    echo "🖥️ CPU Usage: ${cpu_usage}%"
    # A missing or non-numeric reading must not be reported as OK.
    case "$cpu_int" in
      ''|*[!0-9]*)
        echo "WARNING:CPU:unable to determine usage"
        exit 1
        ;;
    esac
    if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
      echo "CRITICAL:CPU:${cpu_usage}%"
      exit 2
    elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
      echo "WARNING:CPU:${cpu_usage}%"
      exit 1
    else
      echo "OK:CPU:${cpu_usage}%"
      exit 0
    fi
  register: cpu_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Computes used/total memory from the second line of `free` as a whole
# percentage and encodes the severity in the exit code:
# 0 = OK, 1 = WARNING, 2 = CRITICAL. Registered as memory_alert.
- name: Check memory usage with alerting
  shell: |
    memory_usage=$(free 2>/dev/null | awk 'NR==2{printf "%.0f", $3*100/$2}')
    echo "💾 Memory Usage: ${memory_usage}%"
    # A missing or non-numeric reading must not be reported as OK.
    case "$memory_usage" in
      ''|*[!0-9]*)
        echo "WARNING:MEMORY:unable to determine usage"
        exit 1
        ;;
    esac
    if [ "$memory_usage" -gt "{{ thresholds.memory.critical }}" ]; then
      echo "CRITICAL:MEMORY:${memory_usage}%"
      exit 2
    elif [ "$memory_usage" -gt "{{ thresholds.memory.warning }}" ]; then
      echo "WARNING:MEMORY:${memory_usage}%"
      exit 1
    else
      echo "OK:MEMORY:${memory_usage}%"
      exit 0
    fi
  register: memory_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Compares the usage percentage of every mounted filesystem with the disk
# thresholds and encodes the worst severity in the exit code:
# 0 = OK, 1 = WARNING, 2 = CRITICAL. Registered as disk_alert.
- name: Check disk usage with alerting
  shell: |
    critical_disks=""
    warning_disks=""
    echo "💿 Disk Usage Check:"
    # The loop reads from a here-document rather than a pipeline so that it
    # runs in the current shell and both lists are still set afterwards.
    # This replaces the predictable /tmp/*_disks_$$ scratch files.
    # df -P keeps each filesystem on one line; awk strips the % sign.
    while read -r usage partition; do
      # Skip entries without a numeric usage figure (for example "-").
      case "$usage" in
        ''|*[!0-9]*) continue ;;
      esac
      echo " $partition: ${usage}%"
      if [ "$usage" -gt "{{ thresholds.disk.critical }}" ]; then
        echo "CRITICAL:DISK:$partition:${usage}%"
        critical_disks="${critical_disks:+$critical_disks }$partition:$usage"
      elif [ "$usage" -gt "{{ thresholds.disk.warning }}" ]; then
        echo "WARNING:DISK:$partition:${usage}%"
        warning_disks="${warning_disks:+$warning_disks }$partition:$usage"
      fi
    done <<EOF
    $(df -P | awk 'NR>1 {sub(/%/, "", $5); print $5 " " $6}')
    EOF
    if [ -n "$critical_disks" ]; then
      echo "Critical disk alerts:"
      # One "mount:usage" entry per line.
      echo "$critical_disks" | tr ' ' '\n'
      exit 2
    elif [ -n "$warning_disks" ]; then
      echo "Disk warnings:"
      echo "$warning_disks" | tr ' ' '\n'
      exit 1
    else
      echo "OK:DISK:All partitions normal"
      exit 0
    fi
  register: disk_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Compares the 1-minute load average with the load thresholds and encodes
# the severity in the exit code: 0 = OK, 1 = WARNING, 2 = CRITICAL.
# Registered as load_alert.
- name: Check load average with alerting
  shell: |
    # LC_ALL=C keeps the decimal separator a dot so the comma stripping
    # below only removes the field separator.
    load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
    echo "⚖️ Load Average (1min): $load_avg"
    # A missing or non-numeric reading must not be reported as OK.
    case "$load_avg" in
      ''|*[!0-9.]*)
        echo "WARNING:LOAD:unable to determine load average"
        exit 1
        ;;
    esac
    # awk performs the floating point comparison. The previous bc branch was
    # selected with "&>", which /bin/sh does not treat as a redirection, so
    # the bc test could succeed even when bc was not installed. Values are
    # passed with -v instead of being pasted into the awk program text.
    exceeds() {
      awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
    }
    if exceeds "$load_avg" "{{ thresholds.load.critical }}"; then
      echo "CRITICAL:LOAD:${load_avg}"
      exit 2
    elif exceeds "$load_avg" "{{ thresholds.load.warning }}"; then
      echo "WARNING:LOAD:${load_avg}"
      exit 1
    else
      echo "OK:LOAD:${load_avg}"
      exit 0
    fi
  register: load_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Counts total, running, stopped and unhealthy Docker containers and encodes
# the severity in the exit code: 0 = OK, 1 = WARNING, 2 = CRITICAL.
# Hosts without a usable Docker daemon report OK. Registered as docker_alert.
- name: Check Docker container health
  shell: |
    # POSIX redirections are used on purpose: "&>" is not a redirection in
    # /bin/sh, where it backgrounds the command and makes this guard pass
    # even when docker is missing.
    if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
      total_containers=$(docker ps -a -q | wc -l)
      running_containers=$(docker ps -q | wc -l)
      unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
      stopped_containers=$((total_containers - running_containers))
      echo "🐳 Docker Container Status:"
      echo " Total: $total_containers"
      echo " Running: $running_containers"
      echo " Stopped: $stopped_containers"
      echo " Unhealthy: $unhealthy_containers"
      # NOTE(review): "-gt" means CRITICAL needs more stopped containers than
      # container_down_critical, not that many; confirm the intended meaning.
      if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
        echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
        exit 2
      elif [ "$stopped_containers" -gt "0" ]; then
        echo "WARNING:DOCKER:$stopped_containers containers stopped"
        exit 1
      else
        echo "OK:DOCKER:All containers healthy"
        exit 0
      fi
    else
      echo " Docker not available - skipping container checks"
      echo "OK:DOCKER:Not installed"
      exit 0
    fi
  register: docker_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Verifies that each critical systemd unit is active. Exit code 2 (CRITICAL)
# when at least one is not, otherwise 0 (OK). Registered as services_alert.
- name: Check critical services
  shell: |
    failed_services=""
    echo "🔧 Critical Services Check:"
    # A plain word list replaces the former bash array: arrays are a syntax
    # error in /bin/sh (dash), which made the script exit before any check.
    # NOTE(review): the sshd unit is named differently on some
    # distributions; confirm "ssh" matches the managed hosts.
    for service in ssh systemd-resolved; do
      if systemctl is-active --quiet "$service" 2>/dev/null; then
        echo " ✅ $service: running"
      else
        echo " 🚨 $service: not running"
        failed_services="$failed_services $service"
      fi
    done
    if [ -n "$failed_services" ]; then
      echo "CRITICAL:SERVICES:$failed_services"
      exit 2
    else
      echo "OK:SERVICES:All critical services running"
      exit 0
    fi
  register: services_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Probes internet reachability (ping 8.8.8.8) and DNS resolution
# (nslookup google.com). Exit code 2 = no internet, 1 = DNS failure only,
# 0 = both fine. Registered as network_alert.
- name: Check network connectivity
  shell: |
    echo "🌐 Network Connectivity Check:"
    # POSIX redirections are used on purpose: "&>" is not a redirection in
    # /bin/sh, where it backgrounds the probe and makes the test always pass.
    # Check internet connectivity
    if ping -c 1 -W 5 8.8.8.8 >/dev/null 2>&1; then
      echo " ✅ Internet: OK"
      internet_status="OK"
    else
      echo " 🚨 Internet: FAILED"
      internet_status="FAILED"
    fi
    # Check DNS resolution
    if nslookup google.com >/dev/null 2>&1; then
      echo " ✅ DNS: OK"
      dns_status="OK"
    else
      echo " ⚠️ DNS: FAILED"
      dns_status="FAILED"
    fi
    if [ "$internet_status" = "FAILED" ]; then
      echo "CRITICAL:NETWORK:No internet connectivity"
      exit 2
    elif [ "$dns_status" = "FAILED" ]; then
      echo "WARNING:NETWORK:DNS resolution issues"
      exit 1
    else
      echo "OK:NETWORK:All connectivity normal"
      exit 0
    fi
  register: network_alert
  # The check only reads system state, so it never changes the host.
  changed_when: false
  # Non-zero exit codes carry the severity; they are not task failures.
  failed_when: false
# Folds the return codes of the seven checks into alert_summary:
# rc 2 counts as critical, rc 1 as warning; overall_status is the worst
# severity seen. Results without an rc are ignored.
- name: Evaluate overall alert status
  vars:
    # Return codes of every check that actually reported one.
    reported_rcs: >-
      {{
        [cpu_alert, memory_alert, disk_alert, load_alert, docker_alert, services_alert, network_alert]
        | selectattr('rc', 'defined')
        | map(attribute='rc')
        | list
      }}
    critical_total: "{{ reported_rcs | select('equalto', 2) | list | length }}"
    warning_total: "{{ reported_rcs | select('equalto', 1) | list | length }}"
  set_fact:
    alert_summary:
      critical_count: "{{ critical_total }}"
      warning_count: "{{ warning_total }}"
      overall_status: >-
        {{
          'CRITICAL' if (critical_total | int) > 0
          else 'WARNING' if (warning_total | int) > 0
          else 'OK'
        }}
# Writes a plain-text report with the summary and the raw output of every
# check to <alert_config_dir>/<host>/alert_report_<epoch>.txt. The final
# echo is captured in alert_report.stdout and shown by the summary task.
- name: Generate alert report
  shell: |
    alert_file={{ (alert_config_dir ~ '/' ~ inventory_hostname ~ '/alert_report_' ~ ansible_date_time.epoch ~ '.txt') | quote }}
    # One grouped redirection writes the whole report in a single pass.
    {
      echo "🚨 INFRASTRUCTURE ALERT REPORT"
      echo "==============================="
      echo "Host: {{ inventory_hostname }}"
      echo "Date: {{ ansible_date_time.iso8601 }}"
      echo "Overall Status: {{ alert_summary.overall_status }}"
      echo "Critical Alerts: {{ alert_summary.critical_count }}"
      echo "Warning Alerts: {{ alert_summary.warning_count }}"
      echo ""
      echo "📊 DETAILED RESULTS:"
      echo "==================="
    {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
      echo ""
      echo "{{ check | upper | replace('_ALERT', '') }}:"
      # Captured output is shell-quoted and printed with printf so that
      # quotes, "$" or backticks in it cannot break or inject into the script.
      printf '%s\n' {{ hostvars[inventory_hostname][check].stdout | default('No output') | quote }}
    {% endfor %}
    } > "$alert_file"
    echo "Alert report saved to: $alert_file"
  register: alert_report
# Posts an urgent ntfy message when the overall status is CRITICAL.
# Skipped in silent mode or when no ntfy URL is set; a failed request is
# ignored so the remaining tasks still run.
- name: Send NTFY notification for critical alerts
  when:
    - alert_summary.overall_status == "CRITICAL"
    - (alert_mode | default(default_alert_mode)) != "silent"
    - notifications.ntfy_url != ""
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: "Homelab Critical Alert"
      Priority: "urgent"
      Tags: "warning,critical,{{ inventory_hostname }}"
    body: |
      🚨 CRITICAL ALERT: {{ inventory_hostname }}
      Status: {{ alert_summary.overall_status }}
      Critical: {{ alert_summary.critical_count }}
      Warnings: {{ alert_summary.warning_count }}
      Time: {{ ansible_date_time.iso8601 }}
  ignore_errors: true
# Posts a default-priority ntfy message when the overall status is WARNING.
# Skipped in silent mode or when no ntfy URL is set; a failed request is
# ignored so the remaining tasks still run.
- name: Send NTFY notification for warning alerts
  when:
    - alert_summary.overall_status == "WARNING"
    - (alert_mode | default(default_alert_mode)) != "silent"
    - notifications.ntfy_url != ""
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: "Homelab Warning"
      Priority: "default"
      Tags: "warning,{{ inventory_hostname }}"
    body: |
      ⚠️ WARNING: {{ inventory_hostname }}
      Status: {{ alert_summary.overall_status }}
      Warnings: {{ alert_summary.warning_count }}
      Time: {{ ansible_date_time.iso8601 }}
  ignore_errors: true
# Posts a low-priority ntfy message whenever the run is in test mode,
# whatever the overall status is. A failed request is ignored so the
# remaining tasks still run.
- name: Send test notification
  when:
    - (alert_mode | default(default_alert_mode)) == "test"
    - notifications.ntfy_url != ""
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: "Homelab Alert Test"
      Priority: "low"
      Tags: "test,{{ inventory_hostname }}"
    body: |
      🧪 TEST ALERT: {{ inventory_hostname }}
      This is a test notification from the alert monitoring system.
      Status: {{ alert_summary.overall_status }}
      Time: {{ ansible_date_time.iso8601 }}
  ignore_errors: true
# Prints the final summary: mode, counts, per-check severity derived from
# each registered rc (2 = CRITICAL, 1 = WARNING, anything else = OK), the
# report location and suggested next steps.
- name: Display alert summary
  vars:
    # alert_mode is optional; fall back to the play-level default.
    effective_alert_mode: "{{ alert_mode | default(default_alert_mode) }}"
    # Names of the registered results, in display order.
    check_names:
      - cpu_alert
      - memory_alert
      - disk_alert
      - load_alert
      - docker_alert
      - services_alert
      - network_alert
  debug:
    msg: |
      🚨 ALERT MONITORING COMPLETE
      ============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ effective_alert_mode }}
      📊 ALERT SUMMARY:
      Overall Status: {{ alert_summary.overall_status }}
      Critical Alerts: {{ alert_summary.critical_count }}
      Warning Alerts: {{ alert_summary.warning_count }}
      📋 CHECK RESULTS:
      {% for check in check_names %}
      {% set check_rc = hostvars[inventory_hostname][check].rc | default(0) %}
      {{ check | replace('_alert', '') | upper }}: {{ 'CRITICAL' if check_rc == 2 else 'WARNING' if check_rc == 1 else 'OK' }}
      {% endfor %}
      {{ alert_report.stdout }}
      🔍 Next Steps:
      {% if alert_summary.overall_status == "CRITICAL" %}
      - 🚨 IMMEDIATE ACTION REQUIRED
      - Review critical alerts above
      - Check system resources and services
      {% elif alert_summary.overall_status == "WARNING" %}
      - ⚠️ Monitor system closely
      - Consider preventive maintenance
      {% else %}
      - ✅ System is healthy
      - Continue regular monitoring
      {% endif %}
      - Schedule regular checks: crontab -e
      - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt
      ============================