---
# Alert Check and Notification Playbook
# Monitors system conditions and sends alerts when thresholds are exceeded.
#
# Usage: ansible-playbook playbooks/alert_check.yml
# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"
#
# Check contract: every check task prints a human-readable status and exits
# with 0 (OK), 1 (WARNING) or 2 (CRITICAL). `failed_when: false` keeps the
# play running so that all checks are evaluated and summarised together.
# All embedded scripts are written for POSIX sh, because the shell module
# runs /bin/sh unless told otherwise.

- name: Infrastructure Alert Monitoring
  hosts: all
  gather_facts: true

  vars:
    alert_config_dir: "/tmp/alerts"
    default_alert_mode: "production"  # production, test, silent

    # Report file written on the managed host. The epoch comes from gathered
    # facts, so the name is stable for the whole run.
    alert_report_file: >-
      {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_{{ ansible_date_time.epoch }}.txt

    # systemd units that must be active; override per host or group where
    # the unit names differ (for example "sshd" instead of "ssh").
    critical_services:
      - ssh
      - systemd-resolved

    # Alert thresholds (a check fires when the value is strictly greater)
    thresholds:
      cpu:
        warning: 80
        critical: 95
      memory:
        warning: 85
        critical: 95
      disk:
        warning: 85
        critical: 95
      load:
        warning: 4.0
        critical: 8.0
      # CRITICAL when MORE than this many containers are stopped;
      # any stopped container below that limit is a WARNING.
      container_down_critical: 1

    # Notification settings
    notifications:
      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
      email_enabled: "{{ email_enabled | default(false) }}"
      slack_webhook: "{{ slack_webhook | default('') }}"

  tasks:
    - name: Create alert configuration directory
      file:
        path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
        state: directory
        mode: '0755'

    - name: Display alert monitoring plan
      debug:
        msg: |
          🚨 ALERT MONITORING INITIATED
          =============================
          🖥️ Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔔 Mode: {{ alert_mode | default(default_alert_mode) }}
          📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
          💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
          💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
          ⚖️ Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}

    # Reads the second field of top's "Cpu(s)" line; falls back to
    # 100 - idle from vmstat when top yields nothing.
    - name: Check CPU usage with alerting
      shell: |
        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | awk -F'%' '{print $1}')
        if [ -z "$cpu_usage" ]; then
          cpu_usage=$(vmstat 1 2 | tail -1 | awk '{print 100-$15}')
        fi

        # Integer part only: [ -gt ] cannot compare decimals.
        cpu_int=$(echo "$cpu_usage" | cut -d'.' -f1)

        echo "🖥️ CPU Usage: ${cpu_usage}%"

        if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
          echo "CRITICAL:CPU:${cpu_usage}%"
          exit 2
        elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
          echo "WARNING:CPU:${cpu_usage}%"
          exit 1
        else
          echo "OK:CPU:${cpu_usage}%"
          exit 0
        fi
      register: cpu_alert
      changed_when: false
      failed_when: false

    # used * 100 / total from the second line of `free`, rounded.
    - name: Check memory usage with alerting
      shell: |
        memory_usage=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')

        echo "💾 Memory Usage: ${memory_usage}%"

        if [ "$memory_usage" -gt "{{ thresholds.memory.critical }}" ]; then
          echo "CRITICAL:MEMORY:${memory_usage}%"
          exit 2
        elif [ "$memory_usage" -gt "{{ thresholds.memory.warning }}" ]; then
          echo "WARNING:MEMORY:${memory_usage}%"
          exit 1
        else
          echo "OK:MEMORY:${memory_usage}%"
          exit 0
        fi
      register: memory_alert
      changed_when: false
      failed_when: false

    # One awk pass over `df -P` (POSIX format: one line per filesystem, so
    # long device names cannot shift the columns). awk is the last command
    # of the pipeline, so its exit status becomes the task's rc. Entries
    # whose usage is not a plain number (e.g. "-") are skipped.
    - name: Check disk usage with alerting
      shell: |
        echo "💿 Disk Usage Check:"

        df -P | awk -v crit="{{ thresholds.disk.critical }}" -v warn="{{ thresholds.disk.warning }}" '
          BEGIN { worst = 0 }
          NR > 1 {
            usage = $5
            sub(/%/, "", usage)
            if (usage !~ /^[0-9]+$/) next
            printf "  %s: %s%%\n", $6, usage
            if (usage + 0 > crit + 0) {
              printf "CRITICAL:DISK:%s:%s%%\n", $6, usage
              critical_list = critical_list $6 ":" usage "\n"
              worst = 2
            } else if (usage + 0 > warn + 0) {
              printf "WARNING:DISK:%s:%s%%\n", $6, usage
              warning_list = warning_list $6 ":" usage "\n"
              if (worst < 1) worst = 1
            }
          }
          END {
            if (worst == 2) {
              printf "Critical disk alerts:\n%s", critical_list
            } else if (worst == 1) {
              printf "Disk warnings:\n%s", warning_list
            } else {
              print "OK:DISK:All partitions normal"
            }
            exit worst
          }'
      register: disk_alert
      changed_when: false
      failed_when: false

    # Compares the 1-minute load average against the thresholds with awk
    # (floating point). LC_ALL=C keeps the decimal separator a dot so the
    # comma-stripping below cannot corrupt the number.
    - name: Check load average with alerting
      shell: |
        load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')

        echo "⚖️ Load Average (1min): $load_avg"

        # exceeds VALUE LIMIT -> succeeds when VALUE > LIMIT
        exceeds() {
          awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
        }

        if exceeds "$load_avg" "{{ thresholds.load.critical }}"; then
          echo "CRITICAL:LOAD:${load_avg}"
          exit 2
        elif exceeds "$load_avg" "{{ thresholds.load.warning }}"; then
          echo "WARNING:LOAD:${load_avg}"
          exit 1
        else
          echo "OK:LOAD:${load_avg}"
          exit 0
        fi
      register: load_alert
      changed_when: false
      failed_when: false

    # Skipped (reported OK) when the docker CLI or daemon is unavailable.
    - name: Check Docker container health
      shell: |
        if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
          total_containers=$(docker ps -a -q | wc -l)
          running_containers=$(docker ps -q | wc -l)
          unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
          stopped_containers=$((total_containers - running_containers))

          echo "🐳 Docker Container Status:"
          echo "  Total: $total_containers"
          echo "  Running: $running_containers"
          echo "  Stopped: $stopped_containers"
          echo "  Unhealthy: $unhealthy_containers"

          if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
            echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
            exit 2
          elif [ "$stopped_containers" -gt "0" ]; then
            echo "WARNING:DOCKER:$stopped_containers containers stopped"
            exit 1
          else
            echo "OK:DOCKER:All containers healthy"
            exit 0
          fi
        else
          echo "ℹ️ Docker not available - skipping container checks"
          echo "OK:DOCKER:Not installed"
          exit 0
        fi
      register: docker_alert
      changed_when: false
      failed_when: false

    # Any inactive unit from `critical_services` makes the check CRITICAL.
    - name: Check critical services
      shell: |
        failed_services=""

        echo "🔧 Critical Services Check:"

        for service in {{ critical_services | map('quote') | join(' ') }}; do
          if systemctl is-active --quiet "$service" 2>/dev/null; then
            echo "  ✅ $service: running"
          else
            echo "  🚨 $service: not running"
            failed_services="$failed_services $service"
          fi
        done

        if [ -n "$failed_services" ]; then
          echo "CRITICAL:SERVICES:$failed_services"
          exit 2
        else
          echo "OK:SERVICES:All critical services running"
          exit 0
        fi
      register: services_alert
      changed_when: false
      failed_when: false

    # A failed ping to 8.8.8.8 is CRITICAL; a failed DNS lookup alone is a
    # WARNING.
    - name: Check network connectivity
      shell: |
        echo "🌐 Network Connectivity Check:"

        # Check internet connectivity
        if ping -c 1 -W 5 8.8.8.8 >/dev/null 2>&1; then
          echo "  ✅ Internet: OK"
          internet_status="OK"
        else
          echo "  🚨 Internet: FAILED"
          internet_status="FAILED"
        fi

        # Check DNS resolution
        if nslookup google.com >/dev/null 2>&1; then
          echo "  ✅ DNS: OK"
          dns_status="OK"
        else
          echo "  ⚠️ DNS: FAILED"
          dns_status="FAILED"
        fi

        if [ "$internet_status" = "FAILED" ]; then
          echo "CRITICAL:NETWORK:No internet connectivity"
          exit 2
        elif [ "$dns_status" = "FAILED" ]; then
          echo "WARNING:NETWORK:DNS resolution issues"
          exit 1
        else
          echo "OK:NETWORK:All connectivity normal"
          exit 0
        fi
      register: network_alert
      changed_when: false
      failed_when: false

    # Collect the return codes once; results without an `rc` (for example
    # tasks skipped in check mode) are left out of the count.
    - name: Evaluate overall alert status
      vars:
        check_return_codes: >-
          {{ [cpu_alert, memory_alert, disk_alert, load_alert,
              docker_alert, services_alert, network_alert]
             | selectattr('rc', 'defined')
             | map(attribute='rc')
             | list }}
      set_fact:
        alert_summary:
          critical_count: "{{ check_return_codes | select('equalto', 2) | list | length }}"
          warning_count: "{{ check_return_codes | select('equalto', 1) | list | length }}"
          overall_status: >-
            {{ 'CRITICAL' if 2 in check_return_codes
               else 'WARNING' if 1 in check_return_codes
               else 'OK' }}

    # Written with `copy` rather than shell `echo`, so quotes, `$` and
    # backticks in the captured check output are stored verbatim instead of
    # being re-interpreted by a shell.
    - name: Generate alert report
      copy:
        dest: "{{ alert_report_file }}"
        mode: '0644'
        content: |
          🚨 INFRASTRUCTURE ALERT REPORT
          ===============================
          Host: {{ inventory_hostname }}
          Date: {{ ansible_date_time.iso8601 }}
          Overall Status: {{ alert_summary.overall_status }}
          Critical Alerts: {{ alert_summary.critical_count }}
          Warning Alerts: {{ alert_summary.warning_count }}

          📊 DETAILED RESULTS:
          ===================
          {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}

          {{ check | upper | replace('_ALERT', '') }}:
          {{ hostvars[inventory_hostname][check].stdout | default('No output') }}
          {% endfor %}
      register: alert_report

    - name: Send NTFY notification for critical alerts
      uri:
        url: "{{ notifications.ntfy_url }}"
        method: POST
        body: |
          🚨 CRITICAL ALERT: {{ inventory_hostname }}
          Status: {{ alert_summary.overall_status }}
          Critical: {{ alert_summary.critical_count }}
          Warnings: {{ alert_summary.warning_count }}
          Time: {{ ansible_date_time.iso8601 }}
        headers:
          Title: "Homelab Critical Alert"
          Priority: "urgent"
          Tags: "warning,critical,{{ inventory_hostname }}"
      when:
        - alert_summary.overall_status == "CRITICAL"
        - alert_mode | default(default_alert_mode) != "silent"
        - notifications.ntfy_url != ""
      # Best effort: a failed notification must not fail the monitoring run.
      ignore_errors: true

    - name: Send NTFY notification for warning alerts
      uri:
        url: "{{ notifications.ntfy_url }}"
        method: POST
        body: |
          ⚠️ WARNING: {{ inventory_hostname }}
          Status: {{ alert_summary.overall_status }}
          Warnings: {{ alert_summary.warning_count }}
          Time: {{ ansible_date_time.iso8601 }}
        headers:
          Title: "Homelab Warning"
          Priority: "default"
          Tags: "warning,{{ inventory_hostname }}"
      when:
        - alert_summary.overall_status == "WARNING"
        - alert_mode | default(default_alert_mode) != "silent"
        - notifications.ntfy_url != ""
      # Best effort: a failed notification must not fail the monitoring run.
      ignore_errors: true

    # Sent in test mode regardless of the overall status.
    - name: Send test notification
      uri:
        url: "{{ notifications.ntfy_url }}"
        method: POST
        body: |
          🧪 TEST ALERT: {{ inventory_hostname }}
          This is a test notification from the alert monitoring system.
          Status: {{ alert_summary.overall_status }}
          Time: {{ ansible_date_time.iso8601 }}
        headers:
          Title: "Homelab Alert Test"
          Priority: "low"
          Tags: "test,{{ inventory_hostname }}"
      when:
        - alert_mode | default(default_alert_mode) == "test"
        - notifications.ntfy_url != ""
      # Best effort: a failed notification must not fail the monitoring run.
      ignore_errors: true

    - name: Display alert summary
      debug:
        msg: |
          🚨 ALERT MONITORING COMPLETE
          ============================

          🖥️ Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔔 Mode: {{ alert_mode | default(default_alert_mode) }}

          📊 ALERT SUMMARY:
          Overall Status: {{ alert_summary.overall_status }}
          Critical Alerts: {{ alert_summary.critical_count }}
          Warning Alerts: {{ alert_summary.warning_count }}

          📋 CHECK RESULTS:
          {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
          {{ check | replace('_alert', '') | upper }}: {{ 'CRITICAL' if hostvars[inventory_hostname][check].rc | default(0) == 2 else 'WARNING' if hostvars[inventory_hostname][check].rc | default(0) == 1 else 'OK' }}
          {% endfor %}

          Alert report saved to: {{ alert_report_file }}

          🔍 Next Steps:
          {% if alert_summary.overall_status == "CRITICAL" %}
          - 🚨 IMMEDIATE ACTION REQUIRED
          - Review critical alerts above
          - Check system resources and services
          {% elif alert_summary.overall_status == "WARNING" %}
          - ⚠️ Monitor system closely
          - Consider preventive maintenance
          {% else %}
          - ✅ System is healthy
          - Continue regular monitoring
          {% endif %}
          - Schedule regular checks: crontab -e
          - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt

          ============================