419 lines
15 KiB
YAML
419 lines
15 KiB
YAML
---
|
||
# Alert Check and Notification Playbook
|
||
# Monitors system conditions and sends alerts when thresholds are exceeded
|
||
# Usage: ansible-playbook playbooks/alert_check.yml
|
||
# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"
|
||
|
||
- name: Infrastructure Alert Monitoring
  hosts: all
  # Facts are required: the tasks below read ansible_date_time.
  gather_facts: true

  vars:
    # Base directory for reports; each host gets its own sub-directory.
    alert_config_dir: /tmp/alerts
    # Fallback when no alert_mode extra-var is passed.
    default_alert_mode: production  # production, test, silent

    # Alert thresholds (a check fires when the measured value is strictly
    # greater than the threshold).
    thresholds:
      cpu: {warning: 80, critical: 95}
      memory: {warning: 85, critical: 95}
      disk: {warning: 85, critical: 95}
      load: {warning: 4.0, critical: 8.0}
      # Critical once MORE than this many containers are stopped.
      container_down_critical: 1

    # Notification settings; each value can be overridden by a variable of
    # the same name (for example via -e).
    notifications:
      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
      email_enabled: "{{ email_enabled | default(false) }}"
      slack_webhook: "{{ slack_webhook | default('') }}"

  tasks:
|
||
- name: Create alert configuration directory
  # Per-host directory under alert_config_dir.
  file:
    state: directory
    path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
    mode: "0755"
|
||
|
||
- name: Display alert monitoring plan
  vars:
    # Effective mode: the alert_mode extra-var if given, else the play default.
    _plan_mode: "{{ alert_mode | default(default_alert_mode) }}"
  debug:
    # Thresholds are printed as warning/critical pairs.
    msg: |
      🚨 ALERT MONITORING INITIATED
      =============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ _plan_mode }}
      📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
      💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
      💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
      ⚖️ Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}
|
||
|
||
- name: Check CPU usage with alerting
  # Exit code carries the verdict: 0 = OK, 1 = WARNING, 2 = CRITICAL.
  # failed_when stays false so a non-zero verdict never aborts the play;
  # changed_when is false because the check only reads system state.
  shell: |
    # Busy CPU = 100 - idle. The previous version read only the "us" (user)
    # column of top, which ignored system/iowait time and disagreed with the
    # vmstat fallback below. The sed expression accepts both top layouts
    # ("98.2 id," and "98.2%id,").
    cpu_usage=$(LC_ALL=C top -bn1 2>/dev/null | grep "Cpu(s)" | head -n 1 \
      | sed -n 's/.*[ ,]\([0-9.][0-9.]*\)[ %]*id.*/\1/p' \
      | awk '{printf "%.1f", 100 - $1}')
    if [ -z "$cpu_usage" ]; then
      # Column 15 of vmstat is the idle percentage.
      cpu_usage=$(vmstat 1 2 | tail -1 | awk '{print 100-$15}')
    fi

    # Integer part only: [ -gt ] cannot compare decimals.
    cpu_int=${cpu_usage%%.*}

    # Do not report OK when no usable number could be obtained.
    case "$cpu_int" in
      ''|*[!0-9]*)
        echo "WARNING:CPU:unable to determine usage"
        exit 1
        ;;
    esac

    echo "🖥️ CPU Usage: ${cpu_usage}%"

    if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
      echo "CRITICAL:CPU:${cpu_usage}%"
      exit 2
    elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
      echo "WARNING:CPU:${cpu_usage}%"
      exit 1
    else
      echo "OK:CPU:${cpu_usage}%"
      exit 0
    fi
  register: cpu_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Check memory usage with alerting
  # Exit code carries the verdict: 0 = OK, 1 = WARNING, 2 = CRITICAL.
  # The check is read-only, so it must not be reported as "changed".
  shell: |
    # Second line of free is the memory row: used ($3) as a rounded
    # percentage of total ($2).
    memory_usage=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')

    echo "💾 Memory Usage: ${memory_usage}%"

    if [ "$memory_usage" -gt "{{ thresholds.memory.critical }}" ]; then
      echo "CRITICAL:MEMORY:${memory_usage}%"
      exit 2
    elif [ "$memory_usage" -gt "{{ thresholds.memory.warning }}" ]; then
      echo "WARNING:MEMORY:${memory_usage}%"
      exit 1
    else
      echo "OK:MEMORY:${memory_usage}%"
      exit 0
    fi
  register: memory_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Check disk usage with alerting
  # Exit code carries the verdict: 0 = OK, 1 = WARNING, 2 = CRITICAL.
  # A single awk pass replaces the former while-loop that passed results
  # through predictable /tmp/*_$$ files. df -P guarantees one row per
  # filesystem, and rows without a numeric usage figure are skipped instead
  # of breaking the integer comparison.
  shell: |
    echo "💿 Disk Usage Check:"
    df -P | awk -v crit="{{ thresholds.disk.critical }}" -v warn="{{ thresholds.disk.warning }}" '
      NR > 1 {
        usage = $5
        sub(/%$/, "", usage)
        if (usage !~ /^[0-9]+$/) next
        printf " %s: %s%%\n", $6, usage
        if (usage + 0 > crit + 0) {
          printf "CRITICAL:DISK:%s:%s%%\n", $6, usage
          critical = critical $6 ":" usage "\n"
        } else if (usage + 0 > warn + 0) {
          printf "WARNING:DISK:%s:%s%%\n", $6, usage
          warning = warning $6 ":" usage "\n"
        }
      }
      END {
        if (critical != "") {
          printf "Critical disk alerts:\n%s", critical
          exit 2
        }
        if (warning != "") {
          printf "Disk warnings:\n%s", warning
          exit 1
        }
        print "OK:DISK:All partitions normal"
        exit 0
      }
    '
  register: disk_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Check load average with alerting
  # Exit code carries the verdict: 0 = OK, 1 = WARNING, 2 = CRITICAL.
  # The script is POSIX sh: the shell module runs /bin/sh by default, where
  # the bash-only "&>" redirect backgrounds the command and makes the test
  # always succeed.
  shell: |
    # 1-minute load: first value after "load average:", trailing comma removed.
    load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')

    echo "⚖️ Load Average (1min): $load_avg"

    # Use bc for floating point comparison if available, otherwise use awk.
    # Both print 1 when the load exceeds the threshold.
    if command -v bc >/dev/null 2>&1; then
      critical_check=$(echo "$load_avg > {{ thresholds.load.critical }}" | bc -l)
      warning_check=$(echo "$load_avg > {{ thresholds.load.warning }}" | bc -l)
    else
      # Values are passed with -v rather than spliced into the program text.
      critical_check=$(awk -v cur="$load_avg" -v lim="{{ thresholds.load.critical }}" 'BEGIN {print (cur + 0 > lim + 0)}')
      warning_check=$(awk -v cur="$load_avg" -v lim="{{ thresholds.load.warning }}" 'BEGIN {print (cur + 0 > lim + 0)}')
    fi

    if [ "$critical_check" = "1" ]; then
      echo "CRITICAL:LOAD:${load_avg}"
      exit 2
    elif [ "$warning_check" = "1" ]; then
      echo "WARNING:LOAD:${load_avg}"
      exit 1
    else
      echo "OK:LOAD:${load_avg}"
      exit 0
    fi
  register: load_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Check Docker container health
  # Exit code carries the verdict: 0 = OK, 1 = WARNING, 2 = CRITICAL.
  # Uses POSIX redirects (">/dev/null 2>&1"): under the shell module's
  # default /bin/sh the bash-only "&>" made the availability test always
  # true, so hosts without Docker were reported as "All containers healthy".
  shell: |
    if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
      total_containers=$(docker ps -a -q | wc -l)
      running_containers=$(docker ps -q | wc -l)
      unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
      # Everything that exists but is not running counts as stopped.
      stopped_containers=$((total_containers - running_containers))

      echo "🐳 Docker Container Status:"
      echo " Total: $total_containers"
      echo " Running: $running_containers"
      echo " Stopped: $stopped_containers"
      echo " Unhealthy: $unhealthy_containers"

      # Critical: any unhealthy container, or more stopped containers than
      # the configured limit. Warning: at least one stopped container.
      if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
        echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
        exit 2
      elif [ "$stopped_containers" -gt "0" ]; then
        echo "WARNING:DOCKER:$stopped_containers containers stopped"
        exit 1
      else
        echo "OK:DOCKER:All containers healthy"
        exit 0
      fi
    else
      echo "ℹ️ Docker not available - skipping container checks"
      echo "OK:DOCKER:Not installed"
      exit 0
    fi
  register: docker_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Check critical services
  # Exit code carries the verdict: 0 = OK, 2 = CRITICAL.
  # The service list is a plain word list instead of a bash array: the shell
  # module runs /bin/sh by default, where array syntax is a parse error whose
  # exit status (2 on dash) was being counted as a CRITICAL alert.
  shell: |
    failed_services=""

    echo "🔧 Critical Services Check:"

    for service in ssh systemd-resolved; do
      if systemctl is-active --quiet "$service" 2>/dev/null; then
        echo " ✅ $service: running"
      else
        echo " 🚨 $service: not running"
        failed_services="$failed_services $service"
      fi
    done

    if [ -n "$failed_services" ]; then
      echo "CRITICAL:SERVICES:$failed_services"
      exit 2
    else
      echo "OK:SERVICES:All critical services running"
      exit 0
    fi
  register: services_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Check network connectivity
  # Exit code carries the verdict: 0 = OK, 1 = WARNING (DNS), 2 = CRITICAL
  # (no ping reply). Uses POSIX redirects: under the shell module's default
  # /bin/sh the bash-only "&>" backgrounded ping/nslookup and made both
  # tests always succeed, so outages were never reported.
  shell: |
    echo "🌐 Network Connectivity Check:"

    # Check internet connectivity (one ICMP echo, 5 second timeout)
    if ping -c 1 -W 5 8.8.8.8 >/dev/null 2>&1; then
      echo " ✅ Internet: OK"
      internet_status="OK"
    else
      echo " 🚨 Internet: FAILED"
      internet_status="FAILED"
    fi

    # Check DNS resolution
    if nslookup google.com >/dev/null 2>&1; then
      echo " ✅ DNS: OK"
      dns_status="OK"
    else
      echo " ⚠️ DNS: FAILED"
      dns_status="FAILED"
    fi

    # Loss of connectivity outranks a DNS-only problem.
    if [ "$internet_status" = "FAILED" ]; then
      echo "CRITICAL:NETWORK:No internet connectivity"
      exit 2
    elif [ "$dns_status" = "FAILED" ]; then
      echo "WARNING:NETWORK:DNS resolution issues"
      exit 1
    else
      echo "OK:NETWORK:All connectivity normal"
      exit 0
    fi
  register: network_alert
  changed_when: false
  failed_when: false
|
||
|
||
- name: Evaluate overall alert status
  vars:
    # Return codes of every check that produced one (0 = OK, 1 = WARNING,
    # 2 = CRITICAL); results without an rc are ignored.
    _check_rcs: >-
      {{
        [cpu_alert, memory_alert, disk_alert, load_alert, docker_alert, services_alert, network_alert]
        | selectattr('rc', 'defined')
        | map(attribute='rc')
        | list
      }}
  set_fact:
    alert_summary:
      # Number of checks that ended with rc 2 / rc 1 respectively.
      critical_count: "{{ _check_rcs | select('equalto', 2) | list | length }}"
      warning_count: "{{ _check_rcs | select('equalto', 1) | list | length }}"
      # Worst verdict wins: any critical, else any warning, else OK.
      overall_status: >-
        {{
          'CRITICAL' if 2 in _check_rcs
          else ('WARNING' if 1 in _check_rcs else 'OK')
        }}
|
||
|
||
- name: Generate alert report
  # Writes a timestamped text report into the per-host alert directory and
  # registers the "saved to" line for the final summary.
  # Each check's stdout is passed through the quote filter and printed with
  # printf, so captured output containing quotes, "$", backticks or
  # backslashes can neither break the script nor be executed by the shell.
  shell: |
    alert_file="{{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_{{ ansible_date_time.epoch }}.txt"

    {
      echo "🚨 INFRASTRUCTURE ALERT REPORT"
      echo "==============================="
      echo "Host: {{ inventory_hostname }}"
      echo "Date: {{ ansible_date_time.iso8601 }}"
      echo "Overall Status: {{ alert_summary.overall_status }}"
      echo "Critical Alerts: {{ alert_summary.critical_count }}"
      echo "Warning Alerts: {{ alert_summary.warning_count }}"
      echo ""

      echo "📊 DETAILED RESULTS:"
      echo "==================="
    {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
      echo ""
      echo "{{ check | upper | replace('_ALERT', '') }}:"
      printf '%s\n' {{ hostvars[inventory_hostname][check].stdout | default('No output') | quote }}
    {% endfor %}
    } > "$alert_file"

    echo "Alert report saved to: $alert_file"
  register: alert_report
|
||
|
||
- name: Send NTFY notification for critical alerts
  # Sent only when the overall status is CRITICAL, the mode is not "silent"
  # and an ntfy URL is configured.
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: 'Homelab Critical Alert'
      Priority: 'urgent'
      Tags: "warning,critical,{{ inventory_hostname }}"
    body: |
      🚨 CRITICAL ALERT: {{ inventory_hostname }}

      Status: {{ alert_summary.overall_status }}
      Critical: {{ alert_summary.critical_count }}
      Warnings: {{ alert_summary.warning_count }}

      Time: {{ ansible_date_time.iso8601 }}
  when:
    - alert_summary.overall_status == "CRITICAL"
    - alert_mode | default(default_alert_mode) != "silent"
    - notifications.ntfy_url != ""
  # Best effort: a failed delivery does not stop the play.
  ignore_errors: true
|
||
|
||
- name: Send NTFY notification for warning alerts
  # Sent only when the overall status is WARNING, the mode is not "silent"
  # and an ntfy URL is configured.
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: 'Homelab Warning'
      Priority: 'default'
      Tags: "warning,{{ inventory_hostname }}"
    body: |
      ⚠️ WARNING: {{ inventory_hostname }}

      Status: {{ alert_summary.overall_status }}
      Warnings: {{ alert_summary.warning_count }}

      Time: {{ ansible_date_time.iso8601 }}
  when:
    - alert_summary.overall_status == "WARNING"
    - alert_mode | default(default_alert_mode) != "silent"
    - notifications.ntfy_url != ""
  # Best effort: a failed delivery does not stop the play.
  ignore_errors: true
|
||
|
||
- name: Send test notification
  # Sent whenever the effective mode is "test" and an ntfy URL is configured,
  # regardless of the overall status.
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: 'Homelab Alert Test'
      Priority: 'low'
      Tags: "test,{{ inventory_hostname }}"
    body: |
      🧪 TEST ALERT: {{ inventory_hostname }}

      This is a test notification from the alert monitoring system.

      Status: {{ alert_summary.overall_status }}
      Time: {{ ansible_date_time.iso8601 }}
  when:
    - alert_mode | default(default_alert_mode) == "test"
    - notifications.ntfy_url != ""
  # Best effort: a failed delivery does not stop the play.
  ignore_errors: true
|
||
|
||
- name: Display alert summary
  # Final per-host recap: overall verdict, one verdict per check derived
  # from its return code (2 = CRITICAL, 1 = WARNING, anything else = OK),
  # the report task's output, and suggested next steps.
  debug:
    msg: |

      🚨 ALERT MONITORING COMPLETE
      ============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ alert_mode | default(default_alert_mode) }}

      📊 ALERT SUMMARY:
      Overall Status: {{ alert_summary.overall_status }}
      Critical Alerts: {{ alert_summary.critical_count }}
      Warning Alerts: {{ alert_summary.warning_count }}

      📋 CHECK RESULTS:
      {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
      {{ check | replace('_alert', '') | upper }}: {{ 'CRITICAL' if hostvars[inventory_hostname][check].rc | default(0) == 2 else 'WARNING' if hostvars[inventory_hostname][check].rc | default(0) == 1 else 'OK' }}
      {% endfor %}

      {{ alert_report.stdout }}

      🔍 Next Steps:
      {% if alert_summary.overall_status == "CRITICAL" %}
      - 🚨 IMMEDIATE ACTION REQUIRED
      - Review critical alerts above
      - Check system resources and services
      {% elif alert_summary.overall_status == "WARNING" %}
      - ⚠️ Monitor system closely
      - Consider preventive maintenance
      {% else %}
      - ✅ System is healthy
      - Continue regular monitoring
      {% endif %}
      - Schedule regular checks: crontab -e
      - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt

      ============================
|