Sanitized mirror from private repository - 2026-04-19 08:46:29 UTC
This commit is contained in:
418
ansible/automation/playbooks/alert_check.yml
Normal file
418
ansible/automation/playbooks/alert_check.yml
Normal file
@@ -0,0 +1,418 @@
|
||||
---
|
||||
# Alert Check and Notification Playbook
|
||||
# Monitors system conditions and sends alerts when thresholds are exceeded
|
||||
# Usage: ansible-playbook playbooks/alert_check.yml
|
||||
# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"
|
||||
|
||||
# Play: run threshold-based health checks on every inventory host and send
# notifications when they are exceeded. Facts are gathered because the tasks
# read ansible_date_time.
- name: Infrastructure Alert Monitoring
  hosts: all
  gather_facts: true
  vars:
    # Per-host report directories are created below this path.
    alert_config_dir: "/tmp/alerts"
    # Used when alert_mode is not supplied with -e; one of: production, test, silent
    default_alert_mode: "production"

    # Alert thresholds. The checks compare with "greater than", so a reading
    # equal to a threshold does not trigger that level.
    thresholds:
      cpu:
        warning: 80
        critical: 95
      memory:
        warning: 85
        critical: 95
      disk:
        warning: 85
        critical: 95
      load:
        warning: 4.0
        critical: 8.0
      # More than this many stopped containers is reported as critical
      container_down_critical: 1

    # Notification settings; each value can be overridden with an extra var
    # of the same name.
    notifications:
      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
      # NOTE(review): email_enabled and slack_webhook are not referenced by
      # any task visible in this playbook - confirm they are still needed.
      email_enabled: "{{ email_enabled | default(false) }}"
      slack_webhook: "{{ slack_webhook | default('') }}"

  tasks:
|
||||
# Ensure the per-host directory that later receives the alert report exists.
- name: Create alert configuration directory
  file:
    state: directory
    path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
    mode: "0755"
|
||||
|
||||
# Print the host, date, effective alert mode and the warning/critical
# thresholds that the following checks will apply.
- name: Display alert monitoring plan
  debug:
    msg: |
      🚨 ALERT MONITORING INITIATED
      =============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ alert_mode | default(default_alert_mode) }}
      📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
      💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
      💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
      ⚖️ Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}
|
||||
|
||||
# Measure overall CPU utilisation (100 - idle) and classify it against
# thresholds.cpu. The result is carried in the return code:
#   0 = OK, 1 = WARNING (also used when no reading is available), 2 = CRITICAL.
# failed_when is disabled because non-zero return codes are alert levels,
# not task failures; changed_when is disabled because nothing is modified.
- name: Check CPU usage with alerting
  shell: |
    # LC_ALL=C keeps the decimal separator a dot so the parsing is stable.
    # The idle column is located by its "id" label instead of by position,
    # because top glues columns together once a value reaches 100.0.
    cpu_usage=$(LC_ALL=C top -bn1 2>/dev/null | awk '
      /Cpu\(s\)/ {
        sub(/^[^:]*:/, "")
        n = split($0, part, ",")
        for (i = 1; i <= n; i++) {
          if (part[i] ~ /id/) {
            gsub(/[^0-9.]/, "", part[i])
            printf "%.1f", 100 - part[i]
            exit
          }
        }
      }')

    # Fallback when top is missing or prints an unrecognised summary line.
    if [ -z "$cpu_usage" ]; then
      cpu_usage=$(vmstat 1 2 2>/dev/null | tail -1 | awk '{print 100-$15}')
    fi

    cpu_int=${cpu_usage%%.*}

    # Without this guard a non-numeric value makes every "[ -gt ]" test
    # error out and the script silently falls through to the OK branch.
    case "$cpu_int" in
      ''|*[!0-9]*)
        echo "WARNING:CPU:unable to determine usage"
        exit 1
        ;;
    esac

    echo "🖥️ CPU Usage: ${cpu_usage}%"

    if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
      echo "CRITICAL:CPU:${cpu_usage}%"
      exit 2
    elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
      echo "WARNING:CPU:${cpu_usage}%"
      exit 1
    else
      echo "OK:CPU:${cpu_usage}%"
      exit 0
    fi
  register: cpu_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# Compute used/total memory as a whole percentage from the second line of
# `free` and classify it against thresholds.memory.
# Return code: 0 = OK, 1 = WARNING (also when no reading is available),
# 2 = CRITICAL. Non-zero codes are alert levels, hence failed_when: false.
- name: Check memory usage with alerting
  shell: |
    # "$2 > 0" avoids a division by zero if the total column is unusable.
    memory_usage=$(LC_ALL=C free 2>/dev/null | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}')

    # An empty or non-numeric value would make the integer tests below error
    # out and fall through to OK, so report it explicitly instead.
    case "$memory_usage" in
      ''|*[!0-9]*)
        echo "WARNING:MEMORY:unable to determine usage"
        exit 1
        ;;
    esac

    echo "💾 Memory Usage: ${memory_usage}%"

    if [ "$memory_usage" -gt "{{ thresholds.memory.critical }}" ]; then
      echo "CRITICAL:MEMORY:${memory_usage}%"
      exit 2
    elif [ "$memory_usage" -gt "{{ thresholds.memory.warning }}" ]; then
      echo "WARNING:MEMORY:${memory_usage}%"
      exit 1
    else
      echo "OK:MEMORY:${memory_usage}%"
      exit 0
    fi
  register: memory_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# List the usage of every mounted filesystem and classify the worst one
# against thresholds.disk.
# Return code: 0 = OK, 1 = at least one WARNING, 2 = at least one CRITICAL.
# The whole scan happens in one awk pass whose output is captured in a shell
# variable, so no scratch files with predictable names are created in /tmp.
- name: Check disk usage with alerting
  shell: |
    echo "💿 Disk Usage Check:"

    # df -P guarantees one line per filesystem; entries whose Use% is not a
    # number (for example "-") are skipped instead of breaking the comparison.
    report=$(LC_ALL=C df -P 2>/dev/null | awk \
      -v crit="{{ thresholds.disk.critical }}" \
      -v warn="{{ thresholds.disk.warning }}" '
      NR > 1 {
        usage = $5
        sub(/%$/, "", usage)
        if (usage !~ /^[0-9]+$/) next
        # Re-join mount points that contain spaces.
        mount = $6
        for (i = 7; i <= NF; i++) mount = mount " " $i
        printf " %s: %s%%\n", mount, usage
        if (usage + 0 > crit + 0) printf "CRITICAL:DISK:%s:%s%%\n", mount, usage
        else if (usage + 0 > warn + 0) printf "WARNING:DISK:%s:%s%%\n", mount, usage
      }')

    [ -n "$report" ] && printf '%s\n' "$report"

    # Reduce the tagged lines to "mountpoint:usage" for the summary section.
    critical_disks=$(printf '%s\n' "$report" | sed -n 's/^CRITICAL:DISK:\(.*\)%$/\1/p')
    warning_disks=$(printf '%s\n' "$report" | sed -n 's/^WARNING:DISK:\(.*\)%$/\1/p')

    if [ -n "$critical_disks" ]; then
      echo "Critical disk alerts:"
      printf '%s\n' "$critical_disks"
      exit 2
    elif [ -n "$warning_disks" ]; then
      echo "Disk warnings:"
      printf '%s\n' "$warning_disks"
      exit 1
    else
      echo "OK:DISK:All partitions normal"
      exit 0
    fi
  register: disk_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# Compare the 1-minute load average with thresholds.load.
# Return code: 0 = OK, 1 = WARNING (also when no reading is available),
# 2 = CRITICAL. Non-zero codes are alert levels, hence failed_when: false.
# The script is POSIX sh only: the shell module runs /bin/sh, where the bash
# form "cmd &> /dev/null" backgrounds the command instead of silencing it.
- name: Check load average with alerting
  shell: |
    # /proc/loadavg is locale independent; uptime is only a fallback.
    load_avg=$(cut -d' ' -f1 /proc/loadavg 2>/dev/null)
    if [ -z "$load_avg" ]; then
      load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
    fi

    case "$load_avg" in
      ''|*[!0-9.]*)
        echo "WARNING:LOAD:unable to determine load average"
        exit 1
        ;;
    esac

    echo "⚖️ Load Average (1min): $load_avg"

    # awk performs the floating point comparison, so bc is not required.
    # Values are passed with -v rather than spliced into the program text.
    level=$(awk \
      -v load="$load_avg" \
      -v crit="{{ thresholds.load.critical }}" \
      -v warn="{{ thresholds.load.warning }}" '
      BEGIN {
        if (load + 0 > crit + 0) print "critical"
        else if (load + 0 > warn + 0) print "warning"
        else print "ok"
      }')

    if [ "$level" = "critical" ]; then
      echo "CRITICAL:LOAD:${load_avg}"
      exit 2
    elif [ "$level" = "warning" ]; then
      echo "WARNING:LOAD:${load_avg}"
      exit 1
    else
      echo "OK:LOAD:${load_avg}"
      exit 0
    fi
  register: load_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# Count total, running, stopped and unhealthy containers when a working
# Docker CLI and daemon are present; otherwise report OK with a note.
# Return code: 0 = OK, 1 = WARNING (some containers stopped),
# 2 = CRITICAL (any unhealthy container, or more stopped containers than
# thresholds.container_down_critical).
# Redirections use ">/dev/null 2>&1": under /bin/sh the bash-only "&>" would
# background the probe and make the availability test always succeed.
- name: Check Docker container health
  shell: |
    if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
      total_containers=$(docker ps -a -q | wc -l)
      running_containers=$(docker ps -q | wc -l)
      unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
      stopped_containers=$((total_containers - running_containers))

      echo "🐳 Docker Container Status:"
      echo " Total: $total_containers"
      echo " Running: $running_containers"
      echo " Stopped: $stopped_containers"
      echo " Unhealthy: $unhealthy_containers"

      if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
        echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
        exit 2
      elif [ "$stopped_containers" -gt "0" ]; then
        echo "WARNING:DOCKER:$stopped_containers containers stopped"
        exit 1
      else
        echo "OK:DOCKER:All containers healthy"
        exit 0
      fi
    else
      echo "ℹ️ Docker not available - skipping container checks"
      echo "OK:DOCKER:Not installed"
      exit 0
    fi
  register: docker_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# Verify with systemctl that each required unit is active.
# Return code: 0 = all running, 2 = CRITICAL (at least one not running).
# The list is iterated with a plain POSIX "for ... in" loop: bash array
# syntax is a syntax error under /bin/sh (dash) and that error's exit status
# of 2 would be counted as a CRITICAL alert.
# NOTE(review): unit names differ between distributions (for example the SSH
# unit) - confirm this list matches the hosts in the inventory.
- name: Check critical services
  shell: |
    failed_services=""

    echo "🔧 Critical Services Check:"

    for service in ssh systemd-resolved; do
      if systemctl is-active --quiet "$service" 2>/dev/null; then
        echo " ✅ $service: running"
      else
        echo " 🚨 $service: not running"
        failed_services="$failed_services $service"
      fi
    done

    if [ -n "$failed_services" ]; then
      echo "CRITICAL:SERVICES:$failed_services"
      exit 2
    else
      echo "OK:SERVICES:All critical services running"
      exit 0
    fi
  register: services_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# Probe outbound connectivity (one ICMP echo to 8.8.8.8) and DNS resolution
# (nslookup google.com).
# Return code: 0 = OK, 1 = WARNING (DNS failed), 2 = CRITICAL (ping failed).
# Output is discarded with ">/dev/null 2>&1": under /bin/sh the bash-only
# "&>" would run the probe in the background, so the test would always pass.
- name: Check network connectivity
  shell: |
    echo "🌐 Network Connectivity Check:"

    # Check internet connectivity
    if ping -c 1 -W 5 8.8.8.8 >/dev/null 2>&1; then
      echo " ✅ Internet: OK"
      internet_status="OK"
    else
      echo " 🚨 Internet: FAILED"
      internet_status="FAILED"
    fi

    # Check DNS resolution
    if nslookup google.com >/dev/null 2>&1; then
      echo " ✅ DNS: OK"
      dns_status="OK"
    else
      echo " ⚠️ DNS: FAILED"
      dns_status="FAILED"
    fi

    if [ "$internet_status" = "FAILED" ]; then
      echo "CRITICAL:NETWORK:No internet connectivity"
      exit 2
    elif [ "$dns_status" = "FAILED" ]; then
      echo "WARNING:NETWORK:DNS resolution issues"
      exit 1
    else
      echo "OK:NETWORK:All connectivity normal"
      exit 0
    fi
  register: network_alert
  changed_when: false
  failed_when: false
|
||||
|
||||
# Aggregate the registered check results into alert_summary:
#   critical_count - number of checks whose rc is 2
#   warning_count  - number of checks whose rc is 1
#   overall_status - CRITICAL if any rc is 2, else WARNING if any rc is 1,
#                    else OK
# Results without an rc attribute are ignored.
- name: Evaluate overall alert status
  vars:
    check_results: >-
      {{
        [cpu_alert, memory_alert, disk_alert, load_alert,
         docker_alert, services_alert, network_alert]
        | selectattr('rc', 'defined')
        | list
      }}
    critical_results: "{{ check_results | selectattr('rc', 'equalto', 2) | list }}"
    warning_results: "{{ check_results | selectattr('rc', 'equalto', 1) | list }}"
  set_fact:
    alert_summary:
      critical_count: "{{ critical_results | length }}"
      warning_count: "{{ warning_results | length }}"
      overall_status: >-
        {{
          'CRITICAL' if (critical_results | length > 0)
          else 'WARNING' if (warning_results | length > 0)
          else 'OK'
        }}
|
||||
|
||||
# Write a plain-text report (summary header plus the stdout of every check)
# to <alert_config_dir>/<host>/alert_report_<epoch>.txt and print its path,
# which the final summary task shows via alert_report.stdout.
# Check output is passed through the `quote` filter and written with printf,
# so quotes, dollar signs or backticks inside it (for example from a mount
# point name) cannot break or inject into the generated script.
- name: Generate alert report
  shell: |
    alert_file="{{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_{{ ansible_date_time.epoch }}.txt"

    echo "🚨 INFRASTRUCTURE ALERT REPORT" > "$alert_file"
    echo "===============================" >> "$alert_file"
    echo "Host: {{ inventory_hostname }}" >> "$alert_file"
    echo "Date: {{ ansible_date_time.iso8601 }}" >> "$alert_file"
    echo "Overall Status: {{ alert_summary.overall_status }}" >> "$alert_file"
    echo "Critical Alerts: {{ alert_summary.critical_count }}" >> "$alert_file"
    echo "Warning Alerts: {{ alert_summary.warning_count }}" >> "$alert_file"
    echo "" >> "$alert_file"

    echo "📊 DETAILED RESULTS:" >> "$alert_file"
    echo "===================" >> "$alert_file"
    {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
    echo "" >> "$alert_file"
    echo "{{ check | upper | replace('_ALERT', '') }}:" >> "$alert_file"
    printf '%s\n' {{ hostvars[inventory_hostname][check].stdout | default('No output') | quote }} >> "$alert_file"
    {% endfor %}

    echo "Alert report saved to: $alert_file"
  register: alert_report
|
||||
|
||||
# POST an urgent-priority message to the ntfy topic when the overall status
# is CRITICAL. Skipped in silent mode or when no URL is configured; a failed
# delivery is ignored so it cannot abort the play.
- name: Send NTFY notification for critical alerts
  uri:
    url: "{{ notifications.ntfy_url }}"
    method: POST
    headers:
      Title: "Homelab Critical Alert"
      Priority: "urgent"
      Tags: "warning,critical,{{ inventory_hostname }}"
    body: |
      🚨 CRITICAL ALERT: {{ inventory_hostname }}

      Status: {{ alert_summary.overall_status }}
      Critical: {{ alert_summary.critical_count }}
      Warnings: {{ alert_summary.warning_count }}

      Time: {{ ansible_date_time.iso8601 }}
  when:
    - alert_summary.overall_status == "CRITICAL"
    - alert_mode | default(default_alert_mode) != "silent"
    - notifications.ntfy_url != ""
  ignore_errors: true
|
||||
|
||||
# POST a default-priority message to the ntfy topic when the overall status
# is WARNING. Skipped in silent mode or when no URL is configured; a failed
# delivery is ignored so it cannot abort the play.
- name: Send NTFY notification for warning alerts
  uri:
    url: "{{ notifications.ntfy_url }}"
    method: POST
    headers:
      Title: "Homelab Warning"
      Priority: "default"
      Tags: "warning,{{ inventory_hostname }}"
    body: |
      ⚠️ WARNING: {{ inventory_hostname }}

      Status: {{ alert_summary.overall_status }}
      Warnings: {{ alert_summary.warning_count }}

      Time: {{ ansible_date_time.iso8601 }}
  when:
    - alert_summary.overall_status == "WARNING"
    - alert_mode | default(default_alert_mode) != "silent"
    - notifications.ntfy_url != ""
  ignore_errors: true
|
||||
|
||||
# POST a low-priority test message to the ntfy topic when the effective
# alert mode is "test" and a URL is configured, regardless of the overall
# status. A failed delivery is ignored so it cannot abort the play.
- name: Send test notification
  uri:
    url: "{{ notifications.ntfy_url }}"
    method: POST
    headers:
      Title: "Homelab Alert Test"
      Priority: "low"
      Tags: "test,{{ inventory_hostname }}"
    body: |
      🧪 TEST ALERT: {{ inventory_hostname }}

      This is a test notification from the alert monitoring system.

      Status: {{ alert_summary.overall_status }}
      Time: {{ ansible_date_time.iso8601 }}
  when:
    - alert_mode | default(default_alert_mode) == "test"
    - notifications.ntfy_url != ""
  ignore_errors: true
|
||||
|
||||
# Final console summary: effective mode, aggregated counts, one status line
# per check (derived from each registered rc: 2 = CRITICAL, 1 = WARNING,
# anything else or missing = OK), the report path printed by the report
# task, and next steps that depend on the overall status.
- name: Display alert summary
  debug:
    msg: |

      🚨 ALERT MONITORING COMPLETE
      ============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ alert_mode | default(default_alert_mode) }}

      📊 ALERT SUMMARY:
      Overall Status: {{ alert_summary.overall_status }}
      Critical Alerts: {{ alert_summary.critical_count }}
      Warning Alerts: {{ alert_summary.warning_count }}

      📋 CHECK RESULTS:
      {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
      {{ check | replace('_alert', '') | upper }}: {{ 'CRITICAL' if hostvars[inventory_hostname][check].rc | default(0) == 2 else 'WARNING' if hostvars[inventory_hostname][check].rc | default(0) == 1 else 'OK' }}
      {% endfor %}

      {{ alert_report.stdout }}

      🔍 Next Steps:
      {% if alert_summary.overall_status == "CRITICAL" %}
      - 🚨 IMMEDIATE ACTION REQUIRED
      - Review critical alerts above
      - Check system resources and services
      {% elif alert_summary.overall_status == "WARNING" %}
      - ⚠️ Monitor system closely
      - Consider preventive maintenance
      {% else %}
      - ✅ System is healthy
      - Continue regular monitoring
      {% endif %}
      - Schedule regular checks: crontab -e
      - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt

      ============================
|
||||
Reference in New Issue
Block a user