Sanitized mirror from private repository - 2026-04-19 08:46:29 UTC
Some checks failed
Documentation / Build Docusaurus (push) Failing after 17m32s
Documentation / Deploy to GitHub Pages (push) Has been skipped

This commit is contained in:
Gitea Mirror Bot
2026-04-19 08:46:29 +00:00
commit 11d496f233
1439 changed files with 363180 additions and 0 deletions

View File

@@ -0,0 +1,418 @@
---
# Alert Check and Notification Playbook
# Monitors system conditions and sends alerts when thresholds are exceeded
# Usage: ansible-playbook playbooks/alert_check.yml
# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"
- name: Infrastructure Alert Monitoring
  hosts: all
  gather_facts: true
  vars:
    # Root directory under which each host gets its own report folder.
    alert_config_dir: "/tmp/alerts"
    # Mode used when alert_mode is not passed in: production, test, silent.
    default_alert_mode: "production"

    # Limits compared by the check tasks; a check fires only when the
    # measured value is strictly greater than the limit.
    thresholds:
      cpu:
        warning: 80
        critical: 95
      memory:
        warning: 85
        critical: 95
      disk:
        warning: 85
        critical: 95
      load:
        warning: 4.0
        critical: 8.0
      # The Docker check turns critical once more than this many
      # containers are stopped.
      container_down_critical: 1

    # Notification targets; each falls back to a default when the
    # same-named top-level variable is not supplied.
    notifications:
      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
      email_enabled: "{{ email_enabled | default(false) }}"
      slack_webhook: "{{ slack_webhook | default('') }}"
  tasks:
# Ensures the per-host report folder exists before any report is written.
- name: Create alert configuration directory
  file:
    state: directory
    path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
    mode: "0755"
# Prints the host, date, effective alert mode and the warning/critical
# threshold pairs before any check runs.
- name: Display alert monitoring plan
  debug:
    msg: |
      🚨 ALERT MONITORING INITIATED
      =============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ alert_mode | default(default_alert_mode) }}
      📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
      💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
      💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
      ⚖️ Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}
# Samples CPU usage and encodes the verdict in the return code:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN (no usable reading).
# The result is registered as cpu_alert; failed_when is false so a
# non-zero rc never fails the play.
- name: Check CPU usage with alerting
  shell: |
    # Primary source: second field of top's "Cpu(s)" summary line.
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | awk -F'%' '{print $1}')
    # Some top builds print "%Cpu(s):100.0 us" with no space at full load,
    # and some locales use a comma decimal; either makes this field
    # non-numeric. Treat such values as missing so vmstat is used instead
    # of silently reporting OK.
    case "$cpu_usage" in
      ''|*[!0-9.]*) cpu_usage="" ;;
    esac
    if [ -z "$cpu_usage" ]; then
      cpu_usage=$(vmstat 1 2 | tail -1 | awk '{print 100-$15}')
    fi
    # Integer part only: the shell test below cannot compare decimals.
    cpu_int=$(echo "$cpu_usage" | cut -d'.' -f1)
    case "$cpu_int" in
      ''|*[!0-9]*)
        echo "UNKNOWN:CPU:unable to determine CPU usage"
        exit 3
        ;;
    esac
    echo "🖥️ CPU Usage: ${cpu_usage}%"
    if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
      echo "CRITICAL:CPU:${cpu_usage}%"
      exit 2
    elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
      echo "WARNING:CPU:${cpu_usage}%"
      exit 1
    else
      echo "OK:CPU:${cpu_usage}%"
      exit 0
    fi
  register: cpu_alert
  failed_when: false
# Computes used/total memory as a whole percentage from the second line
# of `free` and maps it to rc 0 (OK), 1 (WARNING) or 2 (CRITICAL).
# Registered as memory_alert; never fails the play.
- name: Check memory usage with alerting
  shell: |
    mem_pct=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')
    echo "💾 Memory Usage: ${mem_pct}%"
    if [ "$mem_pct" -gt "{{ thresholds.memory.critical }}" ]; then
      level="CRITICAL"
      rc=2
    elif [ "$mem_pct" -gt "{{ thresholds.memory.warning }}" ]; then
      level="WARNING"
      rc=1
    else
      level="OK"
      rc=0
    fi
    echo "${level}:MEMORY:${mem_pct}%"
    exit "$rc"
  register: memory_alert
  failed_when: false
# Walks every mounted filesystem reported by df and maps the worst
# finding to rc 0 (OK), 1 (WARNING) or 2 (CRITICAL).
# Registered as disk_alert; never fails the play.
- name: Check disk usage with alerting
  shell: |
    # The loop below runs in a pipeline subshell, so its findings are
    # handed back through files. Use a private mktemp directory rather
    # than predictable names in /tmp, and always clean it up on exit.
    scratch=$(mktemp -d) || {
      echo "UNKNOWN:DISK:unable to create scratch directory"
      exit 3
    }
    trap 'rm -rf "$scratch"' EXIT
    echo "💿 Disk Usage Check:"
    # -P keeps each filesystem on a single line even for long device names.
    df -P | awk 'NR>1 {print $5 " " $6}' | while read -r usage partition; do
      usage=$(echo "$usage" | tr -d '%')
      # Skip entries whose usage is not a plain integer (for example "-").
      case "$usage" in
        ''|*[!0-9]*) continue ;;
      esac
      echo " $partition: ${usage}%"
      if [ "$usage" -gt "{{ thresholds.disk.critical }}" ]; then
        echo "CRITICAL:DISK:$partition:${usage}%"
        echo "$partition:$usage" >> "$scratch/critical"
      elif [ "$usage" -gt "{{ thresholds.disk.warning }}" ]; then
        echo "WARNING:DISK:$partition:${usage}%"
        echo "$partition:$usage" >> "$scratch/warning"
      fi
    done
    if [ -f "$scratch/critical" ]; then
      echo "Critical disk alerts:"
      cat "$scratch/critical"
      exit 2
    elif [ -f "$scratch/warning" ]; then
      echo "Disk warnings:"
      cat "$scratch/warning"
      exit 1
    else
      echo "OK:DISK:All partitions normal"
      exit 0
    fi
  register: disk_alert
  failed_when: false
# Compares the 1-minute load average with the load thresholds and maps
# the result to rc 0 (OK), 1 (WARNING) or 2 (CRITICAL).
# Registered as load_alert; never fails the play.
- name: Check load average with alerting
  shell: |
    load_avg=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
    echo "⚖️ Load Average (1min): $load_avg"
    # Use bc for floating point comparison if available, otherwise use awk.
    # The redirection is spelled the POSIX way: with "&>" a plain /bin/sh
    # backgrounds the command and the test succeeds even without bc.
    if command -v bc >/dev/null 2>&1; then
      critical_check=$(echo "$load_avg > {{ thresholds.load.critical }}" | bc -l)
      warning_check=$(echo "$load_avg > {{ thresholds.load.warning }}" | bc -l)
    else
      critical_check=$(awk "BEGIN {print ($load_avg > {{ thresholds.load.critical }})}")
      warning_check=$(awk "BEGIN {print ($load_avg > {{ thresholds.load.warning }})}")
    fi
    if [ "$critical_check" = "1" ]; then
      echo "CRITICAL:LOAD:${load_avg}"
      exit 2
    elif [ "$warning_check" = "1" ]; then
      echo "WARNING:LOAD:${load_avg}"
      exit 1
    else
      echo "OK:LOAD:${load_avg}"
      exit 0
    fi
  register: load_alert
  failed_when: false
# Counts total, running, stopped and unhealthy containers when a working
# Docker daemon is reachable. rc 2 (CRITICAL) when any container is
# unhealthy or more than thresholds.container_down_critical are stopped,
# rc 1 (WARNING) when any container is stopped, rc 0 otherwise, including
# when Docker is not available. Registered as docker_alert.
- name: Check Docker container health
  shell: |
    # POSIX redirection on purpose: "&>" is a bashism that a plain /bin/sh
    # parses as "run in background", which made this test always succeed.
    if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
      total_containers=$(docker ps -a -q | wc -l)
      running_containers=$(docker ps -q | wc -l)
      unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
      stopped_containers=$((total_containers - running_containers))
      echo "🐳 Docker Container Status:"
      echo " Total: $total_containers"
      echo " Running: $running_containers"
      echo " Stopped: $stopped_containers"
      echo " Unhealthy: $unhealthy_containers"
      if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
        echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
        exit 2
      elif [ "$stopped_containers" -gt "0" ]; then
        echo "WARNING:DOCKER:$stopped_containers containers stopped"
        exit 1
      else
        echo "OK:DOCKER:All containers healthy"
        exit 0
      fi
    else
      echo " Docker not available - skipping container checks"
      echo "OK:DOCKER:Not installed"
      exit 0
    fi
  register: docker_alert
  failed_when: false
# Verifies that each listed systemd unit is active. rc 2 (CRITICAL) when
# at least one is not running, rc 0 otherwise.
# Registered as services_alert; never fails the play.
- name: Check critical services
  shell: |
    # Plain word list instead of a bash array: the shell module runs
    # /bin/sh, where array syntax is a parse error on non-bash shells.
    failed_services=""
    echo "🔧 Critical Services Check:"
    for service in ssh systemd-resolved; do
      if systemctl is-active --quiet "$service" 2>/dev/null; then
        echo " ✅ $service: running"
      else
        echo " 🚨 $service: not running"
        failed_services="$failed_services $service"
      fi
    done
    if [ -n "$failed_services" ]; then
      echo "CRITICAL:SERVICES:$failed_services"
      exit 2
    else
      echo "OK:SERVICES:All critical services running"
      exit 0
    fi
  register: services_alert
  failed_when: false
# Probes outbound connectivity (one ping to 8.8.8.8) and DNS resolution
# (nslookup google.com). rc 2 (CRITICAL) when the ping fails, rc 1
# (WARNING) when only DNS fails, rc 0 otherwise.
# Registered as network_alert; never fails the play.
- name: Check network connectivity
  shell: |
    echo "🌐 Network Connectivity Check:"
    # Check internet connectivity.
    # POSIX redirection on purpose: with "&>" a plain /bin/sh backgrounds
    # the probe and the test passes regardless of its outcome.
    if ping -c 1 -W 5 8.8.8.8 >/dev/null 2>&1; then
      echo " ✅ Internet: OK"
      internet_status="OK"
    else
      echo " 🚨 Internet: FAILED"
      internet_status="FAILED"
    fi
    # Check DNS resolution
    if nslookup google.com >/dev/null 2>&1; then
      echo " ✅ DNS: OK"
      dns_status="OK"
    else
      echo " ⚠️ DNS: FAILED"
      dns_status="FAILED"
    fi
    if [ "$internet_status" = "FAILED" ]; then
      echo "CRITICAL:NETWORK:No internet connectivity"
      exit 2
    elif [ "$dns_status" = "FAILED" ]; then
      echo "WARNING:NETWORK:DNS resolution issues"
      exit 1
    else
      echo "OK:NETWORK:All connectivity normal"
      exit 0
    fi
  register: network_alert
  failed_when: false
# Folds the seven registered check results into alert_summary:
#   critical_count - number of checks that returned rc 2
#   warning_count  - number of checks that returned rc 1
#   overall_status - CRITICAL if any rc 2, else WARNING if any rc 1, else OK
# Results without an rc attribute are ignored.
- name: Evaluate overall alert status
  vars:
    check_results:
      - "{{ cpu_alert }}"
      - "{{ memory_alert }}"
      - "{{ disk_alert }}"
      - "{{ load_alert }}"
      - "{{ docker_alert }}"
      - "{{ services_alert }}"
      - "{{ network_alert }}"
    critical_hits: >-
      {{ check_results | selectattr('rc', 'defined') | selectattr('rc', 'equalto', 2) | list }}
    warning_hits: >-
      {{ check_results | selectattr('rc', 'defined') | selectattr('rc', 'equalto', 1) | list }}
  set_fact:
    alert_summary:
      critical_count: "{{ critical_hits | length }}"
      warning_count: "{{ warning_hits | length }}"
      overall_status: >-
        {{
        'CRITICAL' if (critical_hits | length > 0)
        else 'WARNING' if (warning_hits | length > 0)
        else 'OK'
        }}
# Writes a timestamped plain-text report (summary plus each check's
# captured stdout) into the per-host alert directory. The final echo is
# the task's own stdout, registered as alert_report.
- name: Generate alert report
  shell: |
    alert_file="{{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_{{ ansible_date_time.epoch }}.txt"
    echo "🚨 INFRASTRUCTURE ALERT REPORT" > "$alert_file"
    echo "===============================" >> "$alert_file"
    echo "Host: {{ inventory_hostname }}" >> "$alert_file"
    echo "Date: {{ ansible_date_time.iso8601 }}" >> "$alert_file"
    echo "Overall Status: {{ alert_summary.overall_status }}" >> "$alert_file"
    echo "Critical Alerts: {{ alert_summary.critical_count }}" >> "$alert_file"
    echo "Warning Alerts: {{ alert_summary.warning_count }}" >> "$alert_file"
    echo "" >> "$alert_file"
    echo "📊 DETAILED RESULTS:" >> "$alert_file"
    echo "===================" >> "$alert_file"
    {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
    echo "" >> "$alert_file"
    echo "{{ check | upper | replace('_ALERT', '') }}:" >> "$alert_file"
    # Captured command output is arbitrary text (mount points, service
    # names). Shell-quote it and print it with printf so quotes, dollar
    # signs, backticks or backslashes in it are written literally instead
    # of being interpreted by the shell.
    printf '%s\n' {{ hostvars[inventory_hostname][check].stdout | default('No output') | quote }} >> "$alert_file"
    {% endfor %}
    echo "Alert report saved to: $alert_file"
  register: alert_report
# Posts an urgent ntfy message when the overall status is CRITICAL, the
# effective alert mode is not "silent", and an ntfy URL is configured.
# Delivery failures are ignored so they cannot abort the play.
- name: Send NTFY notification for critical alerts
  when:
    - alert_summary.overall_status == "CRITICAL"
    - alert_mode | default(default_alert_mode) != "silent"
    - notifications.ntfy_url != ""
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: "Homelab Critical Alert"
      Priority: "urgent"
      Tags: "warning,critical,{{ inventory_hostname }}"
    body: |
      🚨 CRITICAL ALERT: {{ inventory_hostname }}
      Status: {{ alert_summary.overall_status }}
      Critical: {{ alert_summary.critical_count }}
      Warnings: {{ alert_summary.warning_count }}
      Time: {{ ansible_date_time.iso8601 }}
  ignore_errors: true
# Posts a default-priority ntfy message when the overall status is
# WARNING, the effective alert mode is not "silent", and an ntfy URL is
# configured. Delivery failures are ignored.
- name: Send NTFY notification for warning alerts
  when:
    - alert_summary.overall_status == "WARNING"
    - alert_mode | default(default_alert_mode) != "silent"
    - notifications.ntfy_url != ""
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: "Homelab Warning"
      Priority: "default"
      Tags: "warning,{{ inventory_hostname }}"
    body: |
      ⚠️ WARNING: {{ inventory_hostname }}
      Status: {{ alert_summary.overall_status }}
      Warnings: {{ alert_summary.warning_count }}
      Time: {{ ansible_date_time.iso8601 }}
  ignore_errors: true
# Posts a low-priority ntfy message whenever the effective alert mode is
# "test" and an ntfy URL is configured, whatever the overall status.
# Delivery failures are ignored.
- name: Send test notification
  when:
    - alert_mode | default(default_alert_mode) == "test"
    - notifications.ntfy_url != ""
  uri:
    method: POST
    url: "{{ notifications.ntfy_url }}"
    headers:
      Title: "Homelab Alert Test"
      Priority: "low"
      Tags: "test,{{ inventory_hostname }}"
    body: |
      🧪 TEST ALERT: {{ inventory_hostname }}
      This is a test notification from the alert monitoring system.
      Status: {{ alert_summary.overall_status }}
      Time: {{ ansible_date_time.iso8601 }}
  ignore_errors: true
# Final console summary: overall status and counts, a per-check verdict
# derived from each registered rc (2 = CRITICAL, 1 = WARNING, anything
# else or missing = OK), the report task's stdout, and suggested next
# steps for the overall status.
- name: Display alert summary
  debug:
    msg: |
      🚨 ALERT MONITORING COMPLETE
      ============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔔 Mode: {{ alert_mode | default(default_alert_mode) }}
      📊 ALERT SUMMARY:
      Overall Status: {{ alert_summary.overall_status }}
      Critical Alerts: {{ alert_summary.critical_count }}
      Warning Alerts: {{ alert_summary.warning_count }}
      📋 CHECK RESULTS:
      {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
      {{ check | replace('_alert', '') | upper }}: {{ 'CRITICAL' if hostvars[inventory_hostname][check].rc | default(0) == 2 else 'WARNING' if hostvars[inventory_hostname][check].rc | default(0) == 1 else 'OK' }}
      {% endfor %}
      {{ alert_report.stdout }}
      🔍 Next Steps:
      {% if alert_summary.overall_status == "CRITICAL" %}
      - 🚨 IMMEDIATE ACTION REQUIRED
      - Review critical alerts above
      - Check system resources and services
      {% elif alert_summary.overall_status == "WARNING" %}
      - ⚠️ Monitor system closely
      - Consider preventive maintenance
      {% else %}
      - ✅ System is healthy
      - Continue regular monitoring
      {% endif %}
      - Schedule regular checks: crontab -e
      - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt
      ============================