Sanitized mirror from private repository - 2026-04-19 08:46:29 UTC

2026-04-19 08:46:29 +00:00
commit 11d496f233
1439 changed files with 363180 additions and 0 deletions
--- a/ansible/automation/playbooks/alert_check.yml
+++ b/ansible/automation/playbooks/alert_check.yml
@@ -0,0 +1,418 @@
+---
+# Alert Check and Notification Playbook
+# Monitors system conditions and sends alerts when thresholds are exceeded
+# Usage: ansible-playbook playbooks/alert_check.yml
+# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"
+
+- name: Infrastructure Alert Monitoring
+  hosts: all
+  gather_facts: true
+  vars:
+    # Root directory under which each host gets its own report folder.
+    alert_config_dir: '/tmp/alerts'
+    # Mode used when the caller does not pass alert_mode.
+    default_alert_mode: 'production'  # production, test, silent
+
+    # Alert thresholds. The check tasks escalate when a reading is strictly
+    # greater than the value configured here.
+    thresholds:
+      cpu: {warning: 80, critical: 95}
+      memory: {warning: 85, critical: 95}
+      disk: {warning: 85, critical: 95}
+      load: {warning: 4.0, critical: 8.0}
+      # Stopped-container count above which the Docker check turns critical.
+      container_down_critical: 1
+
+    # Notification settings. Each entry can be overridden by a same-named
+    # top-level variable (for example via -e); otherwise the default applies.
+    # email_enabled and slack_webhook are not read by the tasks in this play.
+    notifications:
+      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
+      email_enabled: "{{ email_enabled | default(false) }}"
+      slack_webhook: "{{ slack_webhook | default('') }}"
+
+  tasks:
+    # Makes sure <alert_config_dir>/<inventory_hostname> exists on the managed
+    # host before anything is written into it.
+    - name: Create alert configuration directory
+      file:
+        state: directory
+        path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
+        mode: "0755"
+
+    # Announces the run for this host: the gathered date, the effective alert
+    # mode (alert_mode, falling back to default_alert_mode) and the configured
+    # warning/critical thresholds for CPU, memory, disk and load.
+    - name: Display alert monitoring plan
+      debug:
+        msg: |
+          🚨 ALERT MONITORING INITIATED
+          =============================
+          🖥️  Host: {{ inventory_hostname }}
+          📅 Date: {{ ansible_date_time.date }}
+          🔔 Mode: {{ alert_mode | default(default_alert_mode) }}
+          📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
+          💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
+          💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
+          ⚖️  Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}
+
+    # Samples overall CPU utilisation and reports it through the exit code that
+    # the summary task reads from cpu_alert.rc: 0 = OK, 1 = WARNING,
+    # 2 = CRITICAL. Exit code 3 marks a reading that could not be parsed; it is
+    # counted as neither a warning nor a critical alert.
+    - name: Check CPU usage with alerting
+      shell: |
+        # Utilisation is 100 minus the idle share, so system, iowait and steal
+        # time count as load (the vmstat fallback uses the same formula).
+        # LC_ALL=C keeps the decimal separator a dot.
+        cpu_usage=$(
+          LC_ALL=C top -bn1 2>/dev/null | grep "Cpu(s)" | head -n 1 |
+            awk -F',' '{ for (i = 1; i <= NF; i++) if ($i ~ /id/) { gsub(/[^0-9.]/, "", $i); printf "%.1f", 100 - $i; exit } }'
+        )
+        if [ -z "$cpu_usage" ]; then
+          cpu_usage=$(vmstat 1 2 | tail -1 | awk '{print 100-$15}')
+        fi
+
+        cpu_int=$(echo "$cpu_usage" | cut -d'.' -f1)
+
+        # Refuse to compare anything that is not a whole number; otherwise the
+        # failing test would fall through to the OK branch.
+        case "$cpu_int" in
+          ''|*[!0-9]*)
+            echo "UNKNOWN:CPU:unable to determine usage"
+            exit 3
+            ;;
+        esac
+
+        echo "🖥️  CPU Usage: ${cpu_usage}%"
+
+        if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
+          echo "CRITICAL:CPU:${cpu_usage}%"
+          exit 2
+        elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
+          echo "WARNING:CPU:${cpu_usage}%"
+          exit 1
+        else
+          echo "OK:CPU:${cpu_usage}%"
+          exit 0
+        fi
+      register: cpu_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Computes used/total memory as a whole percentage and reports it through
+    # the exit code stored in memory_alert.rc: 0 = OK, 1 = WARNING,
+    # 2 = CRITICAL. Exit code 3 marks a reading that could not be obtained.
+    - name: Check memory usage with alerting
+      shell: |
+        # Second line of `free`: column 2 is the total, column 3 the used
+        # amount. A zero total is skipped to avoid a division by zero.
+        memory_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3*100/$2}')
+
+        # Without a numeric reading the comparisons below would error out and
+        # fall through to the OK branch.
+        case "$memory_usage" in
+          ''|*[!0-9]*)
+            echo "UNKNOWN:MEMORY:unable to determine usage"
+            exit 3
+            ;;
+        esac
+
+        echo "💾 Memory Usage: ${memory_usage}%"
+
+        if [ "$memory_usage" -gt "{{ thresholds.memory.critical }}" ]; then
+          echo "CRITICAL:MEMORY:${memory_usage}%"
+          exit 2
+        elif [ "$memory_usage" -gt "{{ thresholds.memory.warning }}" ]; then
+          echo "WARNING:MEMORY:${memory_usage}%"
+          exit 1
+        else
+          echo "OK:MEMORY:${memory_usage}%"
+          exit 0
+        fi
+      register: memory_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Walks every filesystem listed by df and compares its used percentage with
+    # the disk thresholds. The exit code stored in disk_alert.rc is 2 when any
+    # filesystem is above the critical threshold, 1 when any is above the
+    # warning threshold, 0 otherwise, and 3 when no scratch space is available.
+    - name: Check disk usage with alerting
+      shell: |
+        # The loop below runs in a pipeline subshell, so findings are passed
+        # back through files. A private mktemp directory replaces predictable
+        # /tmp names, and the trap removes it on every exit path.
+        scratch=$(mktemp -d) || { echo "UNKNOWN:DISK:cannot create scratch directory"; exit 3; }
+        trap 'rm -rf "$scratch"' EXIT
+
+        echo "💿 Disk Usage Check:"
+        # -P keeps each filesystem on one line so the column positions hold.
+        df -hP | awk 'NR>1 {print $5 " " $6}' | while read -r usage partition; do
+          usage=$(printf '%s' "$usage" | tr -d '%')
+
+          # Skip rows whose capacity is not a plain integer.
+          case "$usage" in
+            ''|*[!0-9]*) continue ;;
+          esac
+
+          echo "  $partition: ${usage}%"
+
+          if [ "$usage" -gt "{{ thresholds.disk.critical }}" ]; then
+            echo "CRITICAL:DISK:$partition:${usage}%"
+            echo "$partition:$usage" >> "$scratch/critical"
+          elif [ "$usage" -gt "{{ thresholds.disk.warning }}" ]; then
+            echo "WARNING:DISK:$partition:${usage}%"
+            echo "$partition:$usage" >> "$scratch/warning"
+          fi
+        done
+
+        if [ -s "$scratch/critical" ]; then
+          echo "Critical disk alerts:"
+          cat "$scratch/critical"
+          exit 2
+        elif [ -s "$scratch/warning" ]; then
+          echo "Disk warnings:"
+          cat "$scratch/warning"
+          exit 1
+        else
+          echo "OK:DISK:All partitions normal"
+          exit 0
+        fi
+      register: disk_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Compares the 1-minute load average with the load thresholds and reports
+    # through the exit code stored in load_alert.rc: 0 = OK, 1 = WARNING,
+    # 2 = CRITICAL. Exit code 3 marks a reading that could not be parsed.
+    - name: Check load average with alerting
+      shell: |
+        # LC_ALL=C keeps the decimal separator a dot so stripping the trailing
+        # comma leaves a plain number.
+        load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')
+
+        if [ -z "$load_avg" ]; then
+          echo "UNKNOWN:LOAD:unable to determine load average"
+          exit 3
+        fi
+
+        echo "⚖️  Load Average (1min): $load_avg"
+
+        # Floating-point "value > limit" done by awk alone. This works under
+        # plain /bin/sh (no bash-only redirections) and needs no bc.
+        exceeds() {
+          awk -v value="$1" -v limit="$2" 'BEGIN { exit !(value + 0 > limit + 0) }'
+        }
+
+        if exceeds "$load_avg" "{{ thresholds.load.critical }}"; then
+          echo "CRITICAL:LOAD:${load_avg}"
+          exit 2
+        elif exceeds "$load_avg" "{{ thresholds.load.warning }}"; then
+          echo "WARNING:LOAD:${load_avg}"
+          exit 1
+        else
+          echo "OK:LOAD:${load_avg}"
+          exit 0
+        fi
+      register: load_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Counts total, running, stopped and unhealthy containers when a working
+    # Docker daemon is reachable. The exit code stored in docker_alert.rc is
+    # 2 = CRITICAL (any unhealthy container, or more stopped containers than
+    # thresholds.container_down_critical), 1 = WARNING (at least one stopped
+    # container), 0 = OK or Docker not available.
+    - name: Check Docker container health
+      shell: |
+        # POSIX redirections are required: under /bin/sh, "&>" would background
+        # the command and make this test always succeed.
+        if command -v docker > /dev/null 2>&1 && docker info > /dev/null 2>&1; then
+          total_containers=$(docker ps -a -q | wc -l)
+          running_containers=$(docker ps -q | wc -l)
+          unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
+          stopped_containers=$((total_containers - running_containers))
+
+          echo "🐳 Docker Container Status:"
+          echo "  Total: $total_containers"
+          echo "  Running: $running_containers"
+          echo "  Stopped: $stopped_containers"
+          echo "  Unhealthy: $unhealthy_containers"
+
+          if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
+            echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
+            exit 2
+          elif [ "$stopped_containers" -gt "0" ]; then
+            echo "WARNING:DOCKER:$stopped_containers containers stopped"
+            exit 1
+          else
+            echo "OK:DOCKER:All containers healthy"
+            exit 0
+          fi
+        else
+          echo "ℹ️  Docker not available - skipping container checks"
+          echo "OK:DOCKER:Not installed"
+          exit 0
+        fi
+      register: docker_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Verifies that every critical systemd unit is active. The unit list comes
+    # from the optional critical_services variable and defaults to ssh and
+    # systemd-resolved. The exit code stored in services_alert.rc is
+    # 2 = CRITICAL when any unit is not active, 0 = OK otherwise.
+    - name: Check critical services
+      shell: |
+        failed_services=""
+
+        echo "🔧 Critical Services Check:"
+
+        # A plain word list is used instead of a bash array: arrays are a
+        # syntax error under /bin/sh, whose exit status 2 would be read as a
+        # CRITICAL alert.
+        for service in {{ critical_services | default(['ssh', 'systemd-resolved']) | map('quote') | join(' ') }}; do
+          if systemctl is-active --quiet "$service" 2>/dev/null; then
+            echo "  ✅ $service: running"
+          else
+            echo "  🚨 $service: not running"
+            failed_services="$failed_services $service"
+          fi
+        done
+
+        if [ -n "$failed_services" ]; then
+          echo "CRITICAL:SERVICES:$failed_services"
+          exit 2
+        else
+          echo "OK:SERVICES:All critical services running"
+          exit 0
+        fi
+      register: services_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Probes outbound connectivity (one ICMP echo to 8.8.8.8) and name
+    # resolution (google.com). The exit code stored in network_alert.rc is
+    # 2 = CRITICAL when the ping fails, 1 = WARNING when only DNS fails,
+    # 0 = OK otherwise.
+    - name: Check network connectivity
+      shell: |
+        echo "🌐 Network Connectivity Check:"
+
+        # Check internet connectivity. POSIX redirections are required: under
+        # /bin/sh, "&>" would background ping and make the test always succeed.
+        if ping -c 1 -W 5 8.8.8.8 > /dev/null 2>&1; then
+          echo "  ✅ Internet: OK"
+          internet_status="OK"
+        else
+          echo "  🚨 Internet: FAILED"
+          internet_status="FAILED"
+        fi
+
+        # Check DNS resolution. Prefer nslookup; fall back to getent when it is
+        # not installed so a missing tool is not reported as a DNS outage.
+        if command -v nslookup > /dev/null 2>&1; then
+          resolver="nslookup google.com"
+        else
+          resolver="getent hosts google.com"
+        fi
+
+        if $resolver > /dev/null 2>&1; then
+          echo "  ✅ DNS: OK"
+          dns_status="OK"
+        else
+          echo "  ⚠️  DNS: FAILED"
+          dns_status="FAILED"
+        fi
+
+        if [ "$internet_status" = "FAILED" ]; then
+          echo "CRITICAL:NETWORK:No internet connectivity"
+          exit 2
+        elif [ "$dns_status" = "FAILED" ]; then
+          echo "WARNING:NETWORK:DNS resolution issues"
+          exit 1
+        else
+          echo "OK:NETWORK:All connectivity normal"
+          exit 0
+        fi
+      register: network_alert
+      # Non-zero exit codes are alert levels, not task failures.
+      failed_when: false
+      # Read-only probe: never report the host as changed.
+      changed_when: false
+
+    # Rolls the seven registered check results up into the alert_summary fact:
+    # how many checks exited with 2 (critical), how many with 1 (warning), and
+    # the worst level seen (CRITICAL, then WARNING, then OK).
+    - name: Evaluate overall alert status
+      vars:
+        # The registered results, listed once and shared by both counters.
+        _alert_checks:
+          - "{{ cpu_alert }}"
+          - "{{ memory_alert }}"
+          - "{{ disk_alert }}"
+          - "{{ load_alert }}"
+          - "{{ docker_alert }}"
+          - "{{ services_alert }}"
+          - "{{ network_alert }}"
+        _critical_total: >-
+          {{
+            _alert_checks
+            | selectattr('rc', 'defined')
+            | selectattr('rc', 'equalto', 2)
+            | list
+            | length
+          }}
+        _warning_total: >-
+          {{
+            _alert_checks
+            | selectattr('rc', 'defined')
+            | selectattr('rc', 'equalto', 1)
+            | list
+            | length
+          }}
+      set_fact:
+        alert_summary:
+          critical_count: "{{ _critical_total }}"
+          warning_count: "{{ _warning_total }}"
+          overall_status: >-
+            {{
+              'CRITICAL' if (_critical_total | int) > 0
+              else 'WARNING' if (_warning_total | int) > 0
+              else 'OK'
+            }}
+
+    # Writes a plain-text report for this host to
+    # <alert_config_dir>/<inventory_hostname>/alert_report_<epoch>.txt with the
+    # summary counters followed by the captured stdout of each check. The path
+    # of the report is echoed so later tasks can show it via
+    # alert_report.stdout.
+    - name: Generate alert report
+      shell: |
+        alert_file="{{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_{{ ansible_date_time.epoch }}.txt"
+
+        {
+          echo "🚨 INFRASTRUCTURE ALERT REPORT"
+          echo "==============================="
+          echo "Host: {{ inventory_hostname }}"
+          echo "Date: {{ ansible_date_time.iso8601 }}"
+          echo "Overall Status: {{ alert_summary.overall_status }}"
+          echo "Critical Alerts: {{ alert_summary.critical_count }}"
+          echo "Warning Alerts: {{ alert_summary.warning_count }}"
+          echo ""
+
+          echo "📊 DETAILED RESULTS:"
+          echo "==================="
+          {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
+          echo ""
+          echo "{{ check | upper | replace('_ALERT', '') }}:"
+          # Captured command output is shell-quoted so quotes, "$" or backticks
+          # in it are written literally instead of being interpreted.
+          printf '%s\n' {{ hostvars[inventory_hostname][check].stdout | default('No output') | quote }}
+          {% endfor %}
+        } > "$alert_file"
+
+        echo "Alert report saved to: $alert_file"
+      register: alert_report
+
+    # Posts an urgent-priority message to the configured ntfy URL when the
+    # overall status is CRITICAL, unless the run is in silent mode or no URL is
+    # configured. A failed delivery does not stop the play.
+    - name: Send NTFY notification for critical alerts
+      when:
+        - alert_summary.overall_status == "CRITICAL"
+        - alert_mode | default(default_alert_mode) != "silent"
+        - notifications.ntfy_url != ""
+      ignore_errors: true
+      uri:
+        method: POST
+        url: "{{ notifications.ntfy_url }}"
+        headers:
+          Title: "Homelab Critical Alert"
+          Priority: "urgent"
+          Tags: "warning,critical,{{ inventory_hostname }}"
+        body: |
+          🚨 CRITICAL ALERT: {{ inventory_hostname }}
+
+          Status: {{ alert_summary.overall_status }}
+          Critical: {{ alert_summary.critical_count }}
+          Warnings: {{ alert_summary.warning_count }}
+
+          Time: {{ ansible_date_time.iso8601 }}
+
+    # Posts a default-priority message to the configured ntfy URL when the
+    # overall status is WARNING, unless the run is in silent mode or no URL is
+    # configured. A failed delivery does not stop the play.
+    - name: Send NTFY notification for warning alerts
+      when:
+        - alert_summary.overall_status == "WARNING"
+        - alert_mode | default(default_alert_mode) != "silent"
+        - notifications.ntfy_url != ""
+      ignore_errors: true
+      uri:
+        method: POST
+        url: "{{ notifications.ntfy_url }}"
+        headers:
+          Title: "Homelab Warning"
+          Priority: "default"
+          Tags: "warning,{{ inventory_hostname }}"
+        body: |
+          ⚠️ WARNING: {{ inventory_hostname }}
+
+          Status: {{ alert_summary.overall_status }}
+          Warnings: {{ alert_summary.warning_count }}
+
+          Time: {{ ansible_date_time.iso8601 }}
+
+    # Posts a low-priority test message to the configured ntfy URL whenever the
+    # effective alert mode is "test", regardless of the overall status. A
+    # failed delivery does not stop the play.
+    - name: Send test notification
+      when:
+        - alert_mode | default(default_alert_mode) == "test"
+        - notifications.ntfy_url != ""
+      ignore_errors: true
+      uri:
+        method: POST
+        url: "{{ notifications.ntfy_url }}"
+        headers:
+          Title: "Homelab Alert Test"
+          Priority: "low"
+          Tags: "test,{{ inventory_hostname }}"
+        body: |
+          🧪 TEST ALERT: {{ inventory_hostname }}
+
+          This is a test notification from the alert monitoring system.
+
+          Status: {{ alert_summary.overall_status }}
+          Time: {{ ansible_date_time.iso8601 }}
+
+    # Prints the closing summary for this host: effective alert mode, the
+    # alert_summary counters, one status line per check derived from its
+    # registered rc (2 -> CRITICAL, 1 -> WARNING, anything else or undefined
+    # -> OK), the stdout of the report task, and next-step hints chosen by the
+    # overall status.
+    - name: Display alert summary
+      debug:
+        msg: |
+
+          🚨 ALERT MONITORING COMPLETE
+          ============================
+          🖥️  Host: {{ inventory_hostname }}
+          📅 Date: {{ ansible_date_time.date }}
+          🔔 Mode: {{ alert_mode | default(default_alert_mode) }}
+
+          📊 ALERT SUMMARY:
+          Overall Status: {{ alert_summary.overall_status }}
+          Critical Alerts: {{ alert_summary.critical_count }}
+          Warning Alerts: {{ alert_summary.warning_count }}
+
+          📋 CHECK RESULTS:
+          {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
+          {{ check | replace('_alert', '') | upper }}: {{ 'CRITICAL' if hostvars[inventory_hostname][check].rc | default(0) == 2 else 'WARNING' if hostvars[inventory_hostname][check].rc | default(0) == 1 else 'OK' }}
+          {% endfor %}
+
+          {{ alert_report.stdout }}
+
+          🔍 Next Steps:
+          {% if alert_summary.overall_status == "CRITICAL" %}
+          - 🚨 IMMEDIATE ACTION REQUIRED
+          - Review critical alerts above
+          - Check system resources and services
+          {% elif alert_summary.overall_status == "WARNING" %}
+          - ⚠️  Monitor system closely
+          - Consider preventive maintenance
+          {% else %}
+          - ✅ System is healthy
+          - Continue regular monitoring
+          {% endif %}
+          - Schedule regular checks: crontab -e
+          - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt
+
+          ============================