# homelab-optimized/ansible/automation/playbooks/alert_check.yml

---
# Alert Check and Notification Playbook
# Monitors system conditions and sends alerts when thresholds are exceeded
# Usage: ansible-playbook playbooks/alert_check.yml
# Usage: ansible-playbook playbooks/alert_check.yml -e "alert_mode=test"

# Play definition: runs every health check below against all inventory hosts.
- name: Infrastructure Alert Monitoring
  hosts: all
  gather_facts: true
  vars:
    # Base directory for the per-host alert reports written by this play.
    alert_config_dir: '/tmp/alerts'
    # Fallback used when alert_mode is not passed in (production, test, silent).
    default_alert_mode: 'production'

    # Alert thresholds. cpu/memory/disk are percentages; load is compared
    # against the 1-minute load average.
    thresholds:
      cpu:
        warning: 80
        critical: 95
      memory:
        warning: 85
        critical: 95
      disk:
        warning: 85
        critical: 95
      load:
        warning: 4.0
        critical: 8.0
      # The Docker check goes critical when MORE than this many containers
      # are stopped (it compares with -gt).
      container_down_critical: 1

    # Notification settings; each value can be overridden with a variable of
    # the same name (e.g. -e ntfy_url=...).
    # NOTE(review): email_enabled and slack_webhook are not referenced by any
    # task visible in this file.
    notifications:
      ntfy_url: "{{ ntfy_url | default('https://ntfy.sh/REDACTED_TOPIC') }}"
      email_enabled: "{{ email_enabled | default(false) }}"
      slack_webhook: "{{ slack_webhook | default('') }}"

  tasks:
    # Ensures <alert_config_dir>/<inventory_hostname> exists on the target
    # host so the report task has somewhere to write.
    - name: Create alert configuration directory
      file:
        state: directory
        path: "{{ alert_config_dir }}/{{ inventory_hostname }}"
        mode: "0755"

    # Prints the host, date, effective alert mode and the warning/critical
    # thresholds before any check runs.
    - name: Display alert monitoring plan
      vars:
        # alert_mode is optional; fall back to the play-level default.
        effective_alert_mode: "{{ alert_mode | default(default_alert_mode) }}"
      debug:
        msg: |
          🚨 ALERT MONITORING INITIATED
          =============================
          🖥️  Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔔 Mode: {{ effective_alert_mode }}
          📊 CPU: {{ thresholds.cpu.warning }}%/{{ thresholds.cpu.critical }}%
          💾 Memory: {{ thresholds.memory.warning }}%/{{ thresholds.memory.critical }}%
          💿 Disk: {{ thresholds.disk.warning }}%/{{ thresholds.disk.critical }}%
          ⚖️  Load: {{ thresholds.load.warning }}/{{ thresholds.load.critical }}

    # Samples CPU usage and classifies it against thresholds.cpu.
    # The script's exit code is the result, read later via cpu_alert.rc:
    #   0 = OK, 1 = WARNING, 2 = CRITICAL.
    - name: Check CPU usage with alerting
      shell: |
        # Take the second field of top's "Cpu(s)" line, stripping any "%" suffix.
        # NOTE(review): on common top builds this field is the user-space
        # percentage only, not total utilisation - confirm this is intended.
        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | awk -F'%' '{print $1}')
        if [ -z "$cpu_usage" ]; then
          # Fallback: 100 minus column 15 of the second vmstat sample.
          cpu_usage=$(vmstat 1 2 | tail -1 | awk '{print 100-$15}')
        fi

        # [ -gt ] only compares integers, so drop the fractional part.
        cpu_int=$(echo "$cpu_usage" | cut -d'.' -f1)

        echo "🖥️  CPU Usage: ${cpu_usage}%"

        if [ "$cpu_int" -gt "{{ thresholds.cpu.critical }}" ]; then
          echo "CRITICAL:CPU:${cpu_usage}%"
          exit 2
        elif [ "$cpu_int" -gt "{{ thresholds.cpu.warning }}" ]; then
          echo "WARNING:CPU:${cpu_usage}%"
          exit 1
        else
          echo "OK:CPU:${cpu_usage}%"
          exit 0
        fi
      register: cpu_alert
      # Read-only probe: it never modifies the host, so never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Computes memory usage and classifies it against thresholds.memory.
    # Exit code is the result (memory_alert.rc): 0 = OK, 1 = WARNING, 2 = CRITICAL.
    - name: Check memory usage with alerting
      shell: |
        # Column 3 as a rounded percentage of column 2 on the second line of
        # `free` (the "Mem:" row, used vs. total, in standard procps output).
        memory_usage=$(free | awk 'NR==2{printf "%.0f", $3*100/$2}')

        echo "💾 Memory Usage: ${memory_usage}%"

        if [ "$memory_usage" -gt "{{ thresholds.memory.critical }}" ]; then
          echo "CRITICAL:MEMORY:${memory_usage}%"
          exit 2
        elif [ "$memory_usage" -gt "{{ thresholds.memory.warning }}" ]; then
          echo "WARNING:MEMORY:${memory_usage}%"
          exit 1
        else
          echo "OK:MEMORY:${memory_usage}%"
          exit 0
        fi
      register: memory_alert
      # Read-only probe: it never modifies the host, so never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Checks the usage percentage of every mounted filesystem against
    # thresholds.disk. Exit code is the result (disk_alert.rc):
    #   0 = all OK, 1 = at least one WARNING, 2 = at least one CRITICAL.
    - name: Check disk usage with alerting
      shell: |
        # The while loop runs in a pipeline subshell, so findings are passed
        # back through scratch files. mktemp gives unpredictable names and the
        # trap removes them on every exit path.
        critical_list=$(mktemp)
        warning_list=$(mktemp)
        trap 'rm -f "$critical_list" "$warning_list"' EXIT

        echo "💿 Disk Usage Check:"
        # -P keeps each filesystem on a single line even for long device names.
        df -hP | awk 'NR>1 {print $5 " " $6}' | while read -r usage_field partition; do
          usage=$(echo "$usage_field" | sed 's/%//')

          echo "  $partition: ${usage}%"

          # Some filesystems report "-" instead of a percentage; skip anything
          # that is not a plain integer so [ -gt ] never sees bad input.
          case "$usage" in
            ''|*[!0-9]*) continue ;;
          esac

          if [ "$usage" -gt "{{ thresholds.disk.critical }}" ]; then
            echo "CRITICAL:DISK:$partition:${usage}%"
            echo "$partition:$usage" >> "$critical_list"
          elif [ "$usage" -gt "{{ thresholds.disk.warning }}" ]; then
            echo "WARNING:DISK:$partition:${usage}%"
            echo "$partition:$usage" >> "$warning_list"
          fi
        done

        # The scratch files always exist now, so test for content (-s).
        if [ -s "$critical_list" ]; then
          echo "Critical disk alerts:"
          cat "$critical_list"
          exit 2
        elif [ -s "$warning_list" ]; then
          echo "Disk warnings:"
          cat "$warning_list"
          exit 1
        else
          echo "OK:DISK:All partitions normal"
          exit 0
        fi
      register: disk_alert
      # Read-only probe (scratch files are cleaned up): never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Compares the 1-minute load average against thresholds.load.
    # Exit code is the result (load_alert.rc): 0 = OK, 1 = WARNING, 2 = CRITICAL.
    - name: Check load average with alerting
      shell: |
        # LC_ALL=C forces "." as the decimal separator so the comma stripping
        # below only removes the field separator.
        load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | sed 's/,//')

        echo "⚖️  Load Average (1min): $load_avg"

        # Use bc for floating point comparison if available, otherwise use awk.
        # The shell module runs /bin/sh, where the bash-only "&>" would
        # background the command and make this test always succeed, so use
        # the POSIX redirect form.
        if command -v bc > /dev/null 2>&1; then
          critical_check=$(echo "$load_avg > {{ thresholds.load.critical }}" | bc -l)
          warning_check=$(echo "$load_avg > {{ thresholds.load.warning }}" | bc -l)
        else
          critical_check=$(awk "BEGIN {print ($load_avg > {{ thresholds.load.critical }})}")
          warning_check=$(awk "BEGIN {print ($load_avg > {{ thresholds.load.warning }})}")
        fi

        if [ "$critical_check" = "1" ]; then
          echo "CRITICAL:LOAD:${load_avg}"
          exit 2
        elif [ "$warning_check" = "1" ]; then
          echo "WARNING:LOAD:${load_avg}"
          exit 1
        else
          echo "OK:LOAD:${load_avg}"
          exit 0
        fi
      register: load_alert
      # Read-only probe: it never modifies the host, so never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Counts total, running, stopped and unhealthy Docker containers.
    # Exit code is the result (docker_alert.rc):
    #   2 = any unhealthy container, or more than
    #       thresholds.container_down_critical stopped containers
    #   1 = at least one stopped container
    #   0 = everything running, or Docker is not usable on this host
    - name: Check Docker container health
      shell: |
        # POSIX redirects on purpose: under /bin/sh the bash-only "&>" would
        # background these commands and the test would always succeed.
        if command -v docker > /dev/null 2>&1 && docker info > /dev/null 2>&1; then
          total_containers=$(docker ps -a -q | wc -l)
          running_containers=$(docker ps -q | wc -l)
          unhealthy_containers=$(docker ps --filter health=unhealthy -q | wc -l)
          stopped_containers=$((total_containers - running_containers))

          echo "🐳 Docker Container Status:"
          echo "  Total: $total_containers"
          echo "  Running: $running_containers"
          echo "  Stopped: $stopped_containers"
          echo "  Unhealthy: $unhealthy_containers"

          if [ "$unhealthy_containers" -gt "0" ] || [ "$stopped_containers" -gt "{{ thresholds.container_down_critical }}" ]; then
            echo "CRITICAL:DOCKER:$stopped_containers stopped, $unhealthy_containers unhealthy"
            exit 2
          elif [ "$stopped_containers" -gt "0" ]; then
            echo "WARNING:DOCKER:$stopped_containers containers stopped"
            exit 1
          else
            echo "OK:DOCKER:All containers healthy"
            exit 0
          fi
        else
          echo "ℹ️  Docker not available - skipping container checks"
          echo "OK:DOCKER:Not installed"
          exit 0
        fi
      register: docker_alert
      # Read-only probe: it never modifies the host, so never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Verifies that each critical systemd unit is active.
    # Exit code is the result (services_alert.rc):
    #   2 = at least one unit is not active, 0 = all active.
    - name: Check critical services
      shell: |
        # Plain word list instead of a bash array: the shell module runs
        # /bin/sh, where array syntax is a parse error that exits with rc 2
        # and would be counted as a false CRITICAL.
        critical_services="ssh systemd-resolved"
        failed_services=""

        echo "🔧 Critical Services Check:"

        for service in $critical_services; do
          if systemctl is-active --quiet "$service" 2>/dev/null; then
            echo "  ✅ $service: running"
          else
            echo "  🚨 $service: not running"
            failed_services="$failed_services $service"
          fi
        done

        if [ -n "$failed_services" ]; then
          echo "CRITICAL:SERVICES:$failed_services"
          exit 2
        else
          echo "OK:SERVICES:All critical services running"
          exit 0
        fi
      register: services_alert
      # Read-only probe: it never modifies the host, so never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Probes outbound connectivity (ICMP to 8.8.8.8) and DNS resolution
    # (nslookup google.com). Exit code is the result (network_alert.rc):
    #   2 = ping failed, 1 = only DNS failed, 0 = both succeeded.
    - name: Check network connectivity
      shell: |
        echo "🌐 Network Connectivity Check:"

        # Check internet connectivity.
        # POSIX redirects on purpose: under /bin/sh the bash-only "&>" would
        # background ping/nslookup and both tests would always succeed.
        if ping -c 1 -W 5 8.8.8.8 > /dev/null 2>&1; then
          echo "  ✅ Internet: OK"
          internet_status="OK"
        else
          echo "  🚨 Internet: FAILED"
          internet_status="FAILED"
        fi

        # Check DNS resolution
        if nslookup google.com > /dev/null 2>&1; then
          echo "  ✅ DNS: OK"
          dns_status="OK"
        else
          echo "  ⚠️  DNS: FAILED"
          dns_status="FAILED"
        fi

        if [ "$internet_status" = "FAILED" ]; then
          echo "CRITICAL:NETWORK:No internet connectivity"
          exit 2
        elif [ "$dns_status" = "FAILED" ]; then
          echo "WARNING:NETWORK:DNS resolution issues"
          exit 1
        else
          echo "OK:NETWORK:All connectivity normal"
          exit 0
        fi
      register: network_alert
      # Read-only probe: it never modifies the host, so never report "changed".
      changed_when: false
      # Non-zero exit codes carry the alert level; they are not task failures.
      failed_when: false

    # Folds the seven registered check results into alert_summary:
    #   critical_count - number of checks that exited with rc 2
    #   warning_count  - number of checks that exited with rc 1
    #   overall_status - CRITICAL if any rc is 2, else WARNING if any rc is 1,
    #                    else OK
    - name: Evaluate overall alert status
      vars:
        # Return codes of every check that actually produced one.
        check_return_codes: >-
          {{
            [cpu_alert, memory_alert, disk_alert, load_alert, docker_alert, services_alert, network_alert]
            | selectattr('rc', 'defined')
            | map(attribute='rc')
            | list
          }}
      set_fact:
        alert_summary:
          critical_count: >-
            {{ check_return_codes | select('equalto', 2) | list | length }}
          warning_count: >-
            {{ check_return_codes | select('equalto', 1) | list | length }}
          overall_status: >-
            {{
              'CRITICAL' if 2 in check_return_codes
              else 'WARNING' if 1 in check_return_codes
              else 'OK'
            }}

    # Writes a timestamped text report with the summary and the raw stdout of
    # every check to <alert_config_dir>/<inventory_hostname>/ on the target.
    # The task's own stdout (the "saved to" line) is registered as
    # alert_report and shown by the final summary task.
    - name: Generate alert report
      shell: |
        alert_file="{{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_{{ ansible_date_time.epoch }}.txt"

        {
          echo "🚨 INFRASTRUCTURE ALERT REPORT"
          echo "==============================="
          echo "Host: {{ inventory_hostname }}"
          echo "Date: {{ ansible_date_time.iso8601 }}"
          echo "Overall Status: {{ alert_summary.overall_status }}"
          echo "Critical Alerts: {{ alert_summary.critical_count }}"
          echo "Warning Alerts: {{ alert_summary.warning_count }}"
          echo ""

          echo "📊 DETAILED RESULTS:"
          echo "==================="
        {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
          echo ""
          echo "{{ check | upper | replace('_ALERT', '') }}:"
          # Check output is arbitrary command text: pass it through the quote
          # filter so quotes, "$" or backticks in it cannot break or inject
          # into this script.
          printf '%s\n' {{ hostvars[inventory_hostname][check].stdout | default('No output') | quote }}
        {% endfor %}
        } > "$alert_file"

        echo "Alert report saved to: $alert_file"
      register: alert_report

    # Posts an urgent-priority message to the ntfy topic when the overall
    # status is CRITICAL. Skipped in "silent" mode or when no ntfy URL is set;
    # delivery errors are ignored so the play carries on.
    - name: Send NTFY notification for critical alerts
      when:
        - notifications.ntfy_url != ""
        - (alert_mode | default(default_alert_mode)) != "silent"
        - alert_summary.overall_status == "CRITICAL"
      uri:
        method: POST
        url: "{{ notifications.ntfy_url }}"
        headers:
          Title: "Homelab Critical Alert"
          Priority: "urgent"
          Tags: "warning,critical,{{ inventory_hostname }}"
        body: |
          🚨 CRITICAL ALERT: {{ inventory_hostname }}

          Status: {{ alert_summary.overall_status }}
          Critical: {{ alert_summary.critical_count }}
          Warnings: {{ alert_summary.warning_count }}

          Time: {{ ansible_date_time.iso8601 }}
      ignore_errors: true

    # Posts a default-priority message to the ntfy topic when the overall
    # status is WARNING. Skipped in "silent" mode or when no ntfy URL is set;
    # delivery errors are ignored so the play carries on.
    - name: Send NTFY notification for warning alerts
      when:
        - notifications.ntfy_url != ""
        - (alert_mode | default(default_alert_mode)) != "silent"
        - alert_summary.overall_status == "WARNING"
      uri:
        method: POST
        url: "{{ notifications.ntfy_url }}"
        headers:
          Title: "Homelab Warning"
          Priority: "default"
          Tags: "warning,{{ inventory_hostname }}"
        body: |
          ⚠️ WARNING: {{ inventory_hostname }}

          Status: {{ alert_summary.overall_status }}
          Warnings: {{ alert_summary.warning_count }}

          Time: {{ ansible_date_time.iso8601 }}
      ignore_errors: true

    # Posts a low-priority test message to the ntfy topic whenever the
    # effective alert mode is "test", regardless of the overall status.
    # Delivery errors are ignored so the play carries on.
    - name: Send test notification
      when:
        - notifications.ntfy_url != ""
        - (alert_mode | default(default_alert_mode)) == "test"
      uri:
        method: POST
        url: "{{ notifications.ntfy_url }}"
        headers:
          Title: "Homelab Alert Test"
          Priority: "low"
          Tags: "test,{{ inventory_hostname }}"
        body: |
          🧪 TEST ALERT: {{ inventory_hostname }}

          This is a test notification from the alert monitoring system.

          Status: {{ alert_summary.overall_status }}
          Time: {{ ansible_date_time.iso8601 }}
      ignore_errors: true

    # Final console summary: effective mode, aggregate counts, one status line
    # per check (rc 2 -> CRITICAL, rc 1 -> WARNING, anything else -> OK), the
    # report task's output, and next steps matching the overall status.
    - name: Display alert summary
      debug:
        msg: |

          🚨 ALERT MONITORING COMPLETE
          ============================
          🖥️  Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔔 Mode: {{ alert_mode | default(default_alert_mode) }}

          📊 ALERT SUMMARY:
          Overall Status: {{ alert_summary.overall_status }}
          Critical Alerts: {{ alert_summary.critical_count }}
          Warning Alerts: {{ alert_summary.warning_count }}

          📋 CHECK RESULTS:
          {% for check in ['cpu_alert', 'memory_alert', 'disk_alert', 'load_alert', 'docker_alert', 'services_alert', 'network_alert'] %}
          {{ check | replace('_alert', '') | upper }}: {{ {1: 'WARNING', 2: 'CRITICAL'}.get(hostvars[inventory_hostname][check].rc | default(0), 'OK') }}
          {% endfor %}

          {{ alert_report.stdout }}

          🔍 Next Steps:
          {% if alert_summary.overall_status == "CRITICAL" %}
          - 🚨 IMMEDIATE ACTION REQUIRED
          - Review critical alerts above
          - Check system resources and services
          {% elif alert_summary.overall_status == "WARNING" %}
          - ⚠️  Monitor system closely
          - Consider preventive maintenance
          {% else %}
          - ✅ System is healthy
          - Continue regular monitoring
          {% endif %}
          - Schedule regular checks: crontab -e
          - View full report: cat {{ alert_config_dir }}/{{ inventory_hostname }}/alert_report_*.txt

          ============================