# File: homelab-optimized/alerting/alert-rules.yml

# Prometheus Alerting Rules for Homelab Infrastructure

groups:
  # Host reachability and sustained-load alerts. Rules are evaluated every 30s.
  - name: host-availability
    interval: 30s
    rules:
      # Fires when a target whose job label ends in "-node" has reported
      # up == 0 continuously for 2 minutes.
      - alert: HostDown
        expr: up{job=~".*-node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "Host {{ $labels.instance }} has been unreachable for more than 2 minutes."

      # Divides the 15-minute load average by the number of idle-mode CPU
      # series remaining after dropping the cpu and mode labels, so the
      # threshold of 2 applies to load *per CPU*, not to the raw load average.
      - alert: HostHighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          # $value is the result of the whole expression (load divided by CPU
          # count), so the message reports it as a per-core figure.
          description: "15-minute load average per CPU core is {{ $value | printf \"%.2f\" }} on {{ $labels.instance }}."

  # CPU utilisation alerts. Usage is computed as 100 minus the per-instance
  # average idle rate over a 5-minute window, expressed as a percentage.
  - name: cpu-alerts
    interval: 30s
    rules:
      # Warning tier: usage above 80% sustained for 5 minutes.
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."

      # Critical tier: same expression with a 95% threshold.
      - alert: HostCriticalCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "🔥 CRITICAL CPU on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}. Immediate attention required!"

  # Memory pressure alerts, all derived from the ratio
  # node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes.
  - name: memory-alerts
    interval: 30s
    rules:
      # Warning tier: more than 85% of memory in use for 5 minutes.
      - alert: HostHighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | printf "%.1f" }}% on {{ $labels.instance }}.'

      # Critical tier: more than 95% of memory in use for 5 minutes.
      - alert: HostCriticalMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '🔥 CRITICAL Memory on {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | printf "%.1f" }}% on {{ $labels.instance }}.'

      # Less than 5% available is the same condition as more than 95% used,
      # but this rule reports the available percentage and fires after only
      # 2 minutes instead of 5.
      - alert: HostOutOfMemory
        expr: >-
          node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
          * 100 < 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '💀 OUT OF MEMORY on {{ $labels.instance }}'
          description: 'Only {{ $value | printf "%.1f" }}% memory available on {{ $labels.instance }}.'

  # Filesystem capacity and health alerts. Every rule skips filesystems whose
  # fstype is tmpfs or overlay. Rules are evaluated every 60s.
  - name: disk-alerts
    interval: 60s
    rules:
      # Warning tier: used fraction (1 - avail/size) above 80% for 5 minutes.
      - alert: HostHighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space warning on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."

      # Critical tier: same expression with a 90% threshold.
      - alert: HostCriticalDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "🔥 CRITICAL Disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."

      # Extrapolates the last 6h of available bytes 24h (24*60*60 seconds)
      # ahead; fires when the projection stays below zero for 30 minutes.
      - alert: HostDiskWillFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*60*60) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Disk {{ $labels.mountpoint }} will fill within 24 hours"
          description: "Based on current growth rate, disk on {{ $labels.instance }} will be full within 24 hours."

      # Fires when a filesystem has reported its read-only flag as 1 for
      # 1 minute.
      - alert: HostDiskReadOnly
        expr: node_filesystem_readonly{fstype!~"tmpfs|overlay"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔥 Filesystem is read-only on {{ $labels.instance }}"
          description: "Filesystem {{ $labels.mountpoint }} has become read-only. This usually indicates disk failure!"

  # Network interface error-rate alerts. Devices matching lo, veth*, docker*
  # and br-* are excluded from both rules.
  - name: network-alerts
    interval: 30s
    rules:
      # More than 10 receive errors per second (5-minute rate) for 5 minutes.
      - alert: HostNetworkReceiveErrors
        expr: >-
          rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m])
          > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Network receive errors on {{ $labels.instance }}'
          description: '{{ $labels.device }} has {{ $value | printf "%.0f" }} receive errors/sec.'

      # More than 10 transmit errors per second (5-minute rate) for 5 minutes.
      - alert: HostNetworkTransmitErrors
        expr: >-
          rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m])
          > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Network transmit errors on {{ $labels.instance }}'
          description: '{{ $labels.device }} has {{ $value | printf "%.0f" }} transmit errors/sec.'

  # General system-health alerts. Rules are evaluated every 60s.
  - name: system-alerts
    interval: 60s
    rules:
      # Fires when the absolute value of node_timex_offset_seconds stays
      # above 0.5 for 5 minutes. abs() catches offsets in either direction,
      # so the reported $value is always non-negative.
      - alert: HostClockSkew
        expr: 'abs(node_timex_offset_seconds) > 0.5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Clock skew detected on {{ $labels.instance }}'
          description: 'Clock is off by {{ $value | printf "%.2f" }} seconds.'