---
# Prometheus Alerting Rules for Homelab Infrastructure
#
# Layout: one rule group per resource class (availability, CPU, memory, disk,
# network, system). Every alert carries a `severity` label (warning|critical)
# plus `summary` / `description` annotations rendered with Go templates.
#
# Warning and critical variants of the same signal use the same expression
# with different thresholds, so both are active once the critical threshold is
# crossed.
# NOTE(review): presumably Alertmanager inhibition is expected to suppress the
# warning in that case - confirm against the Alertmanager config.

groups:
  - name: host-availability
    interval: 30s
    rules:
      # A scrape target whose job name ends in "-node" has reported up == 0
      # continuously for 2 minutes.
      - alert: HostDown
        expr: up{job=~".*-node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Host {{ $labels.instance }} is down'
          description: >-
            Host {{ $labels.instance }} has been unreachable
            for more than 2 minutes.

      # node_load15 divided by the number of idle-mode CPU series of the same
      # target, so $value is load per core, not the raw load average.
      - alert: HostHighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High load average on {{ $labels.instance }}'
          description: >-
            15-minute load average per CPU core is
            {{ $value | printf "%.2f" }} on {{ $labels.instance }}.

  - name: cpu-alerts
    interval: 30s
    rules:
      # CPU busy percentage: 100 minus the idle rate averaged over all cores
      # of an instance, over a 5-minute window.
      # NOTE(review): this alert previously carried a redaction placeholder as
      # its name (shared with an unrelated disk alert); the name below follows
      # the sibling naming pattern - confirm it matches any routing rules.
      - alert: HostHighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High CPU usage on {{ $labels.instance }}'
          description: 'CPU usage is {{ $value | printf "%.1f" }}% on {{ $labels.instance }}.'

      # Same expression as the warning above with a 95% threshold.
      - alert: HostCriticalCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '🔥 CRITICAL CPU on {{ $labels.instance }}'
          description: >-
            CPU usage is {{ $value | printf "%.1f" }}% on {{ $labels.instance }}.
            Immediate attention required!

  - name: memory-alerts
    interval: 30s
    rules:
      # Used memory percentage, derived from MemAvailable / MemTotal.
      - alert: HostHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High memory usage on {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | printf "%.1f" }}% on {{ $labels.instance }}.'

      # Same expression as the warning above with a 95% threshold.
      - alert: HostCriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '🔥 CRITICAL Memory on {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | printf "%.1f" }}% on {{ $labels.instance }}.'

      # Available memory below 5%. This is the same boundary as
      # HostCriticalMemoryUsage (used > 95%) but with a shorter hold time
      # (2m vs 5m), and $value here is the *available* percentage.
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: '💀 OUT OF MEMORY on {{ $labels.instance }}'
          description: 'Only {{ $value | printf "%.1f" }}% memory available on {{ $labels.instance }}.'

  - name: disk-alerts
    interval: 60s
    rules:
      # Used space percentage per filesystem; tmpfs and overlay are excluded.
      # NOTE(review): other always-full or read-only filesystem types are not
      # excluded by this selector - confirm that is intended for these hosts.
      - alert: HostHighDiskUsage
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Disk space warning on {{ $labels.instance }}'
          description: >-
            Disk {{ $labels.mountpoint }} is {{ $value | printf "%.1f" }}% full
            on {{ $labels.instance }}.

      # Same expression as the warning above with a 90% threshold.
      - alert: HostCriticalDiskUsage
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
          / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: '🔥 CRITICAL Disk space on {{ $labels.instance }}'
          description: >-
            Disk {{ $labels.mountpoint }} is {{ $value | printf "%.1f" }}% full
            on {{ $labels.instance }}.

      # Linear extrapolation of the last 6h of available bytes, projected
      # 24h (24*60*60 seconds) ahead; fires when the projection goes below 0.
      - alert: HostDiskWillFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*60*60) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: 'Disk {{ $labels.mountpoint }} will fill within 24 hours'
          description: >-
            Based on current growth rate, disk on {{ $labels.instance }}
            will be full within 24 hours.

      # A non-tmpfs/overlay filesystem reports readonly == 1 for 1 minute.
      # NOTE(review): this alert previously carried a redaction placeholder as
      # its name (shared with an unrelated CPU alert); the name below is
      # descriptive - confirm it matches any routing rules.
      - alert: HostFilesystemReadOnly
        expr: node_filesystem_readonly{fstype!~"tmpfs|overlay"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: '🔥 Filesystem is read-only on {{ $labels.instance }}'
          description: >-
            Filesystem {{ $labels.mountpoint }} has become read-only.
            This usually indicates disk failure!

  - name: network-alerts
    interval: 30s
    rules:
      # Per-device receive error rate over 5 minutes; devices matching
      # lo, veth*, docker* and br-* are excluded by the selector.
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Network receive errors on {{ $labels.instance }}'
          description: '{{ $labels.device }} has {{ $value | printf "%.0f" }} receive errors/sec.'

      # Transmit-side counterpart of the rule above, same device filter.
      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Network transmit errors on {{ $labels.instance }}'
          description: '{{ $labels.device }} has {{ $value | printf "%.0f" }} transmit errors/sec.'

  - name: system-alerts
    interval: 60s
    rules:
      # Absolute value of the timex offset exceeds 0.5; because of abs(),
      # $value is always non-negative and does not show the drift direction.
      - alert: HostClockSkew
        expr: abs(node_timex_offset_seconds) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Clock skew detected on {{ $labels.instance }}'
          description: 'Clock is off by {{ $value | printf "%.2f" }} seconds.'