Sanitized mirror from private repository - 2026-04-16 07:04:43 UTC
This commit is contained in:
146
docs/infrastructure/monitoring/prometheus/alert-rules.yml
Normal file
146
docs/infrastructure/monitoring/prometheus/alert-rules.yml
Normal file
@@ -0,0 +1,146 @@
|
||||
# Prometheus Alerting Rules for Homelab Infrastructure
|
||||
|
||||
groups:
|
||||
- name: host-availability
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HostDown
|
||||
expr: up{job=~".*-node"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Host {{ $labels.instance }} is down"
|
||||
description: "Host {{ $labels.instance }} has been unreachable for more than 2 minutes."
|
||||
|
||||
- alert: HostHighLoadAverage
|
||||
expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High load average on {{ $labels.instance }}"
|
||||
description: "15-minute load average is {{ $value | printf \"%.2f\" }} on {{ $labels.instance }}."
|
||||
|
||||
- name: cpu-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: REDACTED_APP_PASSWORD
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage on {{ $labels.instance }}"
|
||||
description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostCriticalCpuUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 CRITICAL CPU on {{ $labels.instance }}"
|
||||
description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}. Immediate attention required!"
|
||||
|
||||
- name: memory-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HostHighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.instance }}"
|
||||
description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostCriticalMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 CRITICAL Memory on {{ $labels.instance }}"
|
||||
description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "💀 OUT OF MEMORY on {{ $labels.instance }}"
|
||||
description: "Only {{ $value | printf \"%.1f\" }}% memory available on {{ $labels.instance }}."
|
||||
|
||||
- name: disk-alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- alert: HostHighDiskUsage
|
||||
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk space warning on {{ $labels.instance }}"
|
||||
description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostCriticalDiskUsage
|
||||
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 CRITICAL Disk space on {{ $labels.instance }}"
|
||||
description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*60*60) < 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk {{ $labels.mountpoint }} will fill within 24 hours"
|
||||
description: "Based on current growth rate, disk on {{ $labels.instance }} will be full within 24 hours."
|
||||
|
||||
- alert: REDACTED_APP_PASSWORD
|
||||
expr: node_filesystem_readonly{fstype!~"tmpfs|overlay"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 Filesystem is read-only on {{ $labels.instance }}"
|
||||
description: "Filesystem {{ $labels.mountpoint }} has become read-only. This usually indicates disk failure!"
|
||||
|
||||
- name: network-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network receive errors on {{ $labels.instance }}"
|
||||
description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} receive errors/sec."
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network transmit errors on {{ $labels.instance }}"
|
||||
description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} transmit errors/sec."
|
||||
|
||||
- name: system-alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- alert: HostClockSkew
|
||||
expr: abs(node_timex_offset_seconds) > 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Clock skew detected on {{ $labels.instance }}"
|
||||
description: "Clock is off by {{ $value | printf \"%.2f\" }} seconds."
|
||||
117
docs/infrastructure/monitoring/prometheus/prometheus.yml
Normal file
117
docs/infrastructure/monitoring/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,117 @@
|
||||
# Updated Prometheus Configuration with Alertmanager
|
||||
# This adds alerting configuration to your existing prometheus.yml
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s # How often to evaluate rules
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
# Load alerting rules
|
||||
rule_files:
|
||||
- /etc/prometheus/alert-rules.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["prometheus:9090"]
|
||||
|
||||
- job_name: "alertmanager"
|
||||
static_configs:
|
||||
- targets: ["alertmanager:9093"]
|
||||
|
||||
- job_name: "homelab-node"
|
||||
static_configs:
|
||||
- targets: ["100.67.40.126:9100"]
|
||||
|
||||
- job_name: "raspberry-pis"
|
||||
static_configs:
|
||||
- targets: ["100.77.151.40:9100"] # pi-5
|
||||
- targets: ["100.123.246.75:9100"] # pi-5-kevin
|
||||
|
||||
- job_name: "setillo-node"
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9100"]
|
||||
|
||||
- job_name: "setillo-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.125.0.20"
|
||||
- target_label: __address__
|
||||
replacement: "100.125.0.20:9116"
|
||||
|
||||
- job_name: "calypso-node"
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9100"]
|
||||
|
||||
- job_name: "calypso-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.103.48.78"
|
||||
- target_label: __address__
|
||||
replacement: "100.103.48.78:9116"
|
||||
|
||||
- job_name: "atlantis-node"
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9100"]
|
||||
|
||||
- job_name: "atlantis-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.83.230.112"
|
||||
- target_label: __address__
|
||||
replacement: "100.83.230.112:9116"
|
||||
|
||||
- job_name: "concord-nuc-node"
|
||||
static_configs:
|
||||
- targets: ["100.72.55.21:9100"]
|
||||
|
||||
- job_name: "truenas-node"
|
||||
static_configs:
|
||||
- targets: ["100.75.252.64:9100"]
|
||||
|
||||
- job_name: "vmi2076105-node"
|
||||
static_configs:
|
||||
- targets: ["100.99.156.20:9100"]
|
||||
|
||||
- job_name: "proxmox-node"
|
||||
static_configs:
|
||||
- targets: ["100.87.12.28:9100"]
|
||||
Reference in New Issue
Block a user