Sanitized mirror from private repository - 2026-04-05 08:31:50 UTC
alerting/alert-rules.yml (new file, 146 lines)

# Prometheus Alerting Rules for Homelab Infrastructure

groups:
  - name: host-availability
    interval: 30s
    rules:
      - alert: HostDown
        expr: up{job=~".*-node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "Host {{ $labels.instance }} has been unreachable for more than 2 minutes."

      - alert: HostHighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "15-minute load average is {{ $value | printf \"%.2f\" }} on {{ $labels.instance }}."

  - name: cpu-alerts
    interval: 30s
    rules:
      - alert: REDACTED_APP_PASSWORD
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."

      - alert: HostCriticalCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "🔥 CRITICAL CPU on {{ $labels.instance }}"
          description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}. Immediate attention required!"

  - name: memory-alerts
    interval: 30s
    rules:
      - alert: HostHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."

      - alert: HostCriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "🔥 CRITICAL Memory on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."

      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "💀 OUT OF MEMORY on {{ $labels.instance }}"
          description: "Only {{ $value | printf \"%.1f\" }}% memory available on {{ $labels.instance }}."

  - name: disk-alerts
    interval: 60s
    rules:
      - alert: HostHighDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space warning on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."

      - alert: HostCriticalDiskUsage
        expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "🔥 CRITICAL Disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."

      - alert: HostDiskWillFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*60*60) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Disk {{ $labels.mountpoint }} will fill within 24 hours"
          description: "Based on current growth rate, disk on {{ $labels.instance }} will be full within 24 hours."

      - alert: REDACTED_APP_PASSWORD
        expr: node_filesystem_readonly{fstype!~"tmpfs|overlay"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "🔥 Filesystem is read-only on {{ $labels.instance }}"
          description: "Filesystem {{ $labels.mountpoint }} has become read-only. This usually indicates disk failure!"

  - name: network-alerts
    interval: 30s
    rules:
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network receive errors on {{ $labels.instance }}"
          description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} receive errors/sec."

      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network transmit errors on {{ $labels.instance }}"
          description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} transmit errors/sec."

  - name: system-alerts
    interval: 60s
    rules:
      - alert: HostClockSkew
        expr: abs(node_timex_offset_seconds) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Clock skew detected on {{ $labels.instance }}"
          description: "Clock is off by {{ $value | printf \"%.2f\" }} seconds."
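
Once Prometheus has picked this file up (promtool check rules alerting/alert-rules.yml validates it offline first), the rules API shows what actually loaded. A minimal sketch, assuming Prometheus is published on localhost:9090; adjust the URL to your setup:

#!/usr/bin/env python3
# List the rule groups Prometheus loaded, so a typo in alert-rules.yml
# is caught without waiting for a real alert to (not) fire.
import requests

resp = requests.get("http://localhost:9090/api/v1/rules", timeout=10)
resp.raise_for_status()

for group in resp.json()["data"]["groups"]:
    rule_names = [rule.get("name") for rule in group.get("rules", [])]
    print(f"{group['name']}: {', '.join(rule_names)}")
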
alerting/alertmanager/alertmanager.yml (new file, 49 lines)

# Alertmanager Configuration for Homelab
# Routes alerts to both ntfy (via bridge) and Signal

global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'ntfy-all'

  routes:
    # Critical alerts go to both Signal AND ntfy
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: false

    # Warning alerts go to ntfy only
    - match:
        severity: warning
      receiver: 'ntfy-all'

receivers:
  # ntfy receiver for all alerts (via bridge for nice formatting)
  - name: 'ntfy-all'
    webhook_configs:
      - url: 'http://ntfy-bridge:5001/alert'
        send_resolved: true

  # Critical alerts: Signal + ntfy
  - name: 'critical-alerts'
    webhook_configs:
      # ntfy via bridge (formatted nicely)
      - url: 'http://ntfy-bridge:5001/alert'
        send_resolved: true

      # Signal via bridge service
      - url: 'http://signal-bridge:5000/alert'
        send_resolved: true

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
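
The routing tree can be exercised without a real incident by pushing a synthetic alert into Alertmanager's v2 API. A sketch, assuming Alertmanager is published on localhost:9093 as in the compose file below; a critical alert should fan out to both bridges via the 'critical-alerts' receiver:

#!/usr/bin/env python3
# Post a fake critical alert to Alertmanager and let the routing tree
# deliver it to the ntfy-bridge and signal-bridge webhooks.
from datetime import datetime, timedelta, timezone
import requests

now = datetime.now(timezone.utc)
alerts = [{
    "labels": {
        "alertname": "RoutingTest",
        "severity": "critical",        # matches the critical-alerts route
        "instance": "test-host:9100",
    },
    "annotations": {"summary": "Synthetic alert to verify routing"},
    "startsAt": now.isoformat(),
    "endsAt": (now + timedelta(minutes=5)).isoformat(),
}]

resp = requests.post("http://localhost:9093/api/v2/alerts", json=alerts, timeout=10)
print(resp.status_code)  # 200 means Alertmanager accepted the alert
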
alerting/docker-compose.alerting.yml (new file, 68 lines)

# Alerting Stack for Homelab

services:
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager
      - alertmanager-data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - monitoring-stack_default
      - signal-api-stack_default
      - ntfy-stack_default

  signal-bridge:
    build: ./signal-bridge
    container_name: signal-bridge
    restart: unless-stopped
    ports:
      - "5000:5000"
    environment:
      - SIGNAL_API_URL=http://signal-api:8080
      - SIGNAL_SENDER=REDACTED_PHONE_NUMBER
      - SIGNAL_RECIPIENTS=REDACTED_PHONE_NUMBER
    networks:
      - monitoring-stack_default
      - signal-api-stack_default
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3

  ntfy-bridge:
    build: ./ntfy-bridge
    container_name: ntfy-bridge
    restart: unless-stopped
    ports:
      - "5001:5001"
    environment:
      - NTFY_URL=http://NTFY:80
      # No quotes here: in list-style environment entries they would
      # become a literal part of the value
      - NTFY_TOPIC=REDACTED_NTFY_TOPIC
    networks:
      - monitoring-stack_default
      - ntfy-stack_default
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  alertmanager-data:

networks:
  monitoring-stack_default:
    external: true
  signal-api-stack_default:
    external: true
  ntfy-stack_default:
    external: true
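
After docker compose -f docker-compose.alerting.yml up -d, the bridges' health endpoints give a quick sanity check from the host. A sketch using the ports published above:

#!/usr/bin/env python3
# Poll both bridge health endpoints exposed by the compose file.
import requests

for name, url in [("signal-bridge", "http://localhost:5000/health"),
                  ("ntfy-bridge", "http://localhost:5001/health")]:
    try:
        r = requests.get(url, timeout=5)
        print(f"{name}: {r.status_code} {r.json()}")
    except requests.RequestException as exc:
        print(f"{name}: unreachable ({exc})")
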
alerting/ntfy-bridge/Dockerfile (new file, 5 lines)

FROM python:3.11-slim
WORKDIR /app
RUN pip install --no-cache-dir flask requests gunicorn
COPY app.py .
CMD ["gunicorn", "--bind", "0.0.0.0:5001", "--workers", "2", "app:app"]
alerting/ntfy-bridge/app.py (new file, 104 lines)

from flask import Flask, request, jsonify
import requests
import os

app = Flask(__name__)

NTFY_URL = os.environ.get('NTFY_URL', 'http://NTFY:80')
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', 'homelab-alerts')


def get_status_icon(severity, status):
    if status == 'resolved':
        return 'white_check_mark'
    if severity == 'critical':
        return 'rotating_light'
    return 'warning'


def get_priority(severity, status):
    if status == 'resolved':
        return '3'
    if severity == 'critical':
        return '5'
    return '4'


def format_alert(alert):
    status = alert.get('status', 'firing')
    labels = alert.get('labels', {})
    annotations = alert.get('annotations', {})

    alertname = labels.get('alertname', 'Unknown Alert')
    severity = labels.get('severity', 'warning')
    instance = labels.get('instance', 'unknown')

    status_text = 'RESOLVED' if status == 'resolved' else 'FIRING'
    title = f"{alertname} [{status_text}]"

    summary = annotations.get('summary', '')
    description = annotations.get('description', '')

    body_parts = []
    if summary:
        body_parts.append(summary)
    if description and description != summary:
        body_parts.append(description)
    if instance and instance != 'unknown':
        body_parts.append(f"Host: {instance}")

    body = '\n'.join(body_parts) if body_parts else f"Alert {status_text.lower()} on {instance}"

    return title, body, severity, status


@app.route('/alert', methods=['POST'])
def handle_alert():
    try:
        data = request.json
        alerts = data.get('alerts', [])

        for alert in alerts:
            title, body, severity, status = format_alert(alert)
            priority = get_priority(severity, status)
            tag = get_status_icon(severity, status)

            response = requests.post(
                f"{NTFY_URL}/{NTFY_TOPIC}",
                data=body,
                headers={
                    'Title': title,
                    'Priority': priority,
                    'Tags': tag
                },
                timeout=10  # avoid hanging a gunicorn worker if ntfy is unreachable
            )

            if response.status_code not in [200, 201]:
                print(f"Failed to send to ntfy: {response.status_code} - {response.text}")

        return jsonify({'status': 'sent', 'count': len(alerts)})
    except Exception as e:
        print(f"Error: {e}")
        return jsonify({'status': 'error', 'message': str(e)}), 500


@app.route('/health', methods=['GET'])
def health():
    return jsonify({'status': 'healthy'})


@app.route('/test', methods=['POST'])
def test():
    try:
        data = request.json or {}
        message = data.get('message', 'Test notification from ntfy-bridge')

        response = requests.post(
            f"{NTFY_URL}/{NTFY_TOPIC}",
            data=message,
            headers={
                'Title': 'Test Alert',
                'Priority': '4',
                'Tags': 'test_tube'
            },
            timeout=10
        )
        # Surface ntfy errors instead of always reporting success
        response.raise_for_status()
        return jsonify({'status': 'sent'})
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5001)
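
The /alert handler expects the standard Alertmanager webhook shape: a JSON object with a top-level "alerts" list. This sketch feeds it one firing alert from the host, assuming the published port 5001:

#!/usr/bin/env python3
# Simulate an Alertmanager webhook delivery to the ntfy bridge and check
# that severity/status map onto ntfy priority and tags as intended.
import requests

payload = {
    "status": "firing",
    "alerts": [{
        "status": "firing",
        "labels": {
            "alertname": "HostHighMemoryUsage",
            "severity": "warning",
            "instance": "test-host:9100",
        },
        "annotations": {
            "summary": "High memory usage on test-host:9100",
            "description": "Memory usage is 91.2% on test-host:9100.",
        },
    }],
}

resp = requests.post("http://localhost:5001/alert", json=payload, timeout=10)
print(resp.json())  # expect {'status': 'sent', 'count': 1}
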
alerting/signal-bridge/Dockerfile (new file, 11 lines)

FROM python:3.11-slim

WORKDIR /app

RUN pip install --no-cache-dir flask requests gunicorn

COPY app.py .

EXPOSE 5000

CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "60", "app:app"]
alerting/signal-bridge/app.py (new file, 130 lines)

#!/usr/bin/env python3
"""
Signal Bridge for Alertmanager
Receives webhooks from Alertmanager and forwards to Signal API
"""

import os
import json
import requests
from flask import Flask, request, jsonify

app = Flask(__name__)

# Configuration from environment variables
SIGNAL_API_URL = os.environ.get('SIGNAL_API_URL', 'http://signal-api:8080')
SIGNAL_SENDER = os.environ.get('SIGNAL_SENDER', '')  # Your Signal number
# Comma-separated; empty entries are dropped so the unconfigured case is detected
SIGNAL_RECIPIENTS = [r.strip() for r in os.environ.get('SIGNAL_RECIPIENTS', '').split(',') if r.strip()]


def format_alert_message(alert_data):
    """Format Alertmanager webhook payload into a readable message"""
    messages = []

    status = alert_data.get('status', 'unknown')

    for alert in alert_data.get('alerts', []):
        alert_status = alert.get('status', status)
        labels = alert.get('labels', {})
        annotations = alert.get('annotations', {})

        severity = labels.get('severity', 'unknown')
        alertname = labels.get('alertname', 'Unknown Alert')
        instance = labels.get('instance', 'unknown')

        summary = annotations.get('summary', alertname)
        description = annotations.get('description', '')

        # Status emoji
        if alert_status == 'resolved':
            status_emoji = '✅'
            status_text = 'RESOLVED'
        elif severity == 'critical':
            status_emoji = '🚨'
            status_text = 'CRITICAL'
        else:
            status_emoji = '⚠️'
            status_text = 'WARNING'

        msg = f"{status_emoji} [{status_text}] {summary}"
        if description:
            msg += f"\n{description}"

        messages.append(msg)

    return "\n\n".join(messages)


def send_signal_message(message):
    """Send message via Signal API"""
    if not SIGNAL_SENDER or not SIGNAL_RECIPIENTS:
        app.logger.error("Signal sender or recipients not configured")
        return False

    success = True
    for recipient in SIGNAL_RECIPIENTS:
        try:
            payload = {
                "message": message,
                "number": SIGNAL_SENDER,
                "recipients": [recipient]
            }

            response = requests.post(
                f"{SIGNAL_API_URL}/v2/send",
                json=payload,
                timeout=30
            )

            if response.status_code in [200, 201]:
                app.logger.info(f"Message sent to {recipient}")
            else:
                app.logger.error(f"Failed to send to {recipient}: {response.status_code} - {response.text}")
                success = False

        except Exception as e:
            app.logger.error(f"Error sending to {recipient}: {e}")
            success = False

    return success


@app.route('/health', methods=['GET'])
def health():
    return jsonify({"status": "healthy"}), 200


@app.route('/alert', methods=['POST'])
def receive_alert():
    """Receive alert from Alertmanager and forward to Signal"""
    try:
        alert_data = request.get_json()

        if not alert_data:
            return jsonify({"error": "No data received"}), 400

        app.logger.info(f"Received alert: {json.dumps(alert_data, indent=2)}")

        message = format_alert_message(alert_data)

        if send_signal_message(message):
            return jsonify({"status": "sent"}), 200
        else:
            return jsonify({"status": "partial_failure"}), 207

    except Exception as e:
        app.logger.error(f"Error processing alert: {e}")
        return jsonify({"error": str(e)}), 500


@app.route('/test', methods=['POST'])
def test_message():
    """Send a test message"""
    data = request.get_json(silent=True) or {}  # tolerate an empty or non-JSON body
    message = data.get('message', '🧪 Test alert from Signal Bridge')

    if send_signal_message(message):
        return jsonify({"status": "sent"}), 200
    else:
        return jsonify({"status": "failed"}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
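
Because format_alert_message is a pure function, the Signal message layout can be checked without sending anything. A sketch meant to run next to app.py (the import path is an assumption):

#!/usr/bin/env python3
# Render a critical + resolved pair through format_alert_message to
# eyeball the message layout offline.
from app import format_alert_message  # assumes this script sits beside app.py

payload = {
    "status": "firing",
    "alerts": [
        {
            "status": "firing",
            "labels": {"alertname": "HostDown", "severity": "critical",
                       "instance": "nas:9100"},
            "annotations": {"summary": "Host nas:9100 is down",
                            "description": "Unreachable for more than 2 minutes."},
        },
        {
            "status": "resolved",
            "labels": {"alertname": "HostClockSkew", "severity": "warning",
                       "instance": "nas:9100"},
            "annotations": {"summary": "Clock skew detected on nas:9100"},
        },
    ],
}

print(format_alert_message(payload))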