Sanitized mirror from private repository - 2026-04-19 08:46:29 UTC
Some checks failed
Documentation / Build Docusaurus (push) Failing after 17m32s
Documentation / Deploy to GitHub Pages (push) Has been skipped

This commit is contained in:
Gitea Mirror Bot
2026-04-19 08:46:29 +00:00
commit 11d496f233
1439 changed files with 363180 additions and 0 deletions

146
alerting/alert-rules.yml Normal file
View File

@@ -0,0 +1,146 @@
# Prometheus Alerting Rules for Homelab Infrastructure
groups:
  # Reachability alerts based on the scrape-level `up` metric.
  - name: host-availability
    interval: 30s
    rules:
      # Fires when a node-exporter target has been unscrapeable for 2m.
      - alert: HostDown
        expr: up{job=~".*-node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "Host {{ $labels.instance }} has been unreachable for more than 2 minutes."
      # Load average normalised by CPU count (the count over idle-mode
      # series yields the number of CPUs per instance); > 2x cores for
      # 10 minutes suggests sustained saturation.
      - alert: HostHighLoadAverage
        expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "15-minute load average is {{ $value | printf \"%.2f\" }} on {{ $labels.instance }}."
- name: cpu-alerts
interval: 30s
rules:
- alert: REDACTED_APP_PASSWORD
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
- alert: HostCriticalCpuUsage
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
for: 5m
labels:
severity: critical
annotations:
summary: "🔥 CRITICAL CPU on {{ $labels.instance }}"
description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}. Immediate attention required!"
  # Memory alerts: usage derived from MemAvailable (kernel's estimate of
  # memory usable without swapping), not MemFree.
  - name: memory-alerts
    interval: 30s
    rules:
      - alert: HostHighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
      - alert: HostCriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "🔥 CRITICAL Memory on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
      # Shorter `for` window than the usage alerts: OOM risk is imminent.
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "💀 OUT OF MEMORY on {{ $labels.instance }}"
          description: "Only {{ $value | printf \"%.1f\" }}% memory available on {{ $labels.instance }}."
- name: disk-alerts
interval: 60s
rules:
- alert: HostHighDiskUsage
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Disk space warning on {{ $labels.instance }}"
description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."
- alert: HostCriticalDiskUsage
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
for: 5m
labels:
severity: critical
annotations:
summary: "🔥 CRITICAL Disk space on {{ $labels.instance }}"
description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."
- alert: HostDiskWillFillIn24Hours
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*60*60) < 0
for: 30m
labels:
severity: warning
annotations:
summary: "Disk {{ $labels.mountpoint }} will fill within 24 hours"
description: "Based on current growth rate, disk on {{ $labels.instance }} will be full within 24 hours."
- alert: REDACTED_APP_PASSWORD
expr: node_filesystem_readonly{fstype!~"tmpfs|overlay"} == 1
for: 1m
labels:
severity: critical
annotations:
summary: "🔥 Filesystem is read-only on {{ $labels.instance }}"
description: "Filesystem {{ $labels.mountpoint }} has become read-only. This usually indicates disk failure!"
  # Network alerts: loopback and container/bridge virtual interfaces
  # (veth*, docker*, br-*) are excluded to avoid container churn noise.
  - name: network-alerts
    interval: 30s
    rules:
      - alert: HostNetworkReceiveErrors
        expr: rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network receive errors on {{ $labels.instance }}"
          description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} receive errors/sec."
      - alert: HostNetworkTransmitErrors
        expr: rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Network transmit errors on {{ $labels.instance }}"
          description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} transmit errors/sec."
  # System-level sanity checks.
  - name: system-alerts
    interval: 60s
    rules:
      # NTP offset beyond 0.5s; clock skew breaks TLS, logs and metrics.
      - alert: HostClockSkew
        expr: abs(node_timex_offset_seconds) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Clock skew detected on {{ $labels.instance }}"
          description: "Clock is off by {{ $value | printf \"%.2f\" }} seconds."

View File

@@ -0,0 +1,49 @@
# Alertmanager Configuration for Homelab
# Routes alerts to both ntfy (via bridge) and Signal
global:
  resolve_timeout: 5m

route:
  # Group per alert/severity/host so repeats collapse into one notification.
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  # Default receiver for anything the child routes below don't catch.
  receiver: 'ntfy-all'
  routes:
    # Critical alerts go to both Signal AND ntfy
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: false
    # Warning alerts go to ntfy only
    - match:
        severity: warning
      receiver: 'ntfy-all'

receivers:
  # ntfy receiver for all alerts (via bridge for nice formatting)
  - name: 'ntfy-all'
    webhook_configs:
      - url: 'http://ntfy-bridge:5001/alert'
        send_resolved: true
  # Critical alerts: Signal + ntfy
  - name: 'critical-alerts'
    webhook_configs:
      # ntfy via bridge (formatted nicely)
      - url: 'http://ntfy-bridge:5001/alert'
        send_resolved: true
      # Signal via bridge service
      - url: 'http://signal-bridge:5000/alert'
        send_resolved: true

# While a critical alert fires, suppress the matching warning-level alert
# for the same alertname/instance pair.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

View File

@@ -0,0 +1,68 @@
# Alerting Stack for Homelab
services:
  # Prometheus Alertmanager: receives alerts from Prometheus and routes
  # them to the two bridge services below.
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager
      - alertmanager-data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      # Joined to the external stacks so the bridges and Prometheus are
      # reachable by container name.
      - monitoring-stack_default
      - signal-api-stack_default
      - ntfy-stack_default
  # Webhook-to-Signal relay (see signal-bridge/app.py).
  signal-bridge:
    build: ./signal-bridge
    container_name: signal-bridge
    restart: unless-stopped
    ports:
      - "5000:5000"
    environment:
      - SIGNAL_API_URL=http://signal-api:8080
      - SIGNAL_SENDER=REDACTED_PHONE_NUMBER
      - SIGNAL_RECIPIENTS=REDACTED_PHONE_NUMBER
    networks:
      - monitoring-stack_default
      - signal-api-stack_default
    healthcheck:
      # Probes the Flask /health endpoint using only the stdlib.
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
ntfy-bridge:
build: ./ntfy-bridge
container_name: ntfy-bridge
restart: unless-stopped
ports:
- "5001:5001"
environment:
- NTFY_URL=http://NTFY:80
- NTFY_TOPIC="REDACTED_NTFY_TOPIC"
networks:
- monitoring-stack_default
- ntfy-stack_default
healthcheck:
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
interval: 30s
timeout: 10s
retries: 3
volumes:
  # Persists Alertmanager silences and notification state across restarts.
  alertmanager-data:

networks:
  # All three networks are created and owned by the other compose stacks.
  monitoring-stack_default:
    external: true
  signal-api-stack_default:
    external: true
  ntfy-stack_default:
    external: true

View File

@@ -0,0 +1,5 @@
FROM python:3.11-slim

WORKDIR /app

# Single layer for dependencies; --no-cache-dir keeps the image small.
RUN pip install --no-cache-dir flask requests gunicorn

COPY app.py .

# Declare the bridge port (compose maps 5001:5001); added for consistency
# with the sibling signal-bridge Dockerfile, which declares EXPOSE.
EXPOSE 5001

CMD ["gunicorn", "--bind", "0.0.0.0:5001", "--workers", "2", "app:app"]

104
alerting/ntfy-bridge/app.py Normal file
View File

@@ -0,0 +1,104 @@
from flask import Flask, request, jsonify
import requests
import os

app = Flask(__name__)

# ntfy endpoint configuration; defaults match the compose service wiring.
# NOTE(review): the default NTFY_URL host is the uppercase "NTFY" --
# confirm this matches the actual ntfy container/hostname.
NTFY_URL = os.environ.get('NTFY_URL', 'http://NTFY:80')
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', 'homelab-alerts')
def get_status_icon(severity, status):
    """Pick the ntfy tag (emoji shortcode) for a notification.

    Resolved alerts always get a check mark; firing alerts get a siren
    for critical severity and a warning sign for everything else.
    """
    if status == 'resolved':
        return 'white_check_mark'
    return 'rotating_light' if severity == 'critical' else 'warning'
def get_priority(severity, status):
    """Map severity/status onto an ntfy priority string (1-5 scale).

    Resolved -> '3' (default), critical -> '5' (max), otherwise '4' (high).
    """
    if status == 'resolved':
        return '3'
    return '5' if severity == 'critical' else '4'
def format_alert(alert):
    """Turn one Alertmanager alert dict into notification fields.

    Returns a (title, body, severity, status) tuple.  The body joins the
    summary, the description (only when it adds information beyond the
    summary) and the host line, skipping any piece that is missing.
    """
    status = alert.get('status', 'firing')
    labels = alert.get('labels', {})
    annotations = alert.get('annotations', {})

    severity = labels.get('severity', 'warning')
    instance = labels.get('instance', 'unknown')
    state = 'RESOLVED' if status == 'resolved' else 'FIRING'
    title = f"{labels.get('alertname', 'Unknown Alert')} [{state}]"

    summary = annotations.get('summary', '')
    description = annotations.get('description', '')

    pieces = []
    if summary:
        pieces.append(summary)
    if description and description != summary:
        pieces.append(description)
    if instance and instance != 'unknown':
        pieces.append(f"Host: {instance}")

    if pieces:
        body = '\n'.join(pieces)
    else:
        body = f"Alert {state.lower()} on {instance}"
    return title, body, severity, status
@app.route('/alert', methods=['POST'])
def handle_alert():
    """Receive an Alertmanager webhook and forward each alert to ntfy.

    Returns 200 with {'status': 'sent', 'count': N} after processing all
    alerts (failed individual sends are logged, not fatal), or 500 with
    an error payload when processing blows up.
    """
    try:
        data = request.json
        alerts = data.get('alerts', [])
        for alert in alerts:
            title, body, severity, status = format_alert(alert)
            priority = get_priority(severity, status)
            tag = get_status_icon(severity, status)
            response = requests.post(
                f"{NTFY_URL}/{NTFY_TOPIC}",
                # Encode explicitly: http.client encodes str bodies as
                # latin-1, which raises UnicodeEncodeError on the emoji
                # used in the alert-rule descriptions (e.g. "🔥").
                data=body.encode('utf-8'),
                headers={
                    'Title': title,
                    'Priority': priority,
                    'Tags': tag
                },
                # Without a timeout a stuck ntfy server hangs this worker
                # forever and stalls Alertmanager's webhook delivery.
                timeout=10
            )
            if response.status_code not in [200, 201]:
                print(f"Failed to send to ntfy: {response.status_code} - {response.text}")
        return jsonify({'status': 'sent', 'count': len(alerts)})
    except Exception as e:
        print(f"Error: {e}")
        return jsonify({'status': 'error', 'message': str(e)}), 500
@app.route('/health', methods=['GET'])
def health():
    # Liveness probe hit by the compose healthcheck.
    return jsonify({'status': 'healthy'})
@app.route('/test', methods=['POST'])
def test():
    """Send a test notification to the configured ntfy topic.

    Accepts an optional JSON body {'message': '...'}; falls back to a
    default message.  Returns 200 on delivery, 502 when ntfy rejects the
    publish, 500 on any other error.
    """
    try:
        data = request.json or {}
        message = data.get('message', 'Test notification from ntfy-bridge')
        response = requests.post(
            f"{NTFY_URL}/{NTFY_TOPIC}",
            # Encode explicitly: http.client encodes str bodies as latin-1,
            # which fails on any non-latin-1 characters in the message.
            data=message.encode('utf-8'),
            headers={
                'Title': 'Test Alert',
                'Priority': '4',
                'Tags': 'test_tube'
            },
            # Bound the wait so a stuck ntfy server cannot hang the worker.
            timeout=10
        )
        # Surface delivery failures instead of silently reporting success
        # (the original ignored the response entirely).
        if response.status_code not in [200, 201]:
            return jsonify({'status': 'error',
                            'message': f"ntfy returned {response.status_code}"}), 502
        return jsonify({'status': 'sent'})
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
if __name__ == '__main__':
    # Dev server only; in the container gunicorn serves app:app (Dockerfile).
    app.run(host='0.0.0.0', port=5001)

View File

@@ -0,0 +1,11 @@
FROM python:3.11-slim
WORKDIR /app
RUN pip install --no-cache-dir flask requests gunicorn
COPY app.py .
EXPOSE 5000
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "60", "app:app"]

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Signal Bridge for Alertmanager
Receives webhooks from Alertmanager and forwards to Signal API
"""
import os
import json
import requests
from flask import Flask, request, jsonify

app = Flask(__name__)

# Configuration from environment variables
SIGNAL_API_URL = os.environ.get('SIGNAL_API_URL', 'http://signal-api:8080')
SIGNAL_SENDER = os.environ.get('SIGNAL_SENDER', '')  # Your Signal number
# Comma-separated recipient numbers.  NOTE(review): an unset variable
# yields [''] (a single empty entry), not an empty list.
SIGNAL_RECIPIENTS = os.environ.get('SIGNAL_RECIPIENTS', '').split(',')
def format_alert_message(alert_data):
    """Render an Alertmanager webhook payload as human-readable text.

    Each alert becomes "<emoji> [STATUS] <summary>" with the description
    on a following line when present; alerts are joined by blank lines.
    """
    top_status = alert_data.get('status', 'unknown')
    rendered = []
    for entry in alert_data.get('alerts', []):
        lbl = entry.get('labels', {})
        ann = entry.get('annotations', {})
        name = lbl.get('alertname', 'Unknown Alert')

        # Per-alert status wins over the payload-level status.
        state = entry.get('status', top_status)
        if state == 'resolved':
            emoji, tag = '', 'RESOLVED'
        elif lbl.get('severity', 'unknown') == 'critical':
            emoji, tag = '🚨', 'CRITICAL'
        else:
            emoji, tag = '⚠️', 'WARNING'

        text = f"{emoji} [{tag}] {ann.get('summary', name)}"
        detail = ann.get('description', '')
        if detail:
            text = f"{text}\n{detail}"
        rendered.append(text)
    return "\n\n".join(rendered)
def send_signal_message(message):
    """Send message via Signal API.

    Returns True only when at least one recipient is configured and every
    configured recipient was delivered to successfully; False otherwise.
    """
    # Strip/filter first: SIGNAL_RECIPIENTS defaults to ''.split(',') ==
    # [''], which is truthy and previously slipped past the configuration
    # guard, making the function report success without sending anything.
    recipients = [r.strip() for r in SIGNAL_RECIPIENTS if r.strip()]
    if not SIGNAL_SENDER or not recipients:
        app.logger.error("Signal sender or recipients not configured")
        return False
    success = True
    for recipient in recipients:
        try:
            payload = {
                "message": message,
                "number": SIGNAL_SENDER,
                "recipients": [recipient]
            }
            response = requests.post(
                f"{SIGNAL_API_URL}/v2/send",
                json=payload,
                timeout=30
            )
            if response.status_code in [200, 201]:
                app.logger.info(f"Message sent to {recipient}")
            else:
                app.logger.error(f"Failed to send to {recipient}: {response.status_code} - {response.text}")
                success = False
        except Exception as e:
            app.logger.error(f"Error sending to {recipient}: {e}")
            success = False
    return success
@app.route('/health', methods=['GET'])
def health():
    # Liveness probe hit by the compose healthcheck.
    return jsonify({"status": "healthy"}), 200
@app.route('/alert', methods=['POST'])
def receive_alert():
    """Alertmanager webhook endpoint: format the payload and relay to Signal.

    Responds 200 when every recipient succeeded, 207 on partial failure,
    400 for an empty payload and 500 on unexpected errors.
    """
    try:
        payload = request.get_json()
        if not payload:
            return jsonify({"error": "No data received"}), 400
        app.logger.info(f"Received alert: {json.dumps(payload, indent=2)}")
        text = format_alert_message(payload)
        if send_signal_message(text):
            return jsonify({"status": "sent"}), 200
        return jsonify({"status": "partial_failure"}), 207
    except Exception as e:
        app.logger.error(f"Error processing alert: {e}")
        return jsonify({"error": str(e)}), 500
@app.route('/test', methods=['POST'])
def test_message():
    """Send a test message to all configured Signal recipients.

    Accepts an optional JSON body {"message": "..."}.  Uses
    get_json(silent=True) because request.json is None for non-JSON
    POSTs, and the previous `request.json.get(...)` crashed with an
    AttributeError (the ntfy-bridge /test already guards this).
    """
    data = request.get_json(silent=True) or {}
    message = data.get('message', '🧪 Test alert from Signal Bridge')
    if send_signal_message(message):
        return jsonify({"status": "sent"}), 200
    else:
        return jsonify({"status": "failed"}), 500
if __name__ == '__main__':
    # Dev server only; in the container gunicorn serves app:app (Dockerfile).
    app.run(host='0.0.0.0', port=5000)