Files
homelab-optimized/hosts/vms/homelab-vm/alerting.yaml
Gitea Mirror Bot d6eb5dcb1e
Some checks failed
Documentation / Build Docusaurus (push) Failing after 18m8s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-19 07:39:14 UTC
2026-04-19 07:39:14 +00:00

285 lines
9.0 KiB
YAML

# Alerting Stack - Alertmanager + Notification Bridges
# =============================================================================
# Dual-channel alerting: ntfy (mobile push) + Signal (encrypted messaging)
# =============================================================================
# Deployed via: Portainer GitOps
# Ports: 9093 (Alertmanager), 5000 (signal-bridge), 5001 (ntfy-bridge)
#
# Alert Routing:
# - Warning alerts → ntfy only
# - Critical alerts → ntfy + Signal
# - Resolved alerts → same channel(s) as the firing alert (ntfy for warning, ntfy + Signal for critical)
#
# Uses docker configs to embed Python bridge apps since Portainer GitOps
# doesn't support docker build
configs:
  # Alertmanager Configuration
  # Mounted into the alertmanager service as /etc/alertmanager/alertmanager.yml.
  alertmanager_config:
    content: |
      global:
        resolve_timeout: 5m

      route:
        group_by: ['alertname', 'severity', 'instance']
        group_wait: 30s
        group_interval: 5m
        repeat_interval: 4h
        # Default receiver: anything not matched below goes to ntfy only.
        receiver: 'ntfy-all'
        routes:
          # Critical alerts fan out to both ntfy and Signal.
          - match:
              severity: critical
            receiver: 'critical-alerts'
            continue: false
          # Warning alerts go to ntfy only.
          - match:
              severity: warning
            receiver: 'ntfy-all'

      receivers:
        - name: 'ntfy-all'
          webhook_configs:
            - url: 'http://ntfy-bridge:5001/alert'
              send_resolved: true
        - name: 'critical-alerts'
          webhook_configs:
            - url: 'http://ntfy-bridge:5001/alert'
              send_resolved: true
            - url: 'http://signal-bridge:5000/alert'
              send_resolved: true

      # Suppress a warning while a critical alert with the same
      # alertname and instance is firing.
      inhibit_rules:
        - source_match:
            severity: 'critical'
          target_match:
            severity: 'warning'
          equal: ['alertname', 'instance']

  # ntfy-bridge Python App
  # Mounted into the ntfy-bridge service as /app/app.py.
  ntfy_bridge_app:
    content: |
      """Webhook bridge: receives Alertmanager payloads on /alert and
      publishes one ntfy message per alert."""
      from flask import Flask, request, jsonify
      import requests
      import os

      app = Flask(__name__)

      NTFY_URL = os.environ.get('NTFY_URL', 'http://NTFY:80')
      NTFY_TOPIC = os.environ.get('NTFY_TOPIC', 'homelab-alerts')
      # Seconds to wait for ntfy before giving up, so a stalled server
      # cannot block a worker indefinitely.
      NTFY_TIMEOUT = 10


      def get_priority(severity, status):
          """Return the ntfy Priority header value for an alert."""
          if status == 'resolved':
              return '3'
          if severity == 'critical':
              return '5'
          return '4'


      def get_tag(severity, status):
          """Return the ntfy Tags header value for an alert."""
          if status == 'resolved':
              return 'white_check_mark'
          if severity == 'critical':
              return 'rotating_light'
          return 'warning'


      def format_alert(alert):
          """Build (title, body, severity, status) from one alert dict."""
          status = alert.get('status', 'firing')
          labels = alert.get('labels', {})
          annotations = alert.get('annotations', {})
          alertname = labels.get('alertname', 'Unknown')
          severity = labels.get('severity', 'warning')
          instance = labels.get('instance', 'unknown')
          status_text = 'RESOLVED' if status == 'resolved' else 'FIRING'
          title = f"{alertname} [{status_text}]"
          summary = annotations.get('summary', '')
          description = annotations.get('description', '')
          body_parts = []
          if summary:
              body_parts.append(summary)
          # Skip the description when it merely repeats the summary.
          if description and description != summary:
              body_parts.append(description)
          if instance != 'unknown':
              body_parts.append(f"Host: {instance}")
          body = '\n'.join(body_parts) if body_parts else f"Alert {status_text.lower()}"
          return title, body, severity, status


      @app.route('/alert', methods=['POST'])
      def handle_alert():
          """Publish every alert in the posted payload to ntfy.

          Returns 400 when no JSON payload is supplied and 500 when
          publishing fails (including a non-2xx answer from ntfy).
          """
          data = request.get_json(silent=True)
          if not data:
              return jsonify({'status': 'error', 'message': 'No data'}), 400
          try:
              alerts = data.get('alerts', [])
              for alert in alerts:
                  title, body, severity, status = format_alert(alert)
                  response = requests.post(
                      f"{NTFY_URL}/{NTFY_TOPIC}",
                      # Send bytes: a str body is encoded as latin-1 by the
                      # HTTP stack and fails on other characters.
                      data=body.encode('utf-8'),
                      headers={
                          'Title': title,
                          'Priority': get_priority(severity, status),
                          'Tags': get_tag(severity, status),
                      },
                      timeout=NTFY_TIMEOUT,
                  )
                  # Do not report "sent" when ntfy rejected the message.
                  response.raise_for_status()
              return jsonify({'status': 'sent', 'count': len(alerts)})
          except Exception as e:
              return jsonify({'status': 'error', 'message': str(e)}), 500


      @app.route('/health', methods=['GET'])
      def health():
          """Liveness endpoint used by the container healthcheck."""
          return jsonify({'status': 'healthy'})


      if __name__ == '__main__':
          app.run(host='0.0.0.0', port=5001)

  # signal-bridge Python App
  # Mounted into the signal-bridge service as /app/app.py.
  signal_bridge_app:
    content: |
      """Webhook bridge: receives Alertmanager payloads on /alert and
      forwards them as one Signal message per recipient."""
      import os
      import requests
      from flask import Flask, request, jsonify

      app = Flask(__name__)

      SIGNAL_API_URL = os.environ.get('SIGNAL_API_URL', 'http://signal-api:8080')
      SIGNAL_SENDER = os.environ.get('SIGNAL_SENDER', '')
      # Drop blank entries up front: ''.split(',') yields [''], which is
      # truthy and would make an empty recipient list look configured.
      SIGNAL_RECIPIENTS = [
          entry.strip()
          for entry in os.environ.get('SIGNAL_RECIPIENTS', '').split(',')
          if entry.strip()
      ]


      def format_alert_message(alert_data):
          """Render all alerts in the payload into one text message."""
          messages = []
          for alert in alert_data.get('alerts', []):
              status = alert.get('status', 'firing')
              labels = alert.get('labels', {})
              annotations = alert.get('annotations', {})
              severity = labels.get('severity', 'warning')
              # Fall back to the alert name when no summary is annotated.
              summary = annotations.get('summary', labels.get('alertname', 'Alert'))
              description = annotations.get('description', '')
              if status == 'resolved':
                  emoji, text = '✅', 'RESOLVED'
              elif severity == 'critical':
                  emoji, text = '🚨', 'CRITICAL'
              else:
                  emoji, text = '⚠️', 'WARNING'
              msg = f"{emoji} [{text}] {summary}"
              if description:
                  msg += f"\n{description}"
              messages.append(msg)
          return "\n\n".join(messages)


      def send_signal_message(message):
          """Send message to every configured recipient.

          Returns True only when a sender and at least one recipient are
          configured and every send returned HTTP 200 or 201.
          """
          if not SIGNAL_SENDER or not SIGNAL_RECIPIENTS:
              app.logger.error("Signal sender or recipients not configured")
              return False
          success = True
          for recipient in SIGNAL_RECIPIENTS:
              try:
                  response = requests.post(f"{SIGNAL_API_URL}/v2/send", json={
                      "message": message, "number": SIGNAL_SENDER, "recipients": [recipient]
                  }, timeout=30)
                  if response.status_code not in [200, 201]:
                      app.logger.error("Signal API returned HTTP %s", response.status_code)
                      success = False
              except Exception:
                  # Keep going so one failing recipient does not block the rest.
                  app.logger.exception("Signal send failed")
                  success = False
          return success


      @app.route('/health', methods=['GET'])
      def health():
          """Liveness endpoint used by the container healthcheck."""
          return jsonify({"status": "healthy"})


      @app.route('/alert', methods=['POST'])
      def receive_alert():
          """Forward the posted Alertmanager payload to Signal."""
          try:
              alert_data = request.get_json()
              if not alert_data:
                  return jsonify({"error": "No data"}), 400
              message = format_alert_message(alert_data)
              if send_signal_message(message):
                  return jsonify({"status": "sent"})
              # NOTE(review): 207 is a 2xx status, so the caller will
              # presumably treat this as delivered and not retry - confirm
              # that is the intended behaviour.
              return jsonify({"status": "partial_failure"}), 207
          except Exception as e:
              return jsonify({"error": str(e)}), 500


      if __name__ == '__main__':
          app.run(host='0.0.0.0', port=5000)

services:
  # Alertmanager: routes alerts to the two webhook bridges below.
  alertmanager:
    # NOTE(review): ":latest" is unpinned; consider pinning a release tag
    # so redeploys are reproducible.
    image: prom/alertmanager:latest
    container_name: alertmanager
    restart: unless-stopped
    ports:
      - "9093:9093"
    configs:
      - source: alertmanager_config
        target: /etc/alertmanager/alertmanager.yml
    volumes:
      - alertmanager-data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - alerting
      - monitoring-stack_monitoring

  # ntfy-bridge: Flask app (from configs.ntfy_bridge_app) served by gunicorn.
  ntfy-bridge:
    image: python:3.11-slim
    container_name: ntfy-bridge
    restart: unless-stopped
    ports:
      - "5001:5001"
    environment:
      - NTFY_URL=http://NTFY:80
      # No quotes around the value: in list form they would become part of
      # the variable and end up inside the topic path.
      - NTFY_TOPIC=REDACTED_NTFY_TOPIC
    configs:
      - source: ntfy_bridge_app
        target: /app/app.py
    # Dependencies are installed at start-up because the image is not built.
    command: >
      sh -c "pip install --quiet flask requests gunicorn &&
      cd /app && gunicorn --bind 0.0.0.0:5001 --workers 2 app:app"
    networks:
      - alerting
      - ntfy-stack_default
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Allow time for the pip install above before failures count.
      start_period: 60s

  # signal-bridge: Flask app (from configs.signal_bridge_app) served by gunicorn.
  signal-bridge:
    image: python:3.11-slim
    container_name: signal-bridge
    restart: unless-stopped
    ports:
      - "5000:5000"
    environment:
      - SIGNAL_API_URL=http://signal-api:8080
      - SIGNAL_SENDER=REDACTED_PHONE_NUMBER
      - SIGNAL_RECIPIENTS=REDACTED_PHONE_NUMBER
    configs:
      - source: signal_bridge_app
        target: /app/app.py
    # Dependencies are installed at start-up because the image is not built.
    command: >
      sh -c "pip install --quiet flask requests gunicorn &&
      cd /app && gunicorn --bind 0.0.0.0:5000 --workers 2 app:app"
    networks:
      - alerting
      - signal-api-stack_default
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Allow time for the pip install above before failures count.
      start_period: 60s

volumes:
  # Named volume mounted at /alertmanager (Alertmanager's --storage.path).
  alertmanager-data: {}

networks:
  # Stack-private network shared by Alertmanager and both bridges.
  alerting:
    driver: bridge

  # Networks below are declared external, so they must already exist
  # before this stack is deployed.
  monitoring-stack_monitoring:
    external: true

  ntfy-stack_default:
    external: true

  signal-api-stack_default:
    external: true