Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC
This commit is contained in:
284
hosts/vms/homelab-vm/alerting.yaml
Normal file
284
hosts/vms/homelab-vm/alerting.yaml
Normal file
@@ -0,0 +1,284 @@
|
||||
# Alerting Stack - Alertmanager + Notification Bridges
|
||||
# =============================================================================
|
||||
# Dual-channel alerting: ntfy (mobile push) + Signal (encrypted messaging)
|
||||
# =============================================================================
|
||||
# Deployed via: Portainer GitOps
|
||||
# Ports: 9093 (Alertmanager), 5000 (signal-bridge), 5001 (ntfy-bridge)
|
||||
#
|
||||
# Alert Routing:
|
||||
# - Warning alerts → ntfy only
|
||||
# - Critical alerts → ntfy + Signal
|
||||
# - Resolved alerts → same channel(s) as the firing alert (ntfy for warning; ntfy + Signal for critical)
|
||||
#
|
||||
# Uses docker configs to embed Python bridge apps since Portainer GitOps
|
||||
# doesn't support docker build
|
||||
|
||||
configs:
|
||||
# Alertmanager Configuration
|
||||
alertmanager_config:
|
||||
content: |
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'severity', 'instance']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
receiver: 'ntfy-all'
|
||||
|
||||
routes:
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical-alerts'
|
||||
continue: false
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'ntfy-all'
|
||||
|
||||
receivers:
|
||||
- name: 'ntfy-all'
|
||||
webhook_configs:
|
||||
- url: 'http://ntfy-bridge:5001/alert'
|
||||
send_resolved: true
|
||||
|
||||
- name: 'critical-alerts'
|
||||
webhook_configs:
|
||||
- url: 'http://ntfy-bridge:5001/alert'
|
||||
send_resolved: true
|
||||
- url: 'http://signal-bridge:5000/alert'
|
||||
send_resolved: true
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'instance']
|
||||
|
||||
# ntfy-bridge Python App
|
||||
ntfy_bridge_app:
|
||||
content: |
|
||||
from flask import Flask, request, jsonify
|
||||
import requests
|
||||
import os
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Base URL of the ntfy server; overridable through the NTFY_URL environment
# variable. handle_alert appends "/<topic>" to it when publishing.
NTFY_URL = os.environ.get('NTFY_URL', 'http://NTFY:80')
# ntfy topic that alerts are published to; overridable through NTFY_TOPIC.
NTFY_TOPIC = os.environ.get('NTFY_TOPIC', 'homelab-alerts')
|
||||
|
||||
def get_priority(severity, status):
    """Return the value used for the ntfy ``Priority`` header.

    Resolved alerts always map to '3'. Firing alerts map to '5' when the
    severity is 'critical' and to '4' for any other severity.
    """
    if status != 'resolved':
        return '5' if severity == 'critical' else '4'
    return '3'
|
||||
|
||||
def get_tag(severity, status):
    """Return the value used for the ntfy ``Tags`` header.

    Resolved alerts map to 'white_check_mark'. Firing alerts map to
    'rotating_light' when critical and to 'warning' otherwise.
    """
    if status != 'resolved':
        return 'rotating_light' if severity == 'critical' else 'warning'
    return 'white_check_mark'
|
||||
|
||||
def format_alert(alert):
    """Convert one alert dict into the pieces of an ntfy notification.

    Args:
        alert: A single entry of the webhook payload's 'alerts' list. The
            keys read are 'status', 'labels' and 'annotations'; each one
            falls back to a default when absent.

    Returns:
        A ``(title, body, severity, status)`` tuple. The title is
        "<alertname> [FIRING|RESOLVED]". The body joins, one per line, the
        summary, the description (only when it differs from the summary)
        and a "Host: <instance>" line (only when an instance label is
        present); if none of those apply it is "Alert firing" or
        "Alert resolved".
    """
    state = alert.get('status', 'firing')
    label_map = alert.get('labels', {})
    notes = alert.get('annotations', {})

    name = label_map.get('alertname', 'Unknown')
    level = label_map.get('severity', 'warning')
    host = label_map.get('instance', 'unknown')

    if state == 'resolved':
        status_text = 'RESOLVED'
    else:
        status_text = 'FIRING'
    heading = f"{name} [{status_text}]"

    short = notes.get('summary', '')
    detail = notes.get('description', '')

    lines = [short] if short else []
    if detail and detail != short:
        lines.append(detail)
    if host != 'unknown':
        lines.append(f"Host: {host}")

    # Fall back to a generic one-liner when there is nothing to show.
    if not lines:
        return heading, f"Alert {status_text.lower()}", level, state
    return heading, '\n'.join(lines), level, state
|
||||
|
||||
@app.route('/alert', methods=['POST'])
def handle_alert():
    """Receive a webhook payload and publish each contained alert to ntfy.

    Every entry of the payload's 'alerts' list is formatted with
    format_alert and POSTed to ``{NTFY_URL}/{NTFY_TOPIC}`` with Title,
    Priority and Tags headers.

    Returns:
        200 ``{'status': 'sent', 'count': N}`` when every publish succeeded.
        400 when the request body is not a JSON object.
        500 ``{'status': 'error', 'message': ...}`` when publishing fails
        (connection error, timeout, or an error status from ntfy), so the
        caller does not mistake a failed delivery for a successful one.
    """
    try:
        # silent=True yields None instead of raising on a missing or
        # malformed JSON body, which lets us answer with a clear 400.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'status': 'error', 'message': 'No data'}), 400

        alerts = data.get('alerts', [])
        for alert in alerts:
            title, body, severity, status = format_alert(alert)
            response = requests.post(
                f"{NTFY_URL}/{NTFY_TOPIC}",
                # Encode explicitly so non-ASCII alert text is always sent
                # as UTF-8 regardless of the HTTP stack's default.
                data=body.encode('utf-8'),
                headers={
                    'Title': title,
                    'Priority': get_priority(severity, status),
                    'Tags': get_tag(severity, status),
                },
                # Without a timeout a stalled ntfy server would block this
                # worker indefinitely.
                timeout=10,
            )
            # Surface 4xx/5xx answers from ntfy instead of reporting "sent".
            response.raise_for_status()
        return jsonify({'status': 'sent', 'count': len(alerts)})
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health():
    """Liveness endpoint: always answers with a JSON 'healthy' status."""
    payload = {'status': 'healthy'}
    return jsonify(payload)
|
||||
|
||||
# Direct-run entry point: serves the app with Flask's built-in server on all
# interfaces, port 5001. Only used when the file is executed as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5001)
|
||||
|
||||
# signal-bridge Python App
|
||||
signal_bridge_app:
|
||||
content: |
|
||||
import os
|
||||
import requests
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Base URL of the Signal REST API that send_signal_message posts to;
# overridable through the SIGNAL_API_URL environment variable.
SIGNAL_API_URL = os.environ.get('SIGNAL_API_URL', 'http://signal-api:8080')
# Sender number passed as "number" in each send request; empty when unset.
SIGNAL_SENDER = os.environ.get('SIGNAL_SENDER', '')
# Comma-separated recipient list from the environment, parsed into a list of
# stripped, non-empty entries. A plain ''.split(',') would yield [''], which
# is truthy and would defeat any "no recipients configured" check.
SIGNAL_RECIPIENTS = [
    entry.strip()
    for entry in os.environ.get('SIGNAL_RECIPIENTS', '').split(',')
    if entry.strip()
]
|
||||
|
||||
def format_alert_message(alert_data):
    """Render every alert of a webhook payload as one Signal text message.

    Args:
        alert_data: The webhook payload; only its 'alerts' list is read.

    Returns:
        One string with a paragraph per alert, separated by blank lines.
        Each paragraph is "<emoji> [<STATE>] <summary>" followed, when a
        description annotation exists, by the description on its own line.
        The summary falls back to the alertname label, then to 'Alert'.
        Returns an empty string when there are no alerts.
    """
    def render(alert):
        state = alert.get('status', 'firing')
        label_map = alert.get('labels', {})
        notes = alert.get('annotations', {})
        level = label_map.get('severity', 'warning')
        headline = notes.get('summary', label_map.get('alertname', 'Alert'))
        detail = notes.get('description', '')

        # A resolved status wins over severity when picking the badge.
        if state == 'resolved':
            badge = ('✅', 'RESOLVED')
        elif level == 'critical':
            badge = ('🚨', 'CRITICAL')
        else:
            badge = ('⚠️', 'WARNING')
        emoji, text = badge

        line = f"{emoji} [{text}] {headline}"
        return f"{line}\n{detail}" if detail else line

    return "\n\n".join(render(alert) for alert in alert_data.get('alerts', []))
|
||||
|
||||
def send_signal_message(message):
    """Send ``message`` to every configured Signal recipient.

    One POST to ``{SIGNAL_API_URL}/v2/send`` is made per recipient, so a
    failure for one recipient does not stop delivery to the others.

    Args:
        message: Text to send.

    Returns:
        True only if at least one recipient is configured and every send
        returned HTTP 200 or 201. False when the sender or the recipient
        list is missing or blank, or when any send failed.
    """
    # Drop blank entries first: an unset variable split on ',' gives [''],
    # which is truthy and would otherwise skip every recipient in the loop
    # and still report success.
    recipients = [entry.strip() for entry in SIGNAL_RECIPIENTS if entry.strip()]
    if not SIGNAL_SENDER or not recipients:
        return False

    success = True
    for recipient in recipients:
        try:
            response = requests.post(f"{SIGNAL_API_URL}/v2/send", json={
                "message": message, "number": SIGNAL_SENDER, "recipients": [recipient]
            }, timeout=30)
        except Exception:
            # Best effort: record the failure and keep going with the
            # remaining recipients.
            success = False
            continue
        if response.status_code not in (200, 201):
            success = False
    return success
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health():
    """Liveness endpoint: always answers with a JSON 'healthy' status."""
    payload = {"status": "healthy"}
    return jsonify(payload)
|
||||
|
||||
@app.route('/alert', methods=['POST'])
def receive_alert():
    """Receive a webhook payload and forward it to Signal as one message.

    Returns:
        200 ``{"status": "sent"}`` when send_signal_message reports success.
        207 ``{"status": "partial_failure"}`` when it reports failure.
        400 ``{"error": "No data"}`` when the JSON body is empty or missing.
        500 ``{"error": ...}`` when any exception is raised while handling.
    """
    try:
        payload = request.get_json()
        if payload:
            delivered = send_signal_message(format_alert_message(payload))
            if not delivered:
                # NOTE(review): 207 is still a 2xx status, so a caller that
                # only retries on non-2xx will treat this as delivered.
                # Confirm this is intended.
                return jsonify({"status": "partial_failure"}), 207
            return jsonify({"status": "sent"})
        return jsonify({"error": "No data"}), 400
    except Exception as e:
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
# Direct-run entry point: serves the app with Flask's built-in server on all
# interfaces, port 5000. Only used when the file is executed as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
|
||||
|
||||
services:
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: alertmanager
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9093:9093"
|
||||
configs:
|
||||
- source: alertmanager_config
|
||||
target: /etc/alertmanager/alertmanager.yml
|
||||
volumes:
|
||||
- alertmanager-data:/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--web.external-url=http://localhost:9093'
|
||||
networks:
|
||||
- alerting
|
||||
- monitoring-stack_monitoring
|
||||
|
||||
ntfy-bridge:
|
||||
image: python:3.11-slim
|
||||
container_name: ntfy-bridge
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "5001:5001"
|
||||
environment:
  - NTFY_URL=http://NTFY:80
  # No quotes around the value: in list-form KEY=VALUE entries the quotes
  # would become part of the topic name used in the publish URL.
  - NTFY_TOPIC=REDACTED_NTFY_TOPIC
|
||||
configs:
|
||||
- source: ntfy_bridge_app
|
||||
target: /app/app.py
|
||||
command: >
|
||||
sh -c "pip install --quiet flask requests gunicorn &&
|
||||
cd /app && gunicorn --bind 0.0.0.0:5001 --workers 2 app:app"
|
||||
networks:
|
||||
- alerting
|
||||
- ntfy-stack_default
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
signal-bridge:
|
||||
image: python:3.11-slim
|
||||
container_name: signal-bridge
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "5000:5000"
|
||||
environment:
|
||||
- SIGNAL_API_URL=http://signal-api:8080
|
||||
- SIGNAL_SENDER=REDACTED_PHONE_NUMBER
|
||||
- SIGNAL_RECIPIENTS=REDACTED_PHONE_NUMBER
|
||||
configs:
|
||||
- source: signal_bridge_app
|
||||
target: /app/app.py
|
||||
command: >
|
||||
sh -c "pip install --quiet flask requests gunicorn &&
|
||||
cd /app && gunicorn --bind 0.0.0.0:5000 --workers 2 app:app"
|
||||
networks:
|
||||
- alerting
|
||||
- signal-api-stack_default
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
volumes:
|
||||
alertmanager-data:
|
||||
|
||||
networks:
|
||||
alerting:
|
||||
driver: bridge
|
||||
monitoring-stack_monitoring:
|
||||
external: true
|
||||
ntfy-stack_default:
|
||||
external: true
|
||||
signal-api-stack_default:
|
||||
external: true
|
||||
Reference in New Issue
Block a user