Sanitized mirror from private repository - 2026-03-20 09:49:40 UTC
Some checks failed
Documentation / Build Docusaurus (push) Failing after 18m5s
Documentation / Deploy to GitHub Pages (push) Has been skipped

This commit is contained in:
Gitea Mirror Bot
2026-03-20 09:49:40 +00:00
commit 3cb5034cc6
1231 changed files with 305915 additions and 0 deletions

602
docs/admin/monitoring.md Normal file
View File

@@ -0,0 +1,602 @@
# 📊 Monitoring & Observability Guide
## Overview
This guide covers the complete monitoring stack for the homelab, including metrics collection, visualization, alerting, and log management.
---
## 🏗️ Monitoring Architecture
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ MONITORING STACK │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │◄───│ Node │ │ SNMP │ │ cAdvisor │ │
│ │ (Metrics) │ │ Exporter │ │ Exporter │ │ (Containers)│ │
│ └──────┬──────┘ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ Grafana │ │ Alertmanager│──► ntfy / Signal / Email │
│ │ (Dashboard) │ │ (Alerts) │ │
│ └─────────────┘ └─────────────┘ │
│ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ Uptime Kuma │ │ Dozzle │ │
│ │ (Status) │ │ (Logs) │ │
│ └─────────────┘ └─────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## 🚀 Quick Setup
### Deploy Full Monitoring Stack
```yaml
# monitoring-stack.yaml
# Core monitoring stack: Prometheus (metrics), Grafana (dashboards),
# Alertmanager (alert routing), node-exporter (host metrics),
# cAdvisor (container metrics).
version: "3.8"

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # Enables config reload via: curl -X POST localhost:9090/-/reload
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # In list-style environment entries quotes become part of the value —
      # pass the raw value (better: use an env_file or Docker secrets).
      - GF_SECURITY_ADMIN_PASSWORD=REDACTED_PASSWORD
      - GF_USERS_ALLOW_SIGN_UP=false
    ports:
      - "3000:3000"
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    volumes:
      # Read-only host mounts so the exporter can read kernel/fs statistics.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ escapes $ from docker-compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    # cAdvisor needs broad host access to read container stats.
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:
```
---
## 📈 Prometheus Configuration
### Main Configuration
```yaml
# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters (Linux hosts)
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node-exporter:9100'
          - 'homelab-vm:9100'
          - 'guava:9100'
          - 'anubis:9100'

  # Synology NAS via SNMP. Targets are the NAS hostnames only: relabeling
  # turns each into the ?target= parameter and redirects the scrape to the
  # snmp-exporter, which performs the SNMP walk against the NAS itself.
  # (Listing "host:9116" here would make the exporter try SNMP on port 9116.)
  - job_name: 'synology'
    static_configs:
      - targets:
          - 'atlantis'
          - 'calypso'
          - 'setillo'
    metrics_path: /snmp
    params:
      module: [synology]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: snmp-exporter:9116

  # Docker containers via cAdvisor
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'cadvisor:8080'
          - 'atlantis:8080'
          - 'calypso:8080'

  # Blackbox exporter for HTTP probes (same target/param relabel pattern
  # as the SNMP job above).
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://plex.vish.gg
          - https://immich.vish.gg
          - https://vault.vish.gg
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # Watchtower metrics — served by its HTTP API under /v1/metrics and
  # protected by the API token. "authorization" replaces the deprecated
  # "bearer_token" field.
  - job_name: 'watchtower'
    metrics_path: /v1/metrics
    authorization:
      type: Bearer
      credentials: "REDACTED_TOKEN"
    static_configs:
      - targets:
          - 'atlantis:8080'
          - 'calypso:8080'
```
### Alert Rules
```yaml
# prometheus/rules/alerts.yml
groups:
  - name: infrastructure
    rules:
      # Host down
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for 2 minutes."

      # High CPU — 100 minus the average idle percentage across all cores
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is {{ $value | printf \"%.2f\" }}%"

      # Low memory
      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host out of memory: {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.2f\" }}%"

      # Disk space
      - alert: HostOutOfDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Disk usage is {{ $value | printf \"%.2f\" }}% on {{ $labels.mountpoint }}"

      # Disk will fill — linear extrapolation of the last 6h of usage
      - alert: HostDiskWillFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24*60*60) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Disk will fill in 24 hours on {{ $labels.instance }}"

  - name: containers
    rules:
      # Container down — a previously-seen container has not been reported
      # by cAdvisor for 5 minutes. absent() is deliberately not used here:
      # it would drop the per-container "name" label needed in the summary.
      - alert: ContainerDown
        expr: time() - container_last_seen{name=~".+"} > 300
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is down"

      # Container high CPU
      - alert: ContainerHighCpu
        expr: (sum by(name) (rate(container_cpu_usage_seconds_total[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "CPU usage is {{ $value | printf \"%.2f\" }}%"

      # Container high memory (relative to its configured limit; containers
      # without a memory limit produce no result for this expression)
      - alert: ContainerHighMemory
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"

  - name: services
    rules:
      # SSL certificate expiring within 14 days ($value is seconds remaining)
      - alert: SSLCertificateExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 14
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon for {{ $labels.instance }}"
          description: "Certificate expires in {{ $value | humanizeDuration }}"

      # HTTP probe failed
      - alert: ServiceDown
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
```
---
## 🔔 Alertmanager Configuration
### Basic Setup with ntfy
```yaml
# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m

# NOTE(review): plain ntfy does not parse Alertmanager's webhook JSON body —
# these receiver URLs assume something in front of the topic endpoints
# translates the payload (e.g. the bridge script in this guide). Verify
# end-to-end delivery before relying on it.
route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'ntfy'
  routes:
    # Critical alerts — dedicated topic, repeated hourly.
    # "matchers" replaces the deprecated "match" syntax (Alertmanager >= 0.22).
    - matchers:
        - severity = "critical"
      receiver: 'ntfy-critical'
      repeat_interval: 1h
    # Warning alerts — default topic, repeated every 4 hours
    - matchers:
        - severity = "warning"
      receiver: 'ntfy'
      repeat_interval: 4h

receivers:
  - name: 'ntfy'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-alerts'
        send_resolved: true
  - name: 'ntfy-critical'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-critical'
        send_resolved: true

# Suppress warning-level notifications while a critical alert with the same
# alertname/instance pair is already firing.
inhibit_rules:
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal: ['alertname', 'instance']
```
### ntfy Integration Script
```python
#!/usr/bin/env python3
# alertmanager-ntfy-bridge.py
#
# Translates Alertmanager webhook payloads into ntfy notifications.
# Alertmanager POSTs its JSON batch to /webhook; each alert in the batch
# is forwarded to ntfy with a title, priority, tag, and topic derived from
# the alert's labels and annotations.
from flask import Flask, request
import requests
import json

app = Flask(__name__)

NTFY_URL = "http://ntfy:80"


@app.route('/webhook', methods=['POST'])
def webhook():
    """Receive an Alertmanager webhook batch and fan it out to ntfy.

    Returns ("OK", 200) once every alert in the payload has been posted.
    A failed POST to ntfy raises, producing a 500 so Alertmanager retries.
    """
    data = request.json
    for alert in data.get('alerts', []):
        status = alert['status']  # "firing" or "resolved"
        labels = alert['labels']
        annotations = alert.get('annotations', {})

        title = f"[{status.upper()}] {labels.get('alertname', 'Alert')}"
        message = annotations.get('description',
                                  annotations.get('summary', 'No description'))

        severity = labels.get('severity')
        priority = "high" if severity == 'critical' else "default"
        # Route critical alerts to the dedicated topic so the split matches
        # the Alertmanager receivers (homelab-critical vs homelab-alerts).
        topic = "homelab-critical" if severity == 'critical' else "homelab-alerts"

        requests.post(
            f"{NTFY_URL}/{topic}",
            headers={
                "Title": title,
                "Priority": priority,
                "Tags": "warning" if status == "firing" else "white_check_mark",
            },
            data=message,
            timeout=10,  # don't hang the webhook if ntfy is unreachable
        )
    return "OK", 200


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```
---
## 📊 Grafana Dashboards
### Essential Dashboards
| Dashboard | ID | Description |
|-----------|-----|-------------|
| Node Exporter Full | 1860 | Complete Linux host metrics |
| Docker Containers | 893 | Container resource usage |
| Synology NAS | 14284 | Synology SNMP metrics |
| Blackbox Exporter | 7587 | HTTP/ICMP probe results |
| Prometheus Stats | 3662 | Prometheus self-monitoring |
### Import Dashboards
```bash
# Via Grafana API: fetch the dashboard JSON from grafana.com by its ID
# (1860 = Node Exporter Full), then import it with the DS_PROMETHEUS input
# mapped to the local Prometheus datasource. (Posting {"id": null} alone
# would create an empty dashboard, not import the published one.)
curl -s https://grafana.com/api/dashboards/1860/revisions/latest/download \
  -o node-exporter-full.json
curl -X POST \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $GRAFANA_API_KEY" \
  -d "{
    \"dashboard\": $(cat node-exporter-full.json),
    \"folderId\": 0,
    \"overwrite\": true,
    \"inputs\": [{\"name\": \"DS_PROMETHEUS\", \"type\": \"datasource\", \"pluginId\": \"prometheus\", \"value\": \"Prometheus\"}]
  }" \
  http://localhost:3000/api/dashboards/import
```
### Custom Dashboard: Homelab Overview
```json
{
  "title": "Homelab Overview",
  "panels": [
    {
      "title": "Active Hosts",
      "type": "stat",
      "targets": [
        { "expr": "count(up == 1)" }
      ]
    },
    {
      "title": "Running Containers",
      "type": "stat",
      "targets": [
        { "expr": "count(container_last_seen)" }
      ]
    },
    {
      "title": "Total Storage Used",
      "type": "gauge",
      "targets": [
        { "expr": "sum(node_filesystem_size_bytes{fstype!='tmpfs'} - node_filesystem_avail_bytes{fstype!='tmpfs'})" }
      ]
    },
    {
      "title": "Network Traffic",
      "type": "timeseries",
      "targets": [
        { "expr": "sum(rate(node_network_receive_bytes_total[5m]))", "legendFormat": "Received" },
        { "expr": "sum(rate(node_network_transmit_bytes_total[5m]))", "legendFormat": "Transmitted" }
      ]
    }
  ]
}
```
---
## 🔍 Uptime Kuma Setup
### Deploy Uptime Kuma
```yaml
# uptime-kuma.yaml
# Self-hosted status/uptime monitor; web UI on port 3001.
version: "3.8"

services:
  uptime-kuma:
    image: louislam/uptime-kuma:latest
    container_name: uptime-kuma
    volumes:
      # Named volume keeps monitor configuration and history across upgrades.
      - uptime-kuma:/app/data
    ports:
      - "3001:3001"
    restart: unless-stopped

volumes:
  uptime-kuma:
```
### Recommended Monitors
| Service | Type | URL/Target | Interval |
|---------|------|------------|----------|
| Plex | HTTP | https://plex.vish.gg | 60s |
| Immich | HTTP | https://immich.vish.gg | 60s |
| Vaultwarden | HTTP | https://vault.vish.gg | 60s |
| Atlantis SSH | TCP Port | atlantis:22 | 120s |
| Pi-hole DNS | DNS | pihole:53 | 60s |
| Grafana | HTTP | http://grafana:3000 | 60s |
### Status Page Setup
```bash
# Create public status page
# Uptime Kuma > Status Pages > Add
# Add relevant monitors
# Share URL: https://status.vish.gg
```
---
## 📜 Log Management with Dozzle
### Deploy Dozzle
```yaml
# dozzle.yaml
# Live Docker log viewer; web UI on host port 8888.
version: "3.8"

services:
  dozzle:
    image: amir20/dozzle:latest
    container_name: dozzle
    volumes:
      # Read-only socket access is all Dozzle needs to stream logs.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    ports:
      - "8888:8080"
    environment:
      # NOTE(review): recent Dozzle versions configure "simple" auth via a
      # /data/users.yml file rather than username/password env vars —
      # confirm these variables are honored by the pinned image version.
      - DOZZLE_AUTH_PROVIDER=simple
      - DOZZLE_USERNAME=admin
      # Quotes in list-style env entries become part of the value — pass raw.
      - DOZZLE_PASSWORD=REDACTED_PASSWORD
    restart: unless-stopped
```
### Multi-Host Log Aggregation
```yaml
# For monitoring multiple Docker hosts, deploy a Dozzle agent on each host.
# The MAIN Dozzle instance then connects to the agents by setting
#   DOZZLE_REMOTE_AGENT=remote-host-1:7007,remote-host-2:7007
# on itself — the agent only listens; it does not dial out to the main host.
# dozzle-agent.yaml (on remote hosts)
version: "3.8"

services:
  dozzle-agent:
    image: amir20/dozzle:latest
    container_name: dozzle-agent
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: agent
    ports:
      # Agent endpoint the main Dozzle instance connects to.
      - "7007:7007"
    restart: unless-stopped
```
---
## 📱 Mobile Monitoring
### ntfy Mobile App
1. Install ntfy app (iOS/Android)
2. Subscribe to topics:
- `homelab-alerts` - All alerts
- `homelab-critical` - Critical only
3. Configure notification settings per topic
### Grafana Mobile
1. Access Grafana via Tailscale: `http://grafana.tailnet:3000`
2. Or expose via reverse proxy with authentication
3. Create mobile-optimized dashboards
---
## 🔧 Maintenance Tasks
### Weekly
- [ ] Review alert history for false positives
- [ ] Check disk space on Prometheus data directory
- [ ] Verify all scraped targets are healthy
### Monthly
- [ ] Update Grafana dashboards
- [ ] Review and tune alert thresholds
- [ ] Clean up old Prometheus data if needed
- [ ] Test alerting pipeline
### Quarterly
- [ ] Review monitoring coverage
- [ ] Add monitors for new services
- [ ] Update documentation
---
## 🔗 Related Documentation
- [Performance Troubleshooting](../troubleshooting/performance.md)
- [Alerting Setup](alerting-setup.md)
- [Service Architecture](../diagrams/service-architecture.md)
- [Common Issues](../troubleshooting/common-issues.md)