# 📊 Monitoring & Observability Guide
|
|
|
|
## Overview
|
|
|
|
This guide covers the complete monitoring stack for the homelab, including metrics collection, visualization, alerting, and log management.
|
|
|
|
---
|
|
|
|
## 🏗️ Monitoring Architecture
|
|
|
|
```
|
|
┌─────────────────────────────────────────────────────────────────────────────┐
│                              MONITORING STACK                               │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────┐    ┌─────────────┐   ┌─────────────┐   ┌─────────────┐     │
│  │ Prometheus  │◄───│    Node     │   │    SNMP     │   │  cAdvisor   │     │
│  │  (Metrics)  │    │  Exporter   │   │  Exporter   │   │ (Containers)│     │
│  └──────┬──────┘    └─────────────┘   └─────────────┘   └─────────────┘     │
│         │                                                                   │
│         ▼                                                                   │
│  ┌─────────────┐    ┌─────────────┐                                         │
│  │   Grafana   │    │ Alertmanager│──► ntfy / Signal / Email                │
│  │ (Dashboard) │    │  (Alerts)   │                                         │
│  └─────────────┘    └─────────────┘                                         │
│                                                                             │
│  ┌─────────────┐    ┌─────────────┐                                         │
│  │ Uptime Kuma │    │   Dozzle    │                                         │
│  │  (Status)   │    │   (Logs)    │                                         │
│  └─────────────┘    └─────────────┘                                         │
│                                                                             │
└─────────────────────────────────────────────────────────────────────────────┘
```
|
|
|
|
---
|
|
|
|
## 🚀 Quick Setup
|
|
|
|
### Deploy Full Monitoring Stack
|
|
|
|
```yaml
|
|
# monitoring-stack.yaml
# Core monitoring stack: Prometheus (metrics), Grafana (dashboards),
# Alertmanager (alert routing), node-exporter + cAdvisor (collectors).
version: "3.8"

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # enables config reload via POST /-/reload (no restart needed)
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # In list-form environment entries Compose passes quotes through
      # literally (KEY="x" sets the value to "x", quotes included) —
      # keep the value unquoted.
      - GF_SECURITY_ADMIN_PASSWORD=REDACTED_PASSWORD
      - GF_USERS_ALLOW_SIGN_UP=false
    ports:
      - "3000:3000"
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    volumes:
      # read-only host mounts so the exporter can see host metrics
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ escapes a literal '$' from Compose variable interpolation
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    # privileged is required for cAdvisor to read host cgroup/device data
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:
```
|
|
|
|
---
|
|
|
|
## 📈 Prometheus Configuration
|
|
|
|
### Main Configuration
|
|
|
|
```yaml
|
|
# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters (Linux hosts)
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node-exporter:9100'
          - 'homelab-vm:9100'
          - 'guava:9100'
          - 'anubis:9100'

  # Synology NAS via SNMP. The relabeling rewrites each target so the
  # snmp-exporter performs the actual probe while 'instance' keeps the
  # NAS hostname for dashboards/alerts.
  - job_name: 'synology'
    static_configs:
      - targets:
          - 'atlantis:9116'
          - 'calypso:9116'
          - 'setillo:9116'
    metrics_path: /snmp
    params:
      module: [synology]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: snmp-exporter:9116

  # Docker containers via cAdvisor
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'cadvisor:8080'
          - 'atlantis:8080'
          - 'calypso:8080'

  # Blackbox exporter for HTTP probes (same relabel pattern as SNMP above)
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://plex.vish.gg
          - https://immich.vish.gg
          - https://vault.vish.gg
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # Watchtower metrics — Watchtower serves Prometheus metrics at
  # /v1/metrics (not the default /metrics), guarded by a bearer token.
  - job_name: 'watchtower'
    metrics_path: /v1/metrics
    # 'authorization' supersedes the deprecated top-level 'bearer_token'
    authorization:
      type: Bearer
      credentials: "REDACTED_TOKEN"
    static_configs:
      - targets:
          - 'atlantis:8080'
          - 'calypso:8080'
    # NOTE(review): these host:port pairs overlap the cadvisor job's
    # targets — confirm Watchtower's HTTP API is what listens on 8080 here.
```
|
|
|
|
### Alert Rules
|
|
|
|
```yaml
|
|
# prometheus/rules/alerts.yml
groups:
  - name: infrastructure
    rules:
      # Host down
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for 2 minutes."

      # High CPU — 100 minus the idle percentage, averaged per host
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is {{ $value | printf \"%.2f\" }}%"

      # Low memory — uses MemAvailable (accounts for reclaimable cache)
      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host out of memory: {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.2f\" }}%"

      # Disk space (tmpfs excluded — RAM-backed, not real disk)
      - alert: HostOutOfDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Disk usage is {{ $value | printf \"%.2f\" }}% on {{ $labels.mountpoint }}"

      # Disk will fill — linear projection over the last 6h of data
      - alert: HostDiskWillFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24*60*60) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Disk will fill in 24 hours on {{ $labels.instance }}"

  - name: containers
    rules:
      # Container down.
      # NOTE(review): absent() with a regex matcher only fires when NO
      # series matches at all — it will not catch a single missing
      # container among many. Consider per-container absent() rules.
      - alert: ContainerDown
        expr: absent(container_last_seen{name=~".+"})
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is down"

      # Container high CPU
      # (alert name restored — it had been clobbered by secret redaction)
      - alert: ContainerHighCpu
        expr: (sum by(name) (rate(container_cpu_usage_seconds_total[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "CPU usage is {{ $value | printf \"%.2f\" }}%"

      # Container high memory (relative to its configured limit)
      - alert: ContainerHighMemory
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"

  - name: services
    rules:
      # SSL certificate expiring within 14 days
      - alert: SSLCertificateExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 14
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon for {{ $labels.instance }}"
          # humanizeDuration renders the remaining seconds as e.g. "13d 4h"
          # (template restored — it had been clobbered by secret redaction)
          description: "Certificate expires in {{ $value | humanizeDuration }}"

      # HTTP probe failed
      - alert: ServiceDown
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
```
|
|
|
|
---
|
|
|
|
## 🔔 Alertmanager Configuration
|
|
|
|
### Basic Setup with ntfy
|
|
|
|
```yaml
|
|
# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'ntfy'

  routes:
    # Critical alerts — re-notify hourly until resolved
    # ('matchers' replaces the deprecated 'match' syntax, AM >= 0.22)
    - matchers:
        - severity = "critical"
      receiver: 'ntfy-critical'
      repeat_interval: 1h

    # Warning alerts
    - matchers:
        - severity = "warning"
      receiver: 'ntfy'
      repeat_interval: 4h

receivers:
  # NOTE(review): ntfy does not understand Alertmanager's webhook JSON
  # natively — if notifications arrive as raw JSON, point these URLs at
  # the alertmanager-ntfy-bridge from this guide (port 5000, /webhook).
  - name: 'ntfy'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-alerts'
        send_resolved: true

  - name: 'ntfy-critical'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-critical'
        send_resolved: true

# Suppress warning-level alerts while a critical alert for the same
# alertname/instance pair is already firing.
inhibit_rules:
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal: ['alertname', 'instance']
```
|
|
|
|
### ntfy Integration Script
|
|
|
|
```python
|
|
#!/usr/bin/env python3
# alertmanager-ntfy-bridge.py
#
# Minimal bridge between Alertmanager's webhook payload and ntfy.
# Alertmanager POSTs a JSON document to /webhook; each alert inside it
# is forwarded to the ntfy topic as a human-readable notification.
from flask import Flask, request
import requests

app = Flask(__name__)

NTFY_URL = "http://ntfy:80"


@app.route('/webhook', methods=['POST'])
def webhook():
    """Translate one Alertmanager webhook POST into ntfy notifications.

    Returns ("OK", 200) unconditionally; a malformed or empty JSON body
    simply produces no notifications instead of a 500.
    """
    # get_json(silent=True) returns None instead of raising on a bad or
    # missing JSON body; fall back to an empty payload.
    data = request.get_json(silent=True) or {}

    for alert in data.get('alerts', []):
        # Use .get() throughout — a partial alert object should not
        # crash the handler mid-batch.
        status = alert.get('status', 'unknown')
        labels = alert.get('labels', {})
        annotations = alert.get('annotations', {})

        title = f"[{status.upper()}] {labels.get('alertname', 'Alert')}"
        message = annotations.get('description', annotations.get('summary', 'No description'))

        # Critical alerts get ntfy's "high" priority so the mobile app
        # can surface them past muted/quiet-hour settings.
        priority = "high" if labels.get('severity') == 'critical' else "default"

        requests.post(
            f"{NTFY_URL}/homelab-alerts",
            headers={
                "Title": title,
                "Priority": priority,
                "Tags": "warning" if status == "firing" else "white_check_mark"
            },
            data=message,
            timeout=10,  # don't hang the webhook handler on a slow ntfy
        )

    return "OK", 200


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```
|
|
|
|
---
|
|
|
|
## 📊 Grafana Dashboards
|
|
|
|
### Essential Dashboards
|
|
|
|
| Dashboard | ID | Description |
|
|
|-----------|-----|-------------|
|
|
| Node Exporter Full | 1860 | Complete Linux host metrics |
|
|
| Docker Containers | 893 | Container resource usage |
|
|
| Synology NAS | 14284 | Synology SNMP metrics |
|
|
| Blackbox Exporter | 7587 | HTTP/ICMP probe results |
|
|
| Prometheus Stats | 3662 | Prometheus self-monitoring |
|
|
|
|
### Import Dashboards
|
|
|
|
```bash
|
|
# Via Grafana API
|
|
curl -X POST \
|
|
-H "Content-Type: application/json" \
|
|
-H "Authorization: Bearer $GRAFANA_API_KEY" \
|
|
-d '{
|
|
"dashboard": {"id": null, "title": "Node Exporter Full"},
|
|
"folderId": 0,
|
|
"overwrite": true,
|
|
"inputs": [{"name": "DS_PROMETHEUS", "type": "datasource", "value": "Prometheus"}]
|
|
}' \
|
|
http://localhost:3000/api/dashboards/import
|
|
```
|
|
|
|
### Custom Dashboard: Homelab Overview
|
|
|
|
```json
|
|
{
|
|
"title": "Homelab Overview",
|
|
"panels": [
|
|
{
|
|
"title": "Active Hosts",
|
|
"type": "stat",
|
|
"targets": [{"expr": "count(up == 1)"}]
|
|
},
|
|
{
|
|
"title": "Running Containers",
|
|
"type": "stat",
|
|
"targets": [{"expr": "count(container_last_seen)"}]
|
|
},
|
|
{
|
|
"title": "Total Storage Used",
|
|
"type": "gauge",
|
|
"targets": [{"expr": "sum(node_filesystem_size_bytes{fstype!='tmpfs'} - node_filesystem_avail_bytes{fstype!='tmpfs'})"}]
|
|
},
|
|
{
|
|
"title": "Network Traffic",
|
|
"type": "timeseries",
|
|
"targets": [
|
|
{"expr": "sum(rate(node_network_receive_bytes_total[5m]))", "legendFormat": "Received"},
|
|
{"expr": "sum(rate(node_network_transmit_bytes_total[5m]))", "legendFormat": "Transmitted"}
|
|
]
|
|
}
|
|
]
|
|
}
|
|
```
|
|
|
|
---
|
|
|
|
## 🔍 Uptime Kuma Setup
|
|
|
|
### Deploy Uptime Kuma
|
|
|
|
```yaml
|
|
# uptime-kuma.yaml
|
|
version: "3.8"
|
|
services:
|
|
uptime-kuma:
|
|
image: louislam/uptime-kuma:latest
|
|
container_name: uptime-kuma
|
|
volumes:
|
|
- uptime-kuma:/app/data
|
|
ports:
|
|
- "3001:3001"
|
|
restart: unless-stopped
|
|
|
|
volumes:
|
|
uptime-kuma:
|
|
```
|
|
|
|
### Recommended Monitors
|
|
|
|
| Service | Type | URL/Target | Interval |
|
|
|---------|------|------------|----------|
|
|
| Plex | HTTP | https://plex.vish.gg | 60s |
|
|
| Immich | HTTP | https://immich.vish.gg | 60s |
|
|
| Vaultwarden | HTTP | https://vault.vish.gg | 60s |
|
|
| Atlantis SSH | TCP Port | atlantis:22 | 120s |
|
|
| Pi-hole DNS | DNS | pihole:53 | 60s |
|
|
| Grafana | HTTP | http://grafana:3000 | 60s |
|
|
|
|
### Status Page Setup
|
|
|
|
```bash
|
|
# Create public status page
|
|
# Uptime Kuma > Status Pages > Add
|
|
# Add relevant monitors
|
|
# Share URL: https://status.vish.gg
|
|
```
|
|
|
|
---
|
|
|
|
## 📜 Log Management with Dozzle
|
|
|
|
### Deploy Dozzle
|
|
|
|
```yaml
|
|
# dozzle.yaml
# Dozzle: real-time web UI for Docker container logs.
version: "3.8"
services:
  dozzle:
    image: amir20/dozzle:latest
    container_name: dozzle
    volumes:
      # read-only socket access is enough for log streaming
      - /var/run/docker.sock:/var/run/docker.sock:ro
    ports:
      - "8888:8080"
    environment:
      - DOZZLE_AUTH_PROVIDER=simple
      # NOTE(review): recent Dozzle releases configure simple auth via a
      # /data/users.yml file rather than these env vars — confirm against
      # the Dozzle documentation for the pinned version.
      - DOZZLE_USERNAME=admin
      # Unquoted on purpose: in list-form env entries Compose keeps
      # quotes as part of the value (KEY="x" -> literal "x" with quotes).
      - DOZZLE_PASSWORD=REDACTED_PASSWORD
    restart: unless-stopped
```
|
|
|
|
### Multi-Host Log Aggregation
|
|
|
|
```yaml
|
|
# For monitoring multiple Docker hosts
|
|
# Deploy Dozzle agent on each host:
|
|
|
|
# dozzle-agent.yaml (on remote hosts)
|
|
version: "3.8"
|
|
services:
|
|
dozzle-agent:
|
|
image: amir20/dozzle:latest
|
|
container_name: dozzle-agent
|
|
volumes:
|
|
- /var/run/docker.sock:/var/run/docker.sock:ro
|
|
command: agent
|
|
environment:
|
|
- DOZZLE_REMOTE_HOST=tcp://main-dozzle:7007
|
|
restart: unless-stopped
|
|
```
|
|
|
|
---
|
|
|
|
## 📱 Mobile Monitoring
|
|
|
|
### ntfy Mobile App
|
|
|
|
1. Install ntfy app (iOS/Android)
|
|
2. Subscribe to topics:
|
|
- `homelab-alerts` - All alerts
|
|
- `homelab-critical` - Critical only
|
|
3. Configure notification settings per topic
|
|
|
|
### Grafana Mobile
|
|
|
|
1. Access Grafana via Tailscale: `http://grafana.tailnet:3000`
|
|
2. Or expose via reverse proxy with authentication
|
|
3. Create mobile-optimized dashboards
|
|
|
|
---
|
|
|
|
## 🔧 Maintenance Tasks
|
|
|
|
### Weekly
|
|
- [ ] Review alert history for false positives
|
|
- [ ] Check disk space on Prometheus data directory
|
|
- [ ] Verify all scraped targets are healthy
|
|
|
|
### Monthly
|
|
- [ ] Update Grafana dashboards
|
|
- [ ] Review and tune alert thresholds
|
|
- [ ] Clean up old Prometheus data if needed
|
|
- [ ] Test alerting pipeline
|
|
|
|
### Quarterly
|
|
- [ ] Review monitoring coverage
|
|
- [ ] Add monitors for new services
|
|
- [ ] Update documentation
|
|
|
|
---
|
|
|
|
## 🔗 Related Documentation
|
|
|
|
- [Performance Troubleshooting](../troubleshooting/performance.md)
|
|
- [Alerting Setup](alerting-setup.md)
|
|
- [Service Architecture](../diagrams/service-architecture.md)
|
|
- [Common Issues](../troubleshooting/common-issues.md)
|