Sanitized mirror from private repository - 2026-03-20 09:49:40 UTC
Some checks failed
Documentation / Build Docusaurus (push) Failing after 18m5s
Documentation / Deploy to GitHub Pages (push) Has been skipped

This commit is contained in:
Gitea Mirror Bot
2026-03-20 09:49:40 +00:00
commit 3cb5034cc6
1231 changed files with 305915 additions and 0 deletions

602
docs/admin/monitoring.md Normal file
View File

@@ -0,0 +1,602 @@
# 📊 Monitoring & Observability Guide
## Overview
This guide covers the complete monitoring stack for the homelab, including metrics collection, visualization, alerting, and log management.
---
## 🏗️ Monitoring Architecture
```
┌─────────────────────────────────────────────────────────────────────────────┐
│ MONITORING STACK │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │◄───│ Node │ │ SNMP │ │ cAdvisor │ │
│ │ (Metrics) │ │ Exporter │ │ Exporter │ │ (Containers)│ │
│ └──────┬──────┘ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ Grafana │ │ Alertmanager│──► ntfy / Signal / Email │
│ │ (Dashboard) │ │ (Alerts) │ │
│ └─────────────┘ └─────────────┘ │
│ │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ Uptime Kuma │ │ Dozzle │ │
│ │ (Status) │ │ (Logs) │ │
│ └─────────────┘ └─────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
```
---
## 🚀 Quick Setup
### Deploy Full Monitoring Stack
```yaml
# monitoring-stack.yaml
# Core monitoring stack: Prometheus (metrics), Grafana (dashboards),
# Alertmanager (alert routing), node-exporter (host metrics),
# cAdvisor (container metrics).
version: "3.8"

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # Enables config reload via: curl -X POST localhost:9090/-/reload
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    restart: unless-stopped

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # In list-style environment entries quotes become part of the value —
      # pass the raw value (better: use an env_file or Docker secrets).
      - GF_SECURITY_ADMIN_PASSWORD=REDACTED_PASSWORD
      - GF_USERS_ALLOW_SIGN_UP=false
    ports:
      - "3000:3000"
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    volumes:
      # Read-only host mounts so the exporter can read kernel/fs statistics.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ escapes $ from docker-compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: cadvisor
    # cAdvisor needs broad host access to read container stats.
    privileged: true
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    restart: unless-stopped

volumes:
  prometheus_data:
  grafana_data:
```
---
## 📈 Prometheus Configuration
### Main Configuration
```yaml
# prometheus/prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node exporters (Linux hosts)
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node-exporter:9100'
          - 'homelab-vm:9100'
          - 'guava:9100'
          - 'anubis:9100'

  # Synology NAS via SNMP. Targets are the NAS hostnames only: relabeling
  # turns each into the ?target= parameter and redirects the scrape to the
  # snmp-exporter, which performs the SNMP walk against the NAS itself.
  # (Listing "host:9116" here would make the exporter try SNMP on port 9116.)
  - job_name: 'synology'
    static_configs:
      - targets:
          - 'atlantis'
          - 'calypso'
          - 'setillo'
    metrics_path: /snmp
    params:
      module: [synology]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: snmp-exporter:9116

  # Docker containers via cAdvisor
  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - 'cadvisor:8080'
          - 'atlantis:8080'
          - 'calypso:8080'

  # Blackbox exporter for HTTP probes (same target/param relabel pattern
  # as the SNMP job above).
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://plex.vish.gg
          - https://immich.vish.gg
          - https://vault.vish.gg
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: blackbox-exporter:9115

  # Watchtower metrics — served by its HTTP API under /v1/metrics and
  # protected by the API token. "authorization" replaces the deprecated
  # "bearer_token" field.
  - job_name: 'watchtower'
    metrics_path: /v1/metrics
    authorization:
      type: Bearer
      credentials: "REDACTED_TOKEN"
    static_configs:
      - targets:
          - 'atlantis:8080'
          - 'calypso:8080'
```
### Alert Rules
```yaml
# prometheus/rules/alerts.yml
groups:
  - name: infrastructure
    rules:
      # Host down
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for 2 minutes."

      # High CPU — 100 minus the average idle percentage across all cores
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is {{ $value | printf \"%.2f\" }}%"

      # Low memory
      - alert: HostOutOfMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Host out of memory: {{ $labels.instance }}"
          description: "Memory usage is {{ $value | printf \"%.2f\" }}%"

      # Disk space
      - alert: HostOutOfDiskSpace
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space low on {{ $labels.instance }}"
          description: "Disk usage is {{ $value | printf \"%.2f\" }}% on {{ $labels.mountpoint }}"

      # Disk will fill — linear extrapolation of the last 6h of usage
      - alert: HostDiskWillFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24*60*60) < 0
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Disk will fill in 24 hours on {{ $labels.instance }}"

  - name: containers
    rules:
      # Container down — a previously-seen container has not been reported
      # by cAdvisor for 5 minutes. absent() is deliberately not used here:
      # it would drop the per-container "name" label needed in the summary.
      - alert: ContainerDown
        expr: time() - container_last_seen{name=~".+"} > 300
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is down"

      # Container high CPU
      - alert: ContainerHighCpu
        expr: (sum by(name) (rate(container_cpu_usage_seconds_total[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "CPU usage is {{ $value | printf \"%.2f\" }}%"

      # Container high memory (relative to its configured limit; containers
      # without a memory limit produce no result for this expression)
      - alert: ContainerHighMemory
        expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"

  - name: services
    rules:
      # SSL certificate expiring within 14 days ($value is seconds remaining)
      - alert: SSLCertificateExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 14
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon for {{ $labels.instance }}"
          description: "Certificate expires in {{ $value | humanizeDuration }}"

      # HTTP probe failed
      - alert: ServiceDown
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"
```
---
## 🔔 Alertmanager Configuration
### Basic Setup with ntfy
```yaml
# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m

# NOTE(review): plain ntfy does not parse Alertmanager's webhook JSON body —
# these receiver URLs assume something in front of the topic endpoints
# translates the payload (e.g. the bridge script in this guide). Verify
# end-to-end delivery before relying on it.
route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'ntfy'
  routes:
    # Critical alerts — dedicated topic, repeated hourly.
    # "matchers" replaces the deprecated "match" syntax (Alertmanager >= 0.22).
    - matchers:
        - severity = "critical"
      receiver: 'ntfy-critical'
      repeat_interval: 1h
    # Warning alerts — default topic, repeated every 4 hours
    - matchers:
        - severity = "warning"
      receiver: 'ntfy'
      repeat_interval: 4h

receivers:
  - name: 'ntfy'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-alerts'
        send_resolved: true
  - name: 'ntfy-critical'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-critical'
        send_resolved: true

# Suppress warning-level notifications while a critical alert with the same
# alertname/instance pair is already firing.
inhibit_rules:
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal: ['alertname', 'instance']
```
### ntfy Integration Script
```python
#!/usr/bin/env python3
# alertmanager-ntfy-bridge.py
#
# Translates Alertmanager webhook payloads into ntfy notifications.
# Alertmanager POSTs its JSON batch to /webhook; each alert in the batch
# is forwarded to ntfy with a title, priority, tag, and topic derived from
# the alert's labels and annotations.
from flask import Flask, request
import requests
import json

app = Flask(__name__)

NTFY_URL = "http://ntfy:80"


@app.route('/webhook', methods=['POST'])
def webhook():
    """Receive an Alertmanager webhook batch and fan it out to ntfy.

    Returns ("OK", 200) once every alert in the payload has been posted.
    A failed POST to ntfy raises, producing a 500 so Alertmanager retries.
    """
    data = request.json
    for alert in data.get('alerts', []):
        status = alert['status']  # "firing" or "resolved"
        labels = alert['labels']
        annotations = alert.get('annotations', {})

        title = f"[{status.upper()}] {labels.get('alertname', 'Alert')}"
        message = annotations.get('description',
                                  annotations.get('summary', 'No description'))

        severity = labels.get('severity')
        priority = "high" if severity == 'critical' else "default"
        # Route critical alerts to the dedicated topic so the split matches
        # the Alertmanager receivers (homelab-critical vs homelab-alerts).
        topic = "homelab-critical" if severity == 'critical' else "homelab-alerts"

        requests.post(
            f"{NTFY_URL}/{topic}",
            headers={
                "Title": title,
                "Priority": priority,
                "Tags": "warning" if status == "firing" else "white_check_mark",
            },
            data=message,
            timeout=10,  # don't hang the webhook if ntfy is unreachable
        )
    return "OK", 200


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
```
---
## 📊 Grafana Dashboards
### Essential Dashboards
| Dashboard | ID | Description |
|-----------|-----|-------------|
| Node Exporter Full | 1860 | Complete Linux host metrics |
| Docker Containers | 893 | Container resource usage |
| Synology NAS | 14284 | Synology SNMP metrics |
| Blackbox Exporter | 7587 | HTTP/ICMP probe results |
| Prometheus Stats | 3662 | Prometheus self-monitoring |
### Import Dashboards
```bash
# Via Grafana API: fetch the dashboard JSON from grafana.com by its ID
# (1860 = Node Exporter Full), then import it with the DS_PROMETHEUS input
# mapped to the local Prometheus datasource. (Posting {"id": null} alone
# would create an empty dashboard, not import the published one.)
curl -s https://grafana.com/api/dashboards/1860/revisions/latest/download \
  -o node-exporter-full.json
curl -X POST \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $GRAFANA_API_KEY" \
  -d "{
    \"dashboard\": $(cat node-exporter-full.json),
    \"folderId\": 0,
    \"overwrite\": true,
    \"inputs\": [{\"name\": \"DS_PROMETHEUS\", \"type\": \"datasource\", \"pluginId\": \"prometheus\", \"value\": \"Prometheus\"}]
  }" \
  http://localhost:3000/api/dashboards/import
```
### Custom Dashboard: Homelab Overview
```json
{
  "title": "Homelab Overview",
  "panels": [
    {
      "title": "Active Hosts",
      "type": "stat",
      "targets": [
        { "expr": "count(up == 1)" }
      ]
    },
    {
      "title": "Running Containers",
      "type": "stat",
      "targets": [
        { "expr": "count(container_last_seen)" }
      ]
    },
    {
      "title": "Total Storage Used",
      "type": "gauge",
      "targets": [
        { "expr": "sum(node_filesystem_size_bytes{fstype!='tmpfs'} - node_filesystem_avail_bytes{fstype!='tmpfs'})" }
      ]
    },
    {
      "title": "Network Traffic",
      "type": "timeseries",
      "targets": [
        { "expr": "sum(rate(node_network_receive_bytes_total[5m]))", "legendFormat": "Received" },
        { "expr": "sum(rate(node_network_transmit_bytes_total[5m]))", "legendFormat": "Transmitted" }
      ]
    }
  ]
}
```
---
## 🔍 Uptime Kuma Setup
### Deploy Uptime Kuma
```yaml
# uptime-kuma.yaml
# Self-hosted status/uptime monitor; web UI on port 3001.
version: "3.8"

services:
  uptime-kuma:
    image: louislam/uptime-kuma:latest
    container_name: uptime-kuma
    volumes:
      # Named volume keeps monitor configuration and history across upgrades.
      - uptime-kuma:/app/data
    ports:
      - "3001:3001"
    restart: unless-stopped

volumes:
  uptime-kuma:
```
### Recommended Monitors
| Service | Type | URL/Target | Interval |
|---------|------|------------|----------|
| Plex | HTTP | https://plex.vish.gg | 60s |
| Immich | HTTP | https://immich.vish.gg | 60s |
| Vaultwarden | HTTP | https://vault.vish.gg | 60s |
| Atlantis SSH | TCP Port | atlantis:22 | 120s |
| Pi-hole DNS | DNS | pihole:53 | 60s |
| Grafana | HTTP | http://grafana:3000 | 60s |
### Status Page Setup
```bash
# Create public status page
# Uptime Kuma > Status Pages > Add
# Add relevant monitors
# Share URL: https://status.vish.gg
```
---
## 📜 Log Management with Dozzle
### Deploy Dozzle
```yaml
# dozzle.yaml
# Live Docker log viewer; web UI on host port 8888.
version: "3.8"

services:
  dozzle:
    image: amir20/dozzle:latest
    container_name: dozzle
    volumes:
      # Read-only socket access is all Dozzle needs to stream logs.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    ports:
      - "8888:8080"
    environment:
      # NOTE(review): recent Dozzle versions configure "simple" auth via a
      # /data/users.yml file rather than username/password env vars —
      # confirm these variables are honored by the pinned image version.
      - DOZZLE_AUTH_PROVIDER=simple
      - DOZZLE_USERNAME=admin
      # Quotes in list-style env entries become part of the value — pass raw.
      - DOZZLE_PASSWORD=REDACTED_PASSWORD
    restart: unless-stopped
```
### Multi-Host Log Aggregation
```yaml
# For monitoring multiple Docker hosts, deploy a Dozzle agent on each host.
# The MAIN Dozzle instance then connects to the agents by setting
#   DOZZLE_REMOTE_AGENT=remote-host-1:7007,remote-host-2:7007
# on itself — the agent only listens; it does not dial out to the main host.
# dozzle-agent.yaml (on remote hosts)
version: "3.8"

services:
  dozzle-agent:
    image: amir20/dozzle:latest
    container_name: dozzle-agent
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: agent
    ports:
      # Agent endpoint the main Dozzle instance connects to.
      - "7007:7007"
    restart: unless-stopped
```
---
## 📱 Mobile Monitoring
### ntfy Mobile App
1. Install ntfy app (iOS/Android)
2. Subscribe to topics:
- `homelab-alerts` - All alerts
- `homelab-critical` - Critical only
3. Configure notification settings per topic
### Grafana Mobile
1. Access Grafana via Tailscale: `http://grafana.tailnet:3000`
2. Or expose via reverse proxy with authentication
3. Create mobile-optimized dashboards
---
## 🔧 Maintenance Tasks
### Weekly
- [ ] Review alert history for false positives
- [ ] Check disk space on Prometheus data directory
- [ ] Verify all scraped targets are healthy
### Monthly
- [ ] Update Grafana dashboards
- [ ] Review and tune alert thresholds
- [ ] Clean up old Prometheus data if needed
- [ ] Test alerting pipeline
### Quarterly
- [ ] Review monitoring coverage
- [ ] Add monitors for new services
- [ ] Update documentation
---
## 🔗 Related Documentation
- [Performance Troubleshooting](../troubleshooting/performance.md)
- [Alerting Setup](alerting-setup.md)
- [Service Architecture](../diagrams/service-architecture.md)
- [Common Issues](../troubleshooting/common-issues.md)