Files
homelab-optimized/docs/admin/monitoring.md
Gitea Mirror Bot de732aade2
Some checks failed
Documentation / Build Docusaurus (push) Failing after 8s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-03-13 08:06:19 UTC
2026-03-13 08:06:19 +00:00

16 KiB

📊 Monitoring & Observability Guide

Overview

This guide covers the complete monitoring stack for the homelab, including metrics collection, visualization, alerting, and log management.


🏗️ Monitoring Architecture

┌─────────────────────────────────────────────────────────────────────────────┐
│                        MONITORING STACK                                      │
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                              │
│   ┌─────────────┐    ┌─────────────┐    ┌─────────────┐    ┌─────────────┐ │
│   │  Prometheus │◄───│   Node      │    │   SNMP      │    │  cAdvisor   │ │
│   │  (Metrics)  │    │  Exporter   │    │  Exporter   │    │ (Containers)│ │
│   └──────┬──────┘    └─────────────┘    └─────────────┘    └─────────────┘ │
│          │                                                                   │
│          ▼                                                                   │
│   ┌─────────────┐    ┌─────────────┐                                        │
│   │   Grafana   │    │ Alertmanager│──► ntfy / Signal / Email               │
│   │ (Dashboard) │    │  (Alerts)   │                                        │
│   └─────────────┘    └─────────────┘                                        │
│                                                                              │
│   ┌─────────────┐    ┌─────────────┐                                        │
│   │ Uptime Kuma │    │   Dozzle    │                                        │
│   │  (Status)   │    │   (Logs)    │                                        │
│   └─────────────┘    └─────────────┘                                        │
│                                                                              │
└─────────────────────────────────────────────────────────────────────────────┘

🚀 Quick Setup

Deploy Full Monitoring Stack

# monitoring-stack.yaml
version: "3.8"

services:
  # Prometheus server: scrapes the configured targets and stores the samples
  # in the named volume mounted at /prometheus.
  prometheus:
    container_name: prometheus
    image: prom/prometheus:latest
    restart: unless-stopped
    ports:
      - "9090:9090"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      # Keep 30 days of samples on disk.
      - "--storage.tsdb.retention.time=30d"
      - "--web.enable-lifecycle"
    volumes:
      # Main configuration and the alert rule directory come from the host.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      - prometheus_data:/prometheus

  # Grafana: dashboards on top of the collected metrics.
  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # Mapping form on purpose: in list form (`- KEY="value"`) the double
      # quotes are passed to the container as part of the value, so the admin
      # password would literally include them.
      GF_SECURITY_ADMIN_PASSWORD: "REDACTED_PASSWORD"
      # Quoted so the value stays a string rather than a YAML boolean.
      GF_USERS_ALLOW_SIGN_UP: "false"
    ports:
      - "3000:3000"
    restart: unless-stopped

  # Alertmanager: receives alerts and forwards them to the configured
  # receivers; its configuration file is mounted from the host.
  alertmanager:
    container_name: alertmanager
    image: prom/alertmanager:latest
    restart: unless-stopped
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml

  # node_exporter: host metrics, read from the host's /proc, /sys and root
  # filesystem, all mounted read-only into the container.
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the exporter at the host root mounted above; without this flag
      # the /rootfs mount is never used and filesystem metrics describe the
      # container's own view instead of the host's.
      - '--path.rootfs=/rootfs'
      # `$$` is Compose escaping for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9100:9100"
    restart: unless-stopped

  # cAdvisor: per-container resource metrics. Runs privileged with read-only
  # views of the host root, /var/run, /sys and the Docker data directory.
  cadvisor:
    container_name: cadvisor
    image: gcr.io/cadvisor/cadvisor:latest
    privileged: true
    restart: unless-stopped
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro

# Named volumes that persist Prometheus samples and Grafana state across
# container re-creation.
volumes:
  prometheus_data:
  grafana_data:

📈 Prometheus Configuration

Main Configuration

# prometheus/prometheus.yml
# Scrape targets and evaluate rules every 15 seconds.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

# Alertmanager instance that receives firing alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Every .yml file in the rules directory is loaded as a rule file.
rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # Linux hosts exposing node_exporter on port 9100.
  - job_name: "node"
    static_configs:
      - targets:
          - "node-exporter:9100"
          - "homelab-vm:9100"
          - "guava:9100"
          - "anubis:9100"

  # Synology NAS via SNMP.
  # The targets are the NAS devices themselves. The relabel rules copy each
  # target into the `target` query parameter (and the `instance` label), then
  # redirect the actual scrape to the snmp-exporter service.
  - job_name: 'synology'
    static_configs:
      - targets:
        # Hostnames only: whatever is listed here becomes the SNMP target, so
        # the exporter's own port (9116) must not be appended or the exporter
        # would query the NAS on that port instead of the SNMP port.
        - 'atlantis'
        - 'calypso'
        - 'setillo'
    metrics_path: /snmp
    params:
      module: [synology]
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: snmp-exporter:9116

  # Container metrics from cAdvisor: the instance in this stack plus the
  # ones on atlantis and calypso.
  - job_name: "cadvisor"
    static_configs:
      - targets: ["cadvisor:8080", "atlantis:8080", "calypso:8080"]

  # HTTP probes through the blackbox exporter.
  # Each listed URL is moved into the `target` query parameter and the
  # `instance` label; the scrape itself goes to blackbox-exporter:9115.
  - job_name: "blackbox"
    metrics_path: /probe
    params:
      module:
        - http_2xx
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - replacement: "blackbox-exporter:9115"
        target_label: __address__
    static_configs:
      - targets:
          - "https://plex.vish.gg"
          - "https://immich.vish.gg"
          - "https://vault.vish.gg"

  # Watchtower metrics.
  # Watchtower serves its metrics under /v1/metrics and requires the API
  # token as a bearer credential; the default /metrics path would not match.
  - job_name: 'watchtower'
    metrics_path: /v1/metrics
    # `authorization` replaces the deprecated top-level `bearer_token` field;
    # the type defaults to Bearer.
    authorization:
      credentials: "REDACTED_TOKEN"
    static_configs:
      - targets:
        # NOTE(review): these host:port pairs are also listed under the
        # cadvisor job; confirm which port Watchtower is really published on.
        - 'atlantis:8080'
        - 'calypso:8080'

Alert Rules

# prometheus/rules/alerts.yml
groups:
  # Host-level alerts.
  - name: infrastructure
    rules:
      # Any scrape target whose `up` metric has been 0 for two minutes.
      - alert: HostDown
        for: 2m
        expr: 'up == 0'
        labels:
          severity: critical
        annotations:
          summary: 'Host {{ $labels.instance }} is down'
          description: '{{ $labels.instance }} has been unreachable for 2 minutes.'

      # High CPU: average non-idle CPU share per instance above 80% for 5m.
      # Uses rate() rather than irate(): irate() only looks at the last two
      # samples in the window, so a brief dip can reset the `for` timer and
      # keep the alert from ever firing.
      - alert: HostHighCpuLoad
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on {{ $labels.instance }}"
          description: "CPU load is {{ $value | printf \"%.2f\" }}%"

      # Memory: used share (1 - available/total) above 90% for 5 minutes.
      - alert: HostOutOfMemory
        for: 5m
        expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90'
        labels:
          severity: warning
        annotations:
          summary: 'Host out of memory: {{ $labels.instance }}'
          description: 'Memory usage is {{ $value | printf "%.2f" }}%'

      # Disk: used share of any non-tmpfs filesystem above 85% for 5 minutes.
      - alert: HostOutOfDiskSpace
        for: 5m
        expr: '(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85'
        labels:
          severity: warning
        annotations:
          summary: 'Disk space low on {{ $labels.instance }}'
          description: 'Disk usage is {{ $value | printf "%.2f" }}% on {{ $labels.mountpoint }}'

      # Disk trend: linear extrapolation of the last 6h of free space
      # reaches zero within the next 24 hours.
      - alert: HostDiskWillFillIn24Hours
        for: 1h
        expr: 'predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24*60*60) < 0'
        labels:
          severity: warning
        annotations:
          summary: 'Disk will fill in 24 hours on {{ $labels.instance }}'

  # Container-level alerts based on cAdvisor metrics.
  - name: containers
    rules:
      # Container down.
      # absent() only returns a result when no matching series exists at all,
      # and with a regex matcher that result has no `name` label, so it can
      # neither detect a single stopped container nor fill in the summary.
      # Comparing each container's last-seen timestamp with the current time
      # yields one alert per container instead.
      # The 60s threshold already acts as the delay, hence `for: 0m`.
      # NOTE(review): a removed container's series eventually stops being
      # exported, so a long `for:` could keep this from firing - confirm
      # against your cAdvisor settings.
      - alert: ContainerDown
        expr: time() - container_last_seen{name=~".+"} > 60
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} is down"

      # Container high CPU.
      # The alert name had been replaced by a sanitizer placeholder
      # (REDACTED_APP_PASSWORD); restored to match the sibling rules.
      - alert: ContainerHighCpu
        expr: (sum by(name) (rate(container_cpu_usage_seconds_total[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high CPU"
          description: "CPU usage is {{ $value | printf \"%.2f\" }}%"

      # Container high memory.
      # The `> 0` filter drops containers whose limit is reported as 0 (no
      # limit set); dividing by 0 would give +Inf and fire permanently.
      - alert: ContainerHighMemory
        expr: (container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0)) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory"

  # Alerts driven by blackbox probe metrics.
  - name: services
    rules:
      # SSL certificate expiring: fewer than 14 days (86400 s * 14) remain.
      - alert: SSLCertificateExpiringSoon
        expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 14
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon for {{ $labels.instance }}"
          # $value is the remaining lifetime in seconds; humanizeDuration
          # renders it as a readable duration. The function name had been
          # replaced by a sanitizer placeholder (REDACTED_APP_PASSWORD), which
          # is not a valid template function.
          description: "Certificate expires in {{ $value | humanizeDuration }}"

      # HTTP probe failed for two minutes.
      - alert: ServiceDown
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.instance }} is down"

🔔 Alertmanager Configuration

Basic Setup with ntfy

# alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m

# Routing tree: alerts are grouped by name and severity and go to `ntfy`
# unless a child route below matches first.
route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'ntfy'

  routes:
    # Critical alerts - separate receiver, repeated hourly.
    # `matchers` replaces the deprecated `match` field.
    - matchers:
        - severity="critical"
      receiver: 'ntfy-critical'
      repeat_interval: 1h

    # Warning alerts - same receiver and repeat interval as the default.
    - matchers:
        - severity="warning"
      receiver: 'ntfy'
      repeat_interval: 4h

# Receivers referenced by the routing tree; each one posts to its own ntfy
# topic and also sends a notification when the alert resolves.
# NOTE(review): an Alertmanager webhook posts a JSON document; confirm ntfy
# shows it in a usable form, or point these URLs at a bridge service that
# reformats the payload.
receivers:
  - name: 'ntfy'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-alerts'
        send_resolved: true

  - name: 'ntfy-critical'
    webhook_configs:
      - url: 'http://ntfy:80/homelab-critical'
        send_resolved: true

# Mute a warning while a critical alert with the same alertname and instance
# is firing. `source_matchers` / `target_matchers` replace the deprecated
# `source_match` / `target_match` fields.
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']

ntfy Integration Script

#!/usr/bin/env python3
# alertmanager-ntfy-bridge.py
from flask import Flask, request
import requests
import json

# Flask application that exposes the webhook endpoint.
app = Flask(__name__)

# Base URL of the ntfy server; the topic name is appended when publishing.
NTFY_URL = "http://ntfy:80"

@app.route('/webhook', methods=['POST'])
def webhook():
    """Forward each alert in an Alertmanager webhook payload to ntfy.

    Every alert becomes one message on the ``homelab-alerts`` topic. The
    title is built from the alert status and name, the body from the
    description (falling back to the summary), and critical alerts get high
    priority.

    Returns:
        ``("OK", 200)`` after all alerts were forwarded, or a 400 response
        when the request body is not a JSON object.
    """
    # silent=True returns None for a missing or malformed body instead of
    # raising, so the bad-input case is handled explicitly below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return "Invalid JSON payload", 400

    for alert in payload.get('alerts', []):
        status = alert.get('status', 'firing')
        labels = alert.get('labels', {})
        annotations = alert.get('annotations', {})

        title = f"[{status.upper()}] {labels.get('alertname', 'Alert')}"
        message = annotations.get('description', annotations.get('summary', 'No description'))

        priority = "high" if labels.get('severity') == 'critical' else "default"

        requests.post(
            f"{NTFY_URL}/homelab-alerts",
            headers={
                "Title": title,
                "Priority": priority,
                "Tags": "warning" if status == "firing" else "white_check_mark"
            },
            # Encode explicitly so text outside latin-1 does not fail when
            # the request body is sent.
            data=message.encode('utf-8'),
            # Without a timeout an unreachable ntfy server would block this
            # request handler indefinitely.
            timeout=10,
        )

    return "OK", 200

# When run directly, serve the bridge on all interfaces, port 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

📊 Grafana Dashboards

Essential Dashboards

| Dashboard | ID | Description |
|---|---|---|
| Node Exporter Full | 1860 | Complete Linux host metrics |
| Docker Containers | 893 | Container resource usage |
| Synology NAS | 14284 | Synology SNMP metrics |
| Blackbox Exporter | 7587 | HTTP/ICMP probe results |
| Prometheus Stats | 3662 | Prometheus self-monitoring |

Import Dashboards

# Import a dashboard through the Grafana HTTP API.
# Expects an API key in $GRAFANA_API_KEY and Grafana on localhost:3000.
curl --request POST \
  --header "Content-Type: application/json" \
  --header "Authorization: Bearer $GRAFANA_API_KEY" \
  --data '{
    "dashboard": {"id": null, "title": "Node Exporter Full"},
    "folderId": 0,
    "overwrite": true,
    "inputs": [{"name": "DS_PROMETHEUS", "type": "datasource", "value": "Prometheus"}]
  }' \
  http://localhost:3000/api/dashboards/import

Custom Dashboard: Homelab Overview

{
  "title": "Homelab Overview",
  "panels": [
    {
      "title": "Active Hosts",
      "type": "stat",
      "targets": [
        {
          "expr": "count(up == 1)"
        }
      ]
    },
    {
      "title": "Running Containers",
      "type": "stat",
      "targets": [
        {
          "expr": "count(container_last_seen)"
        }
      ]
    },
    {
      "title": "Total Storage Used",
      "type": "gauge",
      "targets": [
        {
          "expr": "sum(node_filesystem_size_bytes{fstype!='tmpfs'} - node_filesystem_avail_bytes{fstype!='tmpfs'})"
        }
      ]
    },
    {
      "title": "Network Traffic",
      "type": "timeseries",
      "targets": [
        {
          "expr": "sum(rate(node_network_receive_bytes_total[5m]))",
          "legendFormat": "Received"
        },
        {
          "expr": "sum(rate(node_network_transmit_bytes_total[5m]))",
          "legendFormat": "Transmitted"
        }
      ]
    }
  ]
}

🔍 Uptime Kuma Setup

Deploy Uptime Kuma

# uptime-kuma.yaml
# Single-service stack; application data lives in a named volume.
version: "3.8"
services:
  uptime-kuma:
    container_name: uptime-kuma
    image: louislam/uptime-kuma:latest
    restart: unless-stopped
    ports:
      - "3001:3001"
    volumes:
      - uptime-kuma:/app/data

# Named volume backing /app/data.
volumes:
  uptime-kuma:

Recommended monitors:

| Service | Type | URL/Target | Interval |
|---|---|---|---|
| Plex | HTTP | https://plex.vish.gg | 60s |
| Immich | HTTP | https://immich.vish.gg | 60s |
| Vaultwarden | HTTP | https://vault.vish.gg | 60s |
| Atlantis SSH | TCP Port | atlantis:22 | 120s |
| Pi-hole DNS | DNS | pihole:53 | 60s |
| Grafana | HTTP | http://grafana:3000 | 60s |

Status Page Setup

# Create public status page
# Uptime Kuma > Status Pages > Add
# Add relevant monitors
# Share URL: https://status.vish.gg

📜 Log Management with Dozzle

Deploy Dozzle

# dozzle.yaml
# Log viewer for the local Docker host, published on host port 8888.
version: "3.8"
services:
  dozzle:
    image: amir20/dozzle:latest
    container_name: dozzle
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # The `simple` auth provider reads its users from /data/users.yml, so
      # the credentials live in this directory rather than in environment
      # variables (the previous DOZZLE_USERNAME / DOZZLE_PASSWORD entries are
      # not read by this provider, and the quoted list-form password would
      # have included the literal quotes anyway).
      # NOTE(review): create ./dozzle/data/users.yml as described in the
      # Dozzle authentication docs before starting the container.
      - ./dozzle/data:/data
    ports:
      - "8888:8080"
    environment:
      DOZZLE_AUTH_PROVIDER: simple
    restart: unless-stopped

Multi-Host Log Aggregation

# For monitoring multiple Docker hosts
# Deploy a Dozzle agent on each remote host. The agent listens on port 7007
# and the main Dozzle instance connects to it, so the port has to be
# published here and the agent itself needs no remote-host setting.
# NOTE(review): on the main instance, list the agents with
# DOZZLE_REMOTE_AGENT=<agent-host>:7007 (comma-separated for several hosts);
# confirm against the Dozzle agent docs.

# dozzle-agent.yaml (on remote hosts)
version: "3.8"
services:
  dozzle-agent:
    image: amir20/dozzle:latest
    container_name: dozzle-agent
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: agent
    ports:
      - "7007:7007"
    restart: unless-stopped

📱 Mobile Monitoring

ntfy Mobile App

  1. Install ntfy app (iOS/Android)
  2. Subscribe to topics:
    • homelab-alerts - All alerts
    • homelab-critical - Critical only
  3. Configure notification settings per topic

Grafana Mobile

  1. Access Grafana via Tailscale: http://grafana.tailnet:3000
  2. Or expose via reverse proxy with authentication
  3. Create mobile-optimized dashboards

🔧 Maintenance Tasks

Weekly

  • Review alert history for false positives
  • Check disk space on Prometheus data directory
  • Verify all scraped targets are healthy

Monthly

  • Update Grafana dashboards
  • Review and tune alert thresholds
  • Clean up old Prometheus data if needed
  • Test alerting pipeline

Quarterly

  • Review monitoring coverage
  • Add monitors for new services
  • Update documentation