# Prometheus + Grafana Monitoring Stack - Portainer GitOps Version
# =============================================================================
# NOTE: The live deployment is monitoring-compose.yml (plain docker compose,
# bind-mounted configs at /home/homelab/docker/monitoring/).
# This file is the self-contained Portainer GitOps version (embedded configs).
# Stack 476 on endpoint 443399 no longer exists in Portainer.
# =============================================================================
# Ports: 9090 (Prometheus), 3300 (Grafana), 9116 (SNMP Exporter)
#
# Uses docker configs for prometheus.yml and snmp.yml since bind mounts have
# symlink issues with Portainer git deploy
#
# Dashboard Provisioning:
# - Datasources: Auto-configured Prometheus
# - Dashboards: Infrastructure Overview, Synology NAS, Node Exporter Full (from Grafana.com)
#   NOTE(review): only the first two dashboards are embedded in this file;
#   confirm how "Node Exporter Full" is expected to be provisioned.
#
# Variable escaping:
# Compose interpolates $VAR / ${VAR} inside `configs.<name>.content`, so every
# literal Grafana variable in the embedded dashboards is written with a doubled
# dollar sign ($$job, $$instance, $${datasource}, $$__all). Compose turns "$$"
# back into a single "$" before the file reaches the container.
#
# Old/deprecated configs have been moved to: archive/deprecated-monitoring-stacks/

configs:
  # Grafana Datasource Provisioning
  grafana_datasources:
    content: |
      apiVersion: 1
      datasources:
        - name: Prometheus
          type: prometheus
          access: proxy
          url: http://prometheus:9090
          isDefault: true
          editable: true

  # Grafana Dashboard Provisioning Config
  grafana_dashboards_config:
    content: |
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: 'Provisioned'
          folderUid: 'provisioned'
          type: file
          disableDeletion: false
          updateIntervalSeconds: 30
          allowUiUpdates: true
          options:
            path: /etc/grafana/provisioning/dashboards/json

  # Infrastructure Overview Dashboard
  # Template variables: "datasource" (Prometheus datasource picker) and "job"
  # (multi-select over label_values(node_uname_info, job)).
  dashboard_infrastructure:
    content: |
      {
        "uid": "infrastructure-overview-v2",
        "title": "Infrastructure Overview - All Devices",
        "tags": ["infrastructure", "node-exporter", "tailscale"],
        "timezone": "browser",
        "schemaVersion": 38,
        "version": 1,
        "refresh": "30s",
        "templating": {
          "list": [
            {
              "current": {},
              "hide": 0,
              "includeAll": false,
              "label": "Data Source",
              "multi": false,
              "name": "datasource",
              "options": [],
              "query": "prometheus",
              "refresh": 1,
              "type": "datasource"
            },
            {
              "allValue": "",
              "current": {},
              "datasource": {"type": "prometheus", "uid": "$${datasource}"},
              "definition": "label_values(node_uname_info, job)",
              "hide": 0,
              "includeAll": true,
              "label": "Host",
              "multi": true,
              "name": "job",
              "query": "label_values(node_uname_info, job)",
              "refresh": 1,
              "regex": "",
              "sort": 1,
              "type": "query"
            }
          ]
        },
        "panels": [
          {
            "id": 1,
            "type": "stat",
            "title": "Device Status",
            "gridPos": {"h": 5, "w": 24, "x": 0, "y": 0},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "mappings": [{"type": "value", "options": {"0": {"text": "DOWN", "color": "red"}, "1": {"text": "UP", "color": "green"}}}],
                "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
              }
            },
            "options": {"colorMode": "background", "textMode": "value_and_name", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "up{job=~\"$$job\"}", "legendFormat": "{{job}}", "refId": "A"}]
          },
          {
            "id": 2,
            "type": "timeseries",
            "title": "CPU Usage",
            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 5},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {"defaults": {"unit": "percent", "max": 100, "min": 0}},
            "options": {"legend": {"displayMode": "table", "placement": "right", "calcs": ["mean", "max"]}},
            "targets": [{"expr": "100 - (avg by(job) (rate(node_cpu_seconds_total{mode=\"idle\", job=~\"$$job\"}[5m])) * 100)", "legendFormat": "{{job}}", "refId": "A"}]
          },
          {
            "id": 3,
            "type": "timeseries",
            "title": "Memory Usage",
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 5},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {"defaults": {"unit": "percent", "max": 100, "min": 0}},
            "options": {"legend": {"displayMode": "table", "placement": "right", "calcs": ["mean", "max"]}},
            "targets": [{"expr": "(1 - (node_memory_MemAvailable_bytes{job=~\"$$job\"} / node_memory_MemTotal_bytes{job=~\"$$job\"})) * 100", "legendFormat": "{{job}}", "refId": "A"}]
          },
          {
            "id": 4,
            "type": "bargauge",
            "title": "Root Disk Usage",
            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 13},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "unit": "percent",
                "max": 100,
                "min": 0,
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]}
              }
            },
            "options": {"displayMode": "gradient", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "100 - ((node_filesystem_avail_bytes{job=~\"$$job\", mountpoint=\"/\", fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=~\"$$job\", mountpoint=\"/\", fstype!=\"rootfs\"}) * 100)", "legendFormat": "{{job}}", "refId": "A"}]
          },
          {
            "id": 5,
            "type": "stat",
            "title": "Uptime",
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 13},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {"defaults": {"unit": "s", "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}}},
            "options": {"colorMode": "value", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "node_time_seconds{job=~\"$$job\"} - node_boot_time_seconds{job=~\"$$job\"}", "legendFormat": "{{job}}", "refId": "A"}]
          },
          {
            "id": 6,
            "type": "timeseries",
            "title": "Network Receive",
            "gridPos": {"h": 8, "w": 12, "x": 0, "y": 21},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {"defaults": {"unit": "Bps"}},
            "options": {"legend": {"displayMode": "table", "placement": "right", "calcs": ["mean", "max"]}},
            "targets": [{"expr": "sum by(job) (rate(node_network_receive_bytes_total{job=~\"$$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))", "legendFormat": "{{job}}", "refId": "A"}]
          },
          {
            "id": 7,
            "type": "timeseries",
            "title": "Network Transmit",
            "gridPos": {"h": 8, "w": 12, "x": 12, "y": 21},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {"defaults": {"unit": "Bps"}},
            "options": {"legend": {"displayMode": "table", "placement": "right", "calcs": ["mean", "max"]}},
            "targets": [{"expr": "sum by(job) (rate(node_network_transmit_bytes_total{job=~\"$$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))", "legendFormat": "{{job}}", "refId": "A"}]
          }
        ]
      }

  # Synology NAS Monitoring Dashboard
  # Template variables: "datasource" (Prometheus datasource picker) and
  # "instance" (multi-select over label_values(diskTemperature, instance)).
  # Every panel query filters on instance=~"$$instance"; an empty selector
  # (instance=~"") matches no series because all SNMP jobs set an instance label.
  dashboard_synology:
    content: |
      {
        "id": 3,
        "uid": "synology-dashboard-v2",
        "title": "Synology NAS Monitoring",
        "tags": ["synology", "nas", "snmp"],
        "timezone": "browser",
        "schemaVersion": 38,
        "version": 4,
        "refresh": "30s",
        "templating": {
          "list": [
            {
              "current": {},
              "hide": 0,
              "includeAll": false,
              "label": "Data Source",
              "multi": false,
              "name": "datasource",
              "options": [],
              "query": "prometheus",
              "refresh": 1,
              "type": "datasource"
            },
            {
              "allValue": "",
              "current": {"text": "All", "value": "$$__all"},
              "datasource": {"type": "prometheus", "uid": "$${datasource}"},
              "definition": "label_values(diskTemperature, instance)",
              "hide": 0,
              "includeAll": true,
              "label": "NAS",
              "multi": true,
              "name": "instance",
              "query": "label_values(diskTemperature, instance)",
              "refresh": 1,
              "regex": "",
              "sort": 1,
              "type": "query"
            }
          ]
        },
        "panels": [
          {
            "id": 1,
            "type": "stat",
            "title": "NAS Status",
            "gridPos": {"h": 4, "w": 24, "x": 0, "y": 0},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "mappings": [{"type": "value", "options": {"1": {"color": "green", "text": "Normal"}, "2": {"color": "red", "text": "Failed"}}}],
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "red", "value": 2}]}
              }
            },
            "options": {"colorMode": "background", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
            "targets": [{"expr": "systemStatus{instance=~\"$$instance\"}", "legendFormat": "{{instance}}", "refId": "A"}]
          },
          {
            "id": 2,
            "type": "gauge",
            "title": "Temperature",
            "gridPos": {"h": 6, "w": 8, "x": 0, "y": 4},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "max": 80,
                "min": 0,
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 50}, {"color": "red", "value": 65}]},
                "unit": "celsius"
              }
            },
            "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "temperature{instance=~\"$$instance\"}", "legendFormat": "{{instance}}", "refId": "A"}]
          },
          {
            "id": 3,
            "type": "gauge",
            "title": "Memory Usage",
            "gridPos": {"h": 6, "w": 8, "x": 8, "y": 4},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "max": 100,
                "min": 0,
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 90}]},
                "unit": "percent"
              }
            },
            "options": {"reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "((memTotalReal{instance=~\"$$instance\"} - memAvailReal{instance=~\"$$instance\"}) / memTotalReal{instance=~\"$$instance\"}) * 100", "legendFormat": "{{instance}}", "refId": "A"}]
          },
          {
            "id": 4,
            "type": "stat",
            "title": "Total Memory",
            "gridPos": {"h": 6, "w": 8, "x": 16, "y": 4},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "thresholds": {"mode": "absolute", "steps": [{"color": "blue", "value": null}]},
                "unit": "decbytes"
              }
            },
            "options": {"colorMode": "value", "graphMode": "none", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "memTotalReal{instance=~\"$$instance\"} * 1024", "legendFormat": "{{instance}}", "refId": "A"}]
          },
          {
            "id": 5,
            "type": "stat",
            "title": "Disk Temperature",
            "gridPos": {"h": 6, "w": 12, "x": 0, "y": 10},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 40}, {"color": "red", "value": 50}]},
                "unit": "celsius"
              }
            },
            "options": {"colorMode": "value", "graphMode": "area", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "diskTemperature{instance=~\"$$instance\"}", "legendFormat": "{{instance}} - Disk {{diskIndex}}", "refId": "A"}]
          },
          {
            "id": 6,
            "type": "stat",
            "title": "RAID Status",
            "gridPos": {"h": 6, "w": 12, "x": 12, "y": 10},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "mappings": [
                  {
                    "type": "value",
                    "options": {
                      "1": {"color": "green", "text": "Normal"},
                      "11": {"color": "orange", "text": "Degraded"},
                      "12": {"color": "red", "text": "Crashed"},
                      "2": {"color": "yellow", "text": "Repairing"},
                      "3": {"color": "yellow", "text": "Migrating"},
                      "4": {"color": "yellow", "text": "Expanding"},
                      "5": {"color": "orange", "text": "Deleting"},
                      "6": {"color": "blue", "text": "Creating"}
                    }
                  }
                ],
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
              }
            },
            "options": {"colorMode": "background", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}, "textMode": "value_and_name"},
            "targets": [{"expr": "raidStatus{instance=~\"$$instance\"}", "legendFormat": "{{instance}} - {{raidIndex}}", "refId": "A"}]
          },
          {
            "id": 7,
            "type": "bargauge",
            "title": "RAID Usage",
            "gridPos": {"h": 8, "w": 24, "x": 0, "y": 16},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "max": 100,
                "min": 0,
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
                "unit": "percent"
              }
            },
            "options": {"displayMode": "gradient", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "((raidTotalSize{instance=~\"$$instance\"} - raidFreeSize{instance=~\"$$instance\"}) / raidTotalSize{instance=~\"$$instance\"}) * 100", "legendFormat": "{{instance}} - RAID {{raidIndex}}", "refId": "A"}]
          },
          {
            "id": 8,
            "type": "stat",
            "title": "Uptime",
            "gridPos": {"h": 4, "w": 24, "x": 0, "y": 24},
            "datasource": {"type": "prometheus", "uid": "$${datasource}"},
            "fieldConfig": {
              "defaults": {
                "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
                "unit": "dtdurations"
              }
            },
            "options": {"colorMode": "value", "orientation": "horizontal", "reduceOptions": {"calcs": ["lastNotNull"]}},
            "targets": [{"expr": "sysUpTime{instance=~\"$$instance\"} / 100", "legendFormat": "{{instance}}", "refId": "A"}]
          }
        ]
      }

  # Prometheus scrape configuration (mounted as /etc/prometheus/prometheus.yml).
  # Each "*-node" job scrapes a node_exporter on :9100 and pins the instance
  # label to a friendly host name. Each "*-snmp" job scrapes an snmp_exporter
  # on :9116 with module=synology / auth=snmpv3 and target 127.0.0.1.
  # NOTE(review): rule_files points at /etc/prometheus/alert-rules.yml and
  # alerting targets alertmanager:9093, but this stack mounts no rules file and
  # defines no alertmanager service - confirm whether they are provided elsewhere.
  prometheus_config:
    content: |
      global:
        scrape_interval: 15s
        evaluation_interval: 15s

      alerting:
        alertmanagers:
          - static_configs:
              - targets:
                  - alertmanager:9093

      rule_files:
        - /etc/prometheus/alert-rules.yml

      scrape_configs:
        - job_name: 'prometheus'
          static_configs:
            - targets: ['localhost:9090']

        - job_name: 'node_exporter'
          static_configs:
            - targets: ['host.docker.internal:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'homelab-vm'

        - job_name: 'homelab-node'
          static_configs:
            - targets: ['100.67.40.126:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'homelab-vm'

        - job_name: 'raspberry-pis'
          static_configs:
            - targets: ['100.77.151.40:9100']
          # pi-5-kevin (100.123.246.75) removed - offline 127+ days
          relabel_configs:
            - target_label: instance
              replacement: 'pi-5'

        - job_name: 'setillo-node'
          static_configs:
            - targets: ['100.125.0.20:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'setillo'

        - job_name: 'setillo-snmp'
          metrics_path: /snmp
          params:
            module: [synology]
            auth: [snmpv3]
            target: ['127.0.0.1']
          static_configs:
            - targets: ['100.125.0.20:9116']
          relabel_configs:
            - source_labels: [__address__]
              target_label: __param_target
              replacement: '127.0.0.1'
            - source_labels: [__param_target]
              target_label: instance
              replacement: 'setillo'
            - target_label: __address__
              replacement: '100.125.0.20:9116'

        - job_name: 'calypso-node'
          static_configs:
            - targets: ['100.103.48.78:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'calypso'

        - job_name: 'calypso-snmp'
          metrics_path: /snmp
          params:
            module: [synology]
            auth: [snmpv3]
            target: ['127.0.0.1']
          static_configs:
            - targets: ['100.103.48.78:9116']
          relabel_configs:
            - source_labels: [__address__]
              target_label: __param_target
              replacement: '127.0.0.1'
            - source_labels: [__param_target]
              target_label: instance
              replacement: 'calypso'
            - target_label: __address__
              replacement: '100.103.48.78:9116'

        - job_name: 'atlantis-node'
          static_configs:
            - targets: ['100.83.230.112:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'atlantis'

        - job_name: 'atlantis-snmp'
          metrics_path: /snmp
          params:
            module: [synology]
            auth: [snmpv3]
            target: ['127.0.0.1']
          static_configs:
            - targets: ['100.83.230.112:9116']
          relabel_configs:
            - source_labels: [__address__]
              target_label: __param_target
              replacement: '127.0.0.1'
            - source_labels: [__param_target]
              target_label: instance
              replacement: 'atlantis'
            - target_label: __address__
              replacement: '100.83.230.112:9116'

        - job_name: 'concord-nuc-node'
          static_configs:
            - targets: ['100.72.55.21:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'concord-nuc'

        - job_name: 'truenas-node'
          static_configs:
            - targets: ['100.75.252.64:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'guava'

        - job_name: 'seattle-node'
          static_configs:
            - targets: ['100.82.197.124:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'seattle'

        - job_name: 'proxmox-node'
          static_configs:
            - targets: ['100.87.12.28:9100']
          relabel_configs:
            - target_label: instance
              replacement: 'proxmox'

  # snmp_exporter configuration (mounted as /etc/snmp_exporter/snmp.yml).
  # Defines the "snmpv3" auth profile and the "synology" module whose metric
  # names (systemStatus, temperature, diskTemperature, raidStatus, ...) are the
  # ones queried by the Synology dashboard above.
  snmp_config:
    content: |
      auths:
        snmpv3:
          version: 3
          security_level: authPriv
          auth_protocol: MD5
          username: snmp-exporter
          password: "REDACTED_PASSWORD"
          priv_protocol: DES
          priv_password: "REDACTED_PASSWORD"

      modules:
        synology:
          walk:
            - 1.3.6.1.2.1.1
            - 1.3.6.1.2.1.2
            - 1.3.6.1.2.1.25.2
            - 1.3.6.1.2.1.25.3.3
            - 1.3.6.1.2.1.31.1.1
            - 1.3.6.1.4.1.2021.4
            - 1.3.6.1.4.1.2021.10
            - 1.3.6.1.4.1.2021.11
            - 1.3.6.1.4.1.6574.1
            - 1.3.6.1.4.1.6574.2
            - 1.3.6.1.4.1.6574.3
            - 1.3.6.1.4.1.6574.4
            - 1.3.6.1.4.1.6574.5
            - 1.3.6.1.4.1.6574.6
            - 1.3.6.1.4.1.6574.101
            - 1.3.6.1.4.1.6574.102
          metrics:
            - name: sysDescr
              oid: 1.3.6.1.2.1.1.1
              type: DisplayString
            - name: sysUpTime
              oid: 1.3.6.1.2.1.1.3
              type: gauge
            - name: sysName
              oid: 1.3.6.1.2.1.1.5
              type: DisplayString
            - name: ssCpuRawUser
              oid: 1.3.6.1.4.1.2021.11.50
              type: counter
            - name: ssCpuRawSystem
              oid: 1.3.6.1.4.1.2021.11.52
              type: counter
            - name: ssCpuRawIdle
              oid: 1.3.6.1.4.1.2021.11.53
              type: counter
            - name: memTotalSwap
              oid: 1.3.6.1.4.1.2021.4.3
              type: gauge
            - name: memAvailSwap
              oid: 1.3.6.1.4.1.2021.4.4
              type: gauge
            - name: memTotalReal
              oid: 1.3.6.1.4.1.2021.4.5
              type: gauge
            - name: memAvailReal
              oid: 1.3.6.1.4.1.2021.4.6
              type: gauge
            - name: systemStatus
              oid: 1.3.6.1.4.1.6574.1.1
              type: gauge
            - name: temperature
              oid: 1.3.6.1.4.1.6574.1.2
              type: gauge
            - name: powerStatus
              oid: 1.3.6.1.4.1.6574.1.3
              type: gauge
            - name: modelName
              oid: 1.3.6.1.4.1.6574.1.5.1
              type: DisplayString
            - name: version
              oid: 1.3.6.1.4.1.6574.1.5.3
              type: DisplayString
            - name: diskID
              oid: 1.3.6.1.4.1.6574.2.1.1.2
              type: DisplayString
              indexes:
                - labelname: diskIndex
                  type: gauge
            - name: diskStatus
              oid: 1.3.6.1.4.1.6574.2.1.1.5
              type: gauge
              indexes:
                - labelname: diskIndex
                  type: gauge
            - name: diskTemperature
              oid: 1.3.6.1.4.1.6574.2.1.1.6
              type: gauge
              indexes:
                - labelname: diskIndex
                  type: gauge
            - name: raidName
              oid: 1.3.6.1.4.1.6574.3.1.1.2
              type: DisplayString
              indexes:
                - labelname: raidIndex
                  type: gauge
            - name: raidStatus
              oid: 1.3.6.1.4.1.6574.3.1.1.3
              type: gauge
              indexes:
                - labelname: raidIndex
                  type: gauge
            - name: raidFreeSize
              oid: 1.3.6.1.4.1.6574.3.1.1.4
              type: gauge
              indexes:
                - labelname: raidIndex
                  type: gauge
            - name: raidTotalSize
              oid: 1.3.6.1.4.1.6574.3.1.1.5
              type: gauge
              indexes:
                - labelname: raidIndex
                  type: gauge

services:
  # Prometheus server; reads the embedded prometheus_config and stores TSDB
  # data in the prometheus-data volume.
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    configs:
      - source: prometheus_config
        target: /etc/prometheus/prometheus.yml
    volumes:
      - prometheus-data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.enable-lifecycle"
    ports:
      - "9090:9090"
    restart: unless-stopped
    networks:
      - monitoring
    # Lets the 'node_exporter' scrape job reach the host-networked exporter
    # through host.docker.internal.
    extra_hosts:
      - "host.docker.internal:host-gateway"

  # Grafana with provisioned datasource + dashboards and Authentik SSO.
  grafana:
    image: grafana/grafana-oss:12.4.0
    container_name: grafana
    # NOTE(review): in list-form environment entries the double quotes around
    # the redacted values below are not YAML quoting and may be passed to the
    # container as part of the value - confirm the real values are unquoted.
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD="REDACTED_PASSWORD"
      # Disable Grafana 12 unified storage feature to restore home dashboard env var support
      - GF_FEATURE_TOGGLES_DISABLE=kubernetesDashboards
      # Authentik OAuth2 SSO Configuration
      - GF_AUTH_GENERIC_OAUTH_ENABLED=true
      - GF_AUTH_GENERIC_OAUTH_NAME=Authentik
      - GF_AUTH_GENERIC_OAUTH_CLIENT_ID="REDACTED_CLIENT_ID"
      - GF_AUTH_GENERIC_OAUTH_CLIENT_SECRET="REDACTED_CLIENT_SECRET"
      - GF_AUTH_GENERIC_OAUTH_SCOPES=openid profile email
      - GF_AUTH_GENERIC_OAUTH_AUTH_URL=https://sso.vish.gg/application/o/authorize/
      - GF_AUTH_GENERIC_OAUTH_TOKEN_URL=https://sso.vish.gg/application/o/token/
      - GF_AUTH_GENERIC_OAUTH_API_URL=https://sso.vish.gg/application/o/userinfo/
      - GF_AUTH_SIGNOUT_REDIRECT_URL=https://sso.vish.gg/application/o/grafana/end-session/
      - GF_AUTH_GENERIC_OAUTH_ROLE_ATTRIBUTE_PATH=contains(groups[*], 'Grafana Admins') && 'Admin' || contains(groups[*], 'Grafana Editors') && 'Editor' || 'Viewer'
      # Required for Authentik - extract email and login from userinfo response
      - GF_AUTH_GENERIC_OAUTH_EMAIL_ATTRIBUTE_PATH=email
      - GF_AUTH_GENERIC_OAUTH_LOGIN_ATTRIBUTE_PATH=preferred_username
      - GF_AUTH_GENERIC_OAUTH_NAME_ATTRIBUTE_PATH=name
      - GF_SERVER_ROOT_URL=https://gf.vish.gg
      # Home dashboard is set via org preferences in Grafana DB (node-details-v2)
      # GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH is not used - home is DB-persisted via API
    configs:
      # Datasource provisioning
      - source: grafana_datasources
        target: /etc/grafana/provisioning/datasources/datasources.yaml
      # Dashboard provider config
      - source: grafana_dashboards_config
        target: /etc/grafana/provisioning/dashboards/dashboards.yaml
      # Dashboard JSON files
      - source: dashboard_infrastructure
        target: /etc/grafana/provisioning/dashboards/json/infrastructure-overview.json
      - source: dashboard_synology
        target: /etc/grafana/provisioning/dashboards/json/synology-monitoring.json
    volumes:
      - grafana-data:/var/lib/grafana
    ports:
      - "3300:3000"
    restart: unless-stopped
    depends_on:
      - prometheus
    networks:
      - monitoring

  # Host-level node_exporter; runs in the host network and PID namespaces with
  # the host root filesystem mounted read-only at /host.
  node_exporter:
    image: prom/node-exporter:latest
    container_name: node_exporter
    network_mode: host
    pid: host
    volumes:
      - /:/host:ro,rslave
      - /sys:/host/sys:ro
      - /proc:/host/proc:ro
    command:
      - '--path.rootfs=/host'
    restart: unless-stopped

  # SNMP exporter using the embedded snmp_config.
  snmp_exporter:
    image: prom/snmp-exporter:latest
    container_name: snmp_exporter
    configs:
      - source: snmp_config
        target: /etc/snmp_exporter/snmp.yml
    ports:
      - "9116:9116"
    restart: unless-stopped
    networks:
      - monitoring

volumes:
  prometheus-data:
  grafana-data:

networks:
  monitoring:
    driver: bridge