Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC
This commit is contained in:
40
archive/deprecated-monitoring-stacks/README.md
Normal file
40
archive/deprecated-monitoring-stacks/README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
# Deprecated Monitoring Stacks
|
||||
|
||||
These monitoring configurations are **DEPRECATED** and should not be used.
|
||||
|
||||
## Current Working Stack
|
||||
|
||||
The current working monitoring stack is located at:
|
||||
- **`homelab_vm/monitoring.yaml`**
|
||||
|
||||
This stack is deployed via Portainer GitOps to the homelab-vm and includes:
|
||||
- Prometheus with all scrape targets
|
||||
- Grafana
|
||||
- Node Exporter
|
||||
- SNMP Exporter for Synology NAS devices
|
||||
|
||||
## Archived Configurations
|
||||
|
||||
The following directories contain deprecated monitoring configurations that were in use before the consolidated stack replaced them:
|
||||
|
||||
### `prometheus_grafana_hub/`
|
||||
Old monitoring hub setup with separate docker-compose files for each host.
|
||||
- Used bind mounts which caused issues with Portainer git deploy
|
||||
- Had separate compose files for each Synology NAS
|
||||
- **Status: DEPRECATED** - Replaced by `homelab_vm/monitoring.yaml`
|
||||
|
||||
### `stacks-monitoring/`
|
||||
An earlier attempt at a monitoring stack, kept here for reference only.
|
||||
- Used separate directories for prometheus and grafana configs
|
||||
- **Status: DEPRECATED** - Replaced by `homelab_vm/monitoring.yaml`
|
||||
|
||||
### `prometheus/`
|
||||
Standalone prometheus config directory.
|
||||
- **Status: DEPRECATED** - Config is now embedded in `homelab_vm/monitoring.yaml`
|
||||
|
||||
### `grafana/`
|
||||
Standalone grafana provisioning configs.
|
||||
- **Status: DEPRECATED** - Dashboards are now managed directly in Grafana
|
||||
|
||||
## Migration Date
|
||||
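Archived on: _date not recorded_ (the `$(date +%Y-%m-%d)` shell placeholder was never expanded when this file was generated; see the commit history of this directory for the actual archive date)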
|
||||
@@ -0,0 +1,366 @@
|
||||
{
|
||||
"uid": "infrastructure-overview-v2",
|
||||
"title": "Infrastructure Overview - All Devices",
|
||||
"tags": [
|
||||
"infrastructure",
|
||||
"node-exporter",
|
||||
"tailscale"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": "",
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info, job)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Host",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(node_uname_info, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Device Status",
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"textMode": "value_and_name",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by(job) (rate(node_cpu_seconds_total{mode=\"idle\", job=~\"$job\"}[5m])) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes{job=~\"$job\"} / node_memory_MemTotal_bytes{job=~\"$job\"})) * 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=~\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=~\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time_seconds{job=~\"$job\"} - node_boot_time_seconds{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Network Receive",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(job) (rate(node_network_receive_bytes_total{job=~\"$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "Network Transmit",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(job) (rate(node_network_transmit_bytes_total{job=~\"$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,936 @@
|
||||
{
|
||||
"uid": "node-details-v2",
|
||||
"title": "Node Details - Full Metrics",
|
||||
"tags": [
|
||||
"node-exporter",
|
||||
"detailed",
|
||||
"infrastructure"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "prometheus",
|
||||
"value": "prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info, job)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Host",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"options": [],
|
||||
"query": "label_values(node_uname_info, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info{job=\"$job\"}, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": "label_values(node_uname_info{job=\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udcca Quick Stats",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time_seconds{job=\"$job\",instance=\"$instance\"} - node_boot_time_seconds{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Uptime",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "CPU Cores",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 4,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"})",
|
||||
"legendFormat": "Cores",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Total RAM",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 7,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "purple",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "RAM",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "gauge",
|
||||
"title": "CPU",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 10,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "CPU",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "gauge",
|
||||
"title": "Memory",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 13,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"} / node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"})) * 100",
|
||||
"legendFormat": "Memory",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "gauge",
|
||||
"title": "Disk /",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 16,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)",
|
||||
"legendFormat": "Disk",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Load 1m",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 2,
|
||||
"x": 19,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "1m",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "stat",
|
||||
"title": "Load 5m",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 2,
|
||||
"x": 21,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load5{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "5m",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udda5\ufe0f CPU Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage Breakdown",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"fillOpacity": 50,
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"user\"}[5m])) * 100",
|
||||
"legendFormat": "User",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"system\"}[5m])) * 100",
|
||||
"legendFormat": "System",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"iowait\"}[5m])) * 100",
|
||||
"legendFormat": "IOWait",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"steal\"}[5m])) * 100",
|
||||
"legendFormat": "Steal",
|
||||
"refId": "D"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Per Core",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[5m]) * 100)",
|
||||
"legendFormat": "CPU {{cpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "row",
|
||||
"title": "\ud83e\udde0 Memory Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 15
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"fillOpacity": 30,
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"} - node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Used",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Buffers_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Buffers",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Cached_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Cached",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_MemFree_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Free",
|
||||
"refId": "D"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "timeseries",
|
||||
"title": "Swap Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 15
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_SwapTotal_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Total",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_SwapTotal_bytes{job=\"$job\",instance=\"$instance\"} - node_memory_SwapFree_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Used",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udcbe Disk Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 23
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "bargauge",
|
||||
"title": "Disk Space Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|overlay|squashfs\"}) * 100)",
|
||||
"legendFormat": "{{mountpoint}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "timeseries",
|
||||
"title": "Disk I/O",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*Write.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.transform",
|
||||
"value": "negative-Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|dm-.*\"}[5m])",
|
||||
"legendFormat": "{{device}} Read",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|dm-.*\"}[5m])",
|
||||
"legendFormat": "{{device}} Write",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"type": "row",
|
||||
"title": "\ud83c\udf10 Network Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 33
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*TX.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.transform",
|
||||
"value": "negative-Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m]) * 8",
|
||||
"legendFormat": "{{device}} RX",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m]) * 8",
|
||||
"legendFormat": "{{device}} TX",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Network Errors",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 33
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_errs_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m])",
|
||||
"legendFormat": "{{device}} RX Errors",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_errs_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m])",
|
||||
"legendFormat": "{{device}} TX Errors",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"id": null
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,351 @@
|
||||
{
|
||||
"uid": "synology-dashboard-v2",
|
||||
"title": "Synology NAS Monitoring",
|
||||
"tags": [
|
||||
"synology",
|
||||
"nas",
|
||||
"snmp"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": "",
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(ssCpuRawIdle, job)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "NAS",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(ssCpuRawIdle, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "NAS Status",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"textMode": "value_and_name",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((ssCpuRawIdle{job=~\"$job\"} / (ssCpuRawUser{job=~\"$job\"} + ssCpuRawSystem{job=~\"$job\"} + ssCpuRawIdle{job=~\"$job\"} + ssCpuRawWait{job=~\"$job\"})) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((memTotalReal{job=~\"$job\"} - memAvailReal{job=~\"$job\"}) / memTotalReal{job=~\"$job\"}) * 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Total Memory",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "decbytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "memTotalReal{job=~\"$job\"} * 1024",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Load Average",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"1\"}",
|
||||
"legendFormat": "{{job}} 1m",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"2\"}",
|
||||
"legendFormat": "{{job}} 5m",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"3\"}",
|
||||
"legendFormat": "{{job}} 15m",
|
||||
"refId": "C"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sysUpTime{job=~\"$job\"} / 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Homelab Dashboards'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
folderUid: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 30
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/dashboards
|
||||
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
@@ -0,0 +1,98 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["prometheus:9090"]
|
||||
|
||||
- job_name: "homelab-node"
|
||||
static_configs:
|
||||
- targets: ["100.67.40.126:9100"]
|
||||
|
||||
- job_name: "raspberry-pis"
|
||||
static_configs:
|
||||
- targets: ["100.77.151.40:9100"] # pi-5
|
||||
- targets: ["100.123.246.75:9100"] # pi-5-kevin
|
||||
|
||||
- job_name: "setillo-node"
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9100"]
|
||||
|
||||
- job_name: "setillo-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.125.0.20"
|
||||
- target_label: __address__
|
||||
replacement: "100.125.0.20:9116"
|
||||
|
||||
- job_name: "calypso-node"
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9100"]
|
||||
|
||||
- job_name: "calypso-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.103.48.78"
|
||||
- target_label: __address__
|
||||
replacement: "100.103.48.78:9116"
|
||||
|
||||
- job_name: "atlantis-node"
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9100"]
|
||||
|
||||
- job_name: "atlantis-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.83.230.112"
|
||||
- target_label: __address__
|
||||
replacement: "100.83.230.112:9116"
|
||||
|
||||
- job_name: "concord-nuc-node"
|
||||
static_configs:
|
||||
- targets: ["100.72.55.21:9100"]
|
||||
|
||||
- job_name: "truenas-node"
|
||||
static_configs:
|
||||
- targets: ["100.75.252.64:9100"]
|
||||
|
||||
- job_name: "vmi2076105-node"
|
||||
static_configs:
|
||||
- targets: ["100.99.156.20:9100"]
|
||||
|
||||
- job_name: "proxmox-node"
|
||||
static_configs:
|
||||
- targets: ["100.87.12.28:9100"]
|
||||
@@ -0,0 +1,11 @@
|
||||
FROM golang:1.23 AS build
|
||||
|
||||
WORKDIR /app
|
||||
RUN git clone https://github.com/kradalby/truenas_exporter.git .
|
||||
RUN go build -o truenas_exporter .
|
||||
|
||||
FROM debian:stable-slim
|
||||
WORKDIR /root/
|
||||
COPY --from=build /app/truenas_exporter .
|
||||
EXPOSE 9163
|
||||
ENTRYPOINT ["./truenas_exporter"]
|
||||
@@ -0,0 +1,83 @@
|
||||
# Prometheus & Grafana Monitoring Hub
|
||||
|
||||
This folder contains the configuration for the centralized monitoring stack running on the Homelab VM.
|
||||
|
||||
## Folder Structure
|
||||
|
||||
```
|
||||
prometheus_grafana_hub/
|
||||
├── dashboards/ # Grafana dashboard JSON files
|
||||
│ ├── infrastructure-overview.json # Fleet-wide status of all devices
|
||||
│ ├── node-details.json # Detailed per-host metrics
|
||||
│ ├── synology-monitoring.json # Synology NAS SNMP metrics
|
||||
│ └── node-exporter.json # Full Node Exporter dashboard
|
||||
├── snmp-configs/ # SNMP Exporter configurations
|
||||
│ └── snmp_synology.yml # Synology NAS SNMP config
|
||||
├── docker-compose/ # Docker compose files for remote hosts
|
||||
│ ├── atlantis-docker-compose.yml
|
||||
│ ├── calypso-docker-compose.yml
|
||||
│ ├── setillo-docker-compose.yml
|
||||
│ ├── concord-nuc-docker-compose.yml
|
||||
│ └── guava-docker-compose-node-exporter.yml
|
||||
├── docker-compose.homelab-vm.yml # Main stack compose (Homelab VM)
|
||||
├── prometheus.yml # Prometheus scrape configuration
|
||||
├── Dockerfile # Custom Prometheus image (if needed)
|
||||
└── README.md
|
||||
```
|
||||
|
||||
## Dashboards
|
||||
|
||||
| Dashboard | UID | Description |
|
||||
|-----------|-----|-------------|
|
||||
| Infrastructure Overview | `infrastructure-overview-v2` | Fleet status, CPU, Memory, Disk, Network for all hosts |
|
||||
| Node Details | `node-details-v2` | Per-REDACTED_APP_PASSWORD CPU breakdown, per-core usage, memory details, disk I/O |
|
||||
| Synology Monitoring | `synology-dashboard-v2` | Synology NAS CPU, Memory, Load, Uptime via SNMP |
|
||||
| Node Exporter Full | `rYdddlPWk` | Comprehensive node exporter metrics |
|
||||
|
||||
## SNMP Configuration
|
||||
|
||||
The `snmp_synology.yml` config is deployed to each Synology NAS at:
|
||||
- **Atlantis**: `/volume2/metadata/docker/snmp/snmp.yml`
|
||||
- **Calypso**: `/volume1/docker/snmp/snmp.yml`
|
||||
- **Setillo**: `/volume1/docker/snmp/snmp.yml`
|
||||
|
||||
## Monitored Hosts
|
||||
|
||||
### Node Exporter Targets
|
||||
- homelab-node (100.67.40.126:9100)
|
||||
- atlantis-node (100.83.230.112:9100)
|
||||
- calypso-node (100.103.48.78:9100)
|
||||
- setillo-node (100.125.0.20:9100)
|
||||
- concord-nuc-node (100.72.55.21:9100)
|
||||
- proxmox-node (100.87.12.28:9100)
|
||||
- truenas-node (100.75.252.64:9100)
|
||||
- raspberry-pis (100.77.151.40:9100, 100.123.246.75:9100)
|
||||
|
||||
### SNMP Targets (Synology)
|
||||
- atlantis-snmp (100.83.230.112)
|
||||
- calypso-snmp (100.103.48.78)
|
||||
- setillo-snmp (100.125.0.20)
|
||||
|
||||
## Deployment
|
||||
|
||||
### Homelab VM (Main Stack)
|
||||
|
||||
The main monitoring stack runs on Homelab VM:
|
||||
```bash
|
||||
cd ~/docker/monitoring
|
||||
|
||||
# Using the compose file from this repo:
|
||||
docker-compose -f docker-compose.homelab-vm.yml up -d
|
||||
|
||||
# Or if already deployed:
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
**Services:**
|
||||
- **Grafana**: http://homelab:3300 (admin / set via GF_SECURITY_ADMIN_PASSWORD)
|
||||
- **Prometheus**: http://homelab:9090
|
||||
- **Node Exporter**: Runs in host network mode on port 9100
|
||||
|
||||
### Remote Hosts
|
||||
|
||||
Each remote host runs node-exporter and/or snmp-exporter as specified in the `docker-compose/` folder.
|
||||
@@ -0,0 +1,135 @@
|
||||
# Homelab Alerting Stack
|
||||
|
||||
This adds Prometheus Alertmanager with notifications to both **ntfy** and **Signal**.
|
||||
|
||||
## Components
|
||||
|
||||
| Component | Purpose | Port |
|
||||
|-----------|---------|------|
|
||||
| Alertmanager | Routes alerts based on severity | 9093 |
|
||||
| Signal Bridge | Forwards critical alerts to Signal | 5000 |
|
||||
|
||||
## Alert Routing
|
||||
|
||||
- **Warning alerts** → ntfy only (`homelab-alerts` topic)
|
||||
- **Critical alerts** → Both ntfy AND Signal
|
||||
|
||||
## Deployment Steps
|
||||
|
||||
### 1. Update your phone number
|
||||
|
||||
Edit `docker-compose.alerting.yml` and replace `REPLACE_WITH_YOUR_NUMBER`:
|
||||
|
||||
```yaml
|
||||
environment:
|
||||
- SIGNAL_SENDER=+REDACTED_PHONE_NUMBER # Your Signal number
|
||||
- SIGNAL_RECIPIENTS=+REDACTED_PHONE_NUMBER # Where to send alerts
|
||||
```
|
||||
|
||||
### 2. Copy files to Homelab VM
|
||||
|
||||
```bash
|
||||
# On your local machine or wherever you have SSH access
|
||||
scp -r alerting-configs/* homelab@192.168.0.210:~/docker/monitoring/
|
||||
```
|
||||
|
||||
### 3. Update Prometheus config
|
||||
|
||||
Replace the existing `prometheus.yml` with `prometheus-updated.yml`:
|
||||
|
||||
```bash
|
||||
cd ~/docker/monitoring
|
||||
cp prometheus-updated.yml prometheus/prometheus.yml
|
||||
cp alert-rules.yml prometheus/alert-rules.yml
|
||||
```
|
||||
|
||||
### 4. Create alertmanager directory
|
||||
|
||||
```bash
|
||||
mkdir -p alertmanager
|
||||
cp alertmanager.yml alertmanager/
|
||||
```
|
||||
|
||||
### 5. Deploy the alerting stack
|
||||
|
||||
```bash
|
||||
# Build and start alertmanager + signal bridge
|
||||
docker-compose -f docker-compose.alerting.yml up -d --build
|
||||
|
||||
# Reload Prometheus to pick up new config
|
||||
curl -X POST http://localhost:9090/-/reload
|
||||
```
|
||||
|
||||
### 6. Verify deployment
|
||||
|
||||
```bash
|
||||
# Check Alertmanager is running
|
||||
curl http://localhost:9093/-/healthy
|
||||
|
||||
# Check Signal Bridge is running
|
||||
curl http://localhost:5000/health
|
||||
|
||||
# Send test alert to Signal
|
||||
curl -X POST http://localhost:5000/test \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"message": "🧪 Test alert from Homelab!"}'
|
||||
|
||||
# Send test notification to ntfy
|
||||
curl -d "Test alert from Alertmanager setup" https://ntfy.vish.gg/REDACTED_NTFY_TOPIC
|
||||
```
|
||||
|
||||
## Alert Rules Included
|
||||
|
||||
| Alert | Severity | Trigger |
|
||||
|-------|----------|---------|
|
||||
| HostDown | Critical | Host unreachable for 2 min |
|
||||
| HostHighCpuUsage | Warning | CPU > 80% for 5 min |
|
||||
| HostCriticalCpuUsage | Critical | CPU > 95% for 5 min |
|
||||
| HostHighMemoryUsage | Warning | Memory > 85% for 5 min |
|
||||
| HostCriticalMemoryUsage | Critical | Memory > 95% for 5 min |
|
||||
| HostOutOfMemory | Critical | Memory < 5% available |
|
||||
| HostHighDiskUsage | Warning | Disk > 80% full |
|
||||
| HostCriticalDiskUsage | Critical | Disk > 90% full |
|
||||
| HostDiskWillFillIn24Hours | Warning | Predicted to fill in 24h |
|
||||
| HostFilesystemReadOnly | Critical | Filesystem became read-only |
|
||||
| HostNetworkReceiveErrors / HostNetworkTransmitErrors | Warning | More than 10 receive or transmit errors/sec for 5 min |
|
||||
| HostClockSkew | Warning | Time drift > 0.5 seconds |
|
||||
|
||||
## Receiving Alerts
|
||||
|
||||
### ntfy App
|
||||
1. Install ntfy app on your phone (iOS/Android)
|
||||
2. Add server: `https://ntfy.vish.gg`
|
||||
3. Subscribe to topic: `homelab-alerts`
|
||||
|
||||
### Signal
|
||||
- Alerts will arrive as regular Signal messages from your registered number
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Check Alertmanager status
|
||||
```bash
|
||||
docker logs alertmanager
|
||||
curl http://localhost:9093/api/v2/status
|
||||
```
|
||||
|
||||
### Check active alerts
|
||||
```bash
|
||||
curl http://localhost:9093/api/v2/alerts
|
||||
```
|
||||
|
||||
### Check Signal Bridge logs
|
||||
```bash
|
||||
docker logs signal-bridge
|
||||
```
|
||||
|
||||
### Manually trigger test alert in Prometheus
|
||||
Add this rule temporarily to test:
|
||||
```yaml
|
||||
- alert: TestAlert
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Test alert"
|
||||
```
|
||||
@@ -0,0 +1,146 @@
|
||||
# Prometheus Alerting Rules for Homelab Infrastructure
|
||||
|
||||
groups:
|
||||
- name: host-availability
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HostDown
|
||||
expr: up{job=~".*-node"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Host {{ $labels.instance }} is down"
|
||||
description: "Host {{ $labels.instance }} has been unreachable for more than 2 minutes."
|
||||
|
||||
- alert: HostHighLoadAverage
|
||||
expr: node_load15 / count without(cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High load average on {{ $labels.instance }}"
|
||||
description: "15-minute load average is {{ $value | printf \"%.2f\" }} on {{ $labels.instance }}."
|
||||
|
||||
- name: cpu-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# Warning-level CPU alert: fires when the average non-idle CPU share per
# instance stays above 80% for 5 minutes. The alert name must be unique and
# descriptive, because it appears in notification titles.
- alert: HostHighCpuUsage
  expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High CPU usage on {{ $labels.instance }}"
    description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostCriticalCpuUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 CRITICAL CPU on {{ $labels.instance }}"
|
||||
description: "CPU usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}. Immediate attention required!"
|
||||
|
||||
- name: memory-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HostHighMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage on {{ $labels.instance }}"
|
||||
description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostCriticalMemoryUsage
|
||||
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 CRITICAL Memory on {{ $labels.instance }}"
|
||||
description: "Memory usage is {{ $value | printf \"%.1f\" }}% on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "💀 OUT OF MEMORY on {{ $labels.instance }}"
|
||||
description: "Only {{ $value | printf \"%.1f\" }}% memory available on {{ $labels.instance }}."
|
||||
|
||||
- name: disk-alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- alert: HostHighDiskUsage
|
||||
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk space warning on {{ $labels.instance }}"
|
||||
description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostCriticalDiskUsage
|
||||
expr: (1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"})) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "🔥 CRITICAL Disk space on {{ $labels.instance }}"
|
||||
description: "Disk {{ $labels.mountpoint }} is {{ $value | printf \"%.1f\" }}% full on {{ $labels.instance }}."
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}[6h], 24*60*60) < 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Disk {{ $labels.mountpoint }} will fill within 24 hours"
|
||||
description: "Based on current growth rate, disk on {{ $labels.instance }} will be full within 24 hours."
|
||||
|
||||
# Critical alert: a non-tmpfs/overlay filesystem has reported read-only for
# at least 1 minute. The alert name must be unique and descriptive, because
# it appears in notification titles.
- alert: HostFilesystemReadOnly
  expr: node_filesystem_readonly{fstype!~"tmpfs|overlay"} == 1
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "🔥 Filesystem is read-only on {{ $labels.instance }}"
    description: "Filesystem {{ $labels.mountpoint }} has become read-only. This usually indicates disk failure!"
|
||||
|
||||
- name: network-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: rate(node_network_receive_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network receive errors on {{ $labels.instance }}"
|
||||
description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} receive errors/sec."
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: rate(node_network_transmit_errs_total{device!~"lo|veth.*|docker.*|br-.*"}[5m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Network transmit errors on {{ $labels.instance }}"
|
||||
description: "{{ $labels.device }} has {{ $value | printf \"%.0f\" }} transmit errors/sec."
|
||||
|
||||
- name: system-alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- alert: HostClockSkew
|
||||
expr: abs(node_timex_offset_seconds) > 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Clock skew detected on {{ $labels.instance }}"
|
||||
description: "Clock is off by {{ $value | printf \"%.2f\" }} seconds."
|
||||
@@ -0,0 +1,58 @@
|
||||
# Alertmanager Configuration for Homelab
|
||||
# Routes alerts to both ntfy and Signal
|
||||
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'severity', 'instance']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
receiver: 'ntfy-all'
|
||||
|
||||
routes:
|
||||
# Critical alerts go to both Signal AND ntfy
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical-alerts'
|
||||
continue: false
|
||||
|
||||
# Warning alerts go to ntfy only
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'ntfy-all'
|
||||
|
||||
receivers:
|
||||
# ntfy receiver for all alerts
|
||||
- name: 'ntfy-all'
|
||||
webhook_configs:
|
||||
- url: 'http://NTFY:80/homelab-alerts'
|
||||
send_resolved: true
|
||||
http_config:
|
||||
follow_redirects: true
|
||||
max_alerts: 10
|
||||
|
||||
# Critical alerts: Signal + ntfy
|
||||
- name: 'critical-alerts'
|
||||
webhook_configs:
|
||||
# ntfy for critical
|
||||
- url: 'http://NTFY:80/homelab-alerts'
|
||||
send_resolved: true
|
||||
http_config:
|
||||
follow_redirects: true
|
||||
max_alerts: 5
|
||||
|
||||
# Signal via bridge service
|
||||
- url: 'http://signal-bridge:5000/alert'
|
||||
send_resolved: true
|
||||
http_config:
|
||||
follow_redirects: true
|
||||
max_alerts: 3
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'instance']
|
||||
@@ -0,0 +1,49 @@
|
||||
# Alertmanager Configuration for Homelab
|
||||
# Routes alerts to both ntfy (via bridge) and Signal
|
||||
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
|
||||
route:
|
||||
group_by: ['alertname', 'severity', 'instance']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
receiver: 'ntfy-all'
|
||||
|
||||
routes:
|
||||
# Critical alerts go to both Signal AND ntfy
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical-alerts'
|
||||
continue: false
|
||||
|
||||
# Warning alerts go to ntfy only
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'ntfy-all'
|
||||
|
||||
receivers:
|
||||
# ntfy receiver for all alerts (via bridge for nice formatting)
|
||||
- name: 'ntfy-all'
|
||||
webhook_configs:
|
||||
- url: 'http://ntfy-bridge:5001/alert'
|
||||
send_resolved: true
|
||||
|
||||
# Critical alerts: Signal + ntfy
|
||||
- name: 'critical-alerts'
|
||||
webhook_configs:
|
||||
# ntfy via bridge (formatted nicely)
|
||||
- url: 'http://ntfy-bridge:5001/alert'
|
||||
send_resolved: true
|
||||
|
||||
# Signal via bridge service
|
||||
- url: 'http://signal-bridge:5000/alert'
|
||||
send_resolved: true
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname', 'instance']
|
||||
@@ -0,0 +1,68 @@
|
||||
# Alerting Stack for Homelab
|
||||
|
||||
services:
|
||||
alertmanager:
|
||||
image: prom/alertmanager:latest
|
||||
container_name: alertmanager
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9093:9093"
|
||||
volumes:
|
||||
- ./alertmanager:/etc/alertmanager
|
||||
- alertmanager-data:/alertmanager
|
||||
command:
|
||||
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
||||
- '--storage.path=/alertmanager'
|
||||
- '--web.external-url=http://localhost:9093'
|
||||
networks:
|
||||
- monitoring-stack_default
|
||||
- signal-api-stack_default
|
||||
- ntfy-stack_default
|
||||
|
||||
signal-bridge:
|
||||
build: ./signal-bridge
|
||||
container_name: signal-bridge
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "5000:5000"
|
||||
environment:
|
||||
- SIGNAL_API_URL=http://signal-api:8080
|
||||
- SIGNAL_SENDER=+REDACTED_PHONE_NUMBER
|
||||
- SIGNAL_RECIPIENTS=+REDACTED_PHONE_NUMBER
|
||||
networks:
|
||||
- monitoring-stack_default
|
||||
- signal-api-stack_default
|
||||
healthcheck:
|
||||
test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5000/health')"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
|
||||
# Bridge that receives Alertmanager webhooks on port 5001 and republishes
# them to ntfy.
ntfy-bridge:
  build: ./ntfy-bridge
  container_name: ntfy-bridge
  restart: unless-stopped
  ports:
    - "5001:5001"
  environment:
    - NTFY_URL=http://NTFY:80
    # No quotes around the value: in list-form `environment` entries the
    # quotes would become part of the topic name.
    - NTFY_TOPIC=REDACTED_NTFY_TOPIC
  networks:
    - monitoring-stack_default
    - ntfy-stack_default
  healthcheck:
    test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:5001/health')"]
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
volumes:
|
||||
alertmanager-data:
|
||||
|
||||
networks:
|
||||
monitoring-stack_default:
|
||||
external: true
|
||||
signal-api-stack_default:
|
||||
external: true
|
||||
ntfy-stack_default:
|
||||
external: true
|
||||
@@ -0,0 +1,5 @@
|
||||
FROM python:3.11-slim
|
||||
WORKDIR /app
|
||||
RUN pip install --no-cache-dir flask requests gunicorn
|
||||
COPY app.py .
|
||||
CMD ["gunicorn", "--bind", "0.0.0.0:5001", "--workers", "2", "app:app"]
|
||||
@@ -0,0 +1,104 @@
|
||||
from flask import Flask, request, jsonify
|
||||
import requests
|
||||
import os
|
||||
|
||||
# Flask application object; the route decorators below register on it.
app = Flask(__name__)

# ntfy endpoint settings, overridable through environment variables.
NTFY_URL = os.getenv('NTFY_URL', 'http://NTFY:80')
NTFY_TOPIC = os.getenv('NTFY_TOPIC', 'homelab-alerts')
|
||||
|
||||
def get_status_icon(severity, status):
    """Choose the value sent in the ntfy ``Tags`` header for an alert.

    Args:
        severity: Alert severity label (e.g. ``'critical'``, ``'warning'``).
        status: Alert status (``'resolved'`` or anything else for firing).

    Returns:
        ``'white_check_mark'`` for resolved alerts, ``'rotating_light'`` for
        unresolved critical alerts, and ``'warning'`` for everything else.
    """
    if status != 'resolved':
        return 'rotating_light' if severity == 'critical' else 'warning'
    return 'white_check_mark'
|
||||
|
||||
def get_priority(severity, status):
    """Choose the value sent in the ntfy ``Priority`` header for an alert.

    Args:
        severity: Alert severity label (e.g. ``'critical'``, ``'warning'``).
        status: Alert status (``'resolved'`` or anything else for firing).

    Returns:
        ``'3'`` for resolved alerts, ``'5'`` for unresolved critical alerts,
        and ``'4'`` for everything else. Values are strings because they are
        used directly as an HTTP header.
    """
    if status == 'resolved':
        level = '3'
    elif severity == 'critical':
        level = '5'
    else:
        level = '4'
    return level
|
||||
|
||||
def format_alert(alert):
    """Turn one alert dict into the pieces needed for an ntfy message.

    Args:
        alert: Mapping with optional ``status``, ``labels`` and
            ``annotations`` keys; missing keys fall back to defaults.

    Returns:
        A ``(title, body, severity, status)`` tuple. ``title`` is
        ``"<alertname> [FIRING|RESOLVED]"``. ``body`` joins, one per line,
        the summary, the description (when it differs from the summary) and
        a ``Host: <instance>`` line (when an instance is known); if none of
        those exist a generic one-line message is used instead.
    """
    labels = alert.get('labels', {})
    annotations = alert.get('annotations', {})

    status = alert.get('status', 'firing')
    severity = labels.get('severity', 'warning')
    instance = labels.get('instance', 'unknown')

    state = 'RESOLVED' if status == 'resolved' else 'FIRING'
    title = f"{labels.get('alertname', 'Unknown Alert')} [{state}]"

    summary = annotations.get('summary', '')
    description = annotations.get('description', '')

    lines = [summary] if summary else []
    if description and description != summary:
        lines.append(description)
    if instance and instance != 'unknown':
        lines.append(f"Host: {instance}")

    if lines:
        body = '\n'.join(lines)
    else:
        body = f"Alert {state.lower()} on {instance}"

    return title, body, severity, status
|
||||
|
||||
@app.route('/alert', methods=['POST'])
def handle_alert():
    """Forward every alert in a webhook payload to the configured ntfy topic.

    Expects a JSON object with an ``alerts`` list. Each alert is formatted
    with ``format_alert`` and POSTed to ``{NTFY_URL}/{NTFY_TOPIC}`` with
    ``Title``, ``Priority`` and ``Tags`` headers.

    Returns:
        ``{'status': 'sent', 'count': <n>}`` once every alert has been
        attempted. A non-200/201 reply from ntfy is only logged. Any raised
        exception (bad payload, connection error, timeout) yields
        ``{'status': 'error', 'message': ...}`` with HTTP 500.
    """
    try:
        data = request.json
        alerts = data.get('alerts', [])

        for alert in alerts:
            title, body, severity, status = format_alert(alert)
            priority = get_priority(severity, status)
            tag = get_status_icon(severity, status)

            response = requests.post(
                f"{NTFY_URL}/{NTFY_TOPIC}",
                # Encode explicitly so non-Latin-1 text (e.g. emoji in alert
                # summaries) does not depend on the HTTP stack's default
                # str-to-bytes encoding.
                data=body.encode('utf-8'),
                headers={
                    'Title': title,
                    'Priority': priority,
                    'Tags': tag
                },
                # Without a timeout an unresponsive ntfy server would block
                # this worker indefinitely.
                timeout=10
            )

            if response.status_code not in [200, 201]:
                print(f"Failed to send to ntfy: {response.status_code} - {response.text}")

        return jsonify({'status': 'sent', 'count': len(alerts)})
    except Exception as e:
        print(f"Error: {e}")
        return jsonify({'status': 'error', 'message': str(e)}), 500
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health():
    """Health endpoint: unconditionally answers ``{'status': 'healthy'}``."""
    payload = {'status': 'healthy'}
    return jsonify(payload)
|
||||
|
||||
@app.route('/test', methods=['POST'])
def test():
    """Publish a test notification to the configured ntfy topic.

    Accepts an optional JSON body with a ``message`` field; a default text
    is used when it is absent.

    Returns:
        ``{'status': 'sent'}`` when ntfy accepted the message, otherwise
        ``{'status': 'error', 'message': ...}`` with HTTP 500.
    """
    try:
        data = request.json or {}
        message = data.get('message', 'Test notification from ntfy-bridge')

        response = requests.post(
            f"{NTFY_URL}/{NTFY_TOPIC}",
            # Encode explicitly so emoji and other non-Latin-1 characters do
            # not depend on the HTTP stack's default str encoding.
            data=str(message).encode('utf-8'),
            headers={
                'Title': 'Test Alert',
                'Priority': '4',
                'Tags': 'test_tube'
            },
            # Bound the call so an unresponsive ntfy server cannot hang the worker.
            timeout=10
        )
        # Surface HTTP errors instead of reporting 'sent' for a rejected message.
        response.raise_for_status()
        return jsonify({'status': 'sent'})
    except Exception as e:
        return jsonify({'status': 'error', 'message': str(e)}), 500
|
||||
|
||||
# Direct execution: start Flask's built-in server on all interfaces, port 5001.
if __name__ == "__main__":
    app.run(port=5001, host='0.0.0.0')
|
||||
@@ -0,0 +1,117 @@
|
||||
# Prometheus configuration with Alertmanager integration.
# Adds the alerting and rule-loading sections to the existing prometheus.yml scrape jobs.

global:
  scrape_interval: 15s
  evaluation_interval: 15s  # How often to evaluate rules

# Alertmanager instance that receives firing alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Alerting rule definitions to load
rule_files:
  - /etc/prometheus/alert-rules.yml

scrape_configs:
  # --- Monitoring stack itself ---
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "prometheus:9090"

  - job_name: "alertmanager"
    static_configs:
      - targets:
          - "alertmanager:9093"

  # --- Hosts scraped on port 9100 ---
  - job_name: "homelab-node"
    static_configs:
      - targets:
          - "100.67.40.126:9100"

  - job_name: "raspberry-pis"
    static_configs:
      - targets:
          - "100.77.151.40:9100"  # pi-5
      - targets:
          - "100.123.246.75:9100"  # pi-5-kevin

  - job_name: "setillo-node"
    static_configs:
      - targets:
          - "100.125.0.20:9100"

  # SNMP jobs: scraped via /snmp on port 9116 with module "synology".
  # The relabel rules (1) force the probed target parameter to 127.0.0.1,
  # (2) set the instance label to the host's address, and (3) point the
  # scrape address at that host's port 9116.
  - job_name: "setillo-snmp"
    metrics_path: /snmp
    params:
      module:
        - synology
      auth:
        - snmpv3
      target:
        - "127.0.0.1"
    static_configs:
      - targets:
          - "100.125.0.20:9116"
    relabel_configs:
      - source_labels:
          - __address__
        target_label: __param_target
        replacement: "127.0.0.1"
      - source_labels:
          - __param_target
        target_label: instance
        replacement: "100.125.0.20"
      - target_label: __address__
        replacement: "100.125.0.20:9116"

  - job_name: "calypso-node"
    static_configs:
      - targets:
          - "100.103.48.78:9100"

  - job_name: "calypso-snmp"
    metrics_path: /snmp
    params:
      module:
        - synology
      auth:
        - snmpv3
      target:
        - "127.0.0.1"
    static_configs:
      - targets:
          - "100.103.48.78:9116"
    relabel_configs:
      - source_labels:
          - __address__
        target_label: __param_target
        replacement: "127.0.0.1"
      - source_labels:
          - __param_target
        target_label: instance
        replacement: "100.103.48.78"
      - target_label: __address__
        replacement: "100.103.48.78:9116"

  - job_name: "atlantis-node"
    static_configs:
      - targets:
          - "100.83.230.112:9100"

  - job_name: "atlantis-snmp"
    metrics_path: /snmp
    params:
      module:
        - synology
      auth:
        - snmpv3
      target:
        - "127.0.0.1"
    static_configs:
      - targets:
          - "100.83.230.112:9116"
    relabel_configs:
      - source_labels:
          - __address__
        target_label: __param_target
        replacement: "127.0.0.1"
      - source_labels:
          - __param_target
        target_label: instance
        replacement: "100.83.230.112"
      - target_label: __address__
        replacement: "100.83.230.112:9116"

  - job_name: "concord-nuc-node"
    static_configs:
      - targets:
          - "100.72.55.21:9100"

  - job_name: "truenas-node"
    static_configs:
      - targets:
          - "100.75.252.64:9100"

  - job_name: "vmi2076105-node"
    static_configs:
      - targets:
          - "100.99.156.20:9100"

  - job_name: "proxmox-node"
    static_configs:
      - targets:
          - "100.87.12.28:9100"
|
||||
@@ -0,0 +1,11 @@
|
||||
# Container image for a Flask application (app.py) served by gunicorn.
FROM python:3.11-slim

# Paths in the instructions below are relative to /app.
WORKDIR /app

# Runtime dependencies; --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir flask requests gunicorn

# The whole application is the single module app.py.
COPY app.py .

# Matches the port gunicorn binds to in CMD.
EXPOSE 5000

# Serve the `app` object from app.py with 2 workers and a 60-second worker timeout.
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "60", "app:app"]
|
||||
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Signal Bridge for Alertmanager
|
||||
Receives webhooks from Alertmanager and forwards to Signal API
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
# Configuration from environment variables
|
||||
SIGNAL_API_URL = os.environ.get('SIGNAL_API_URL', 'http://signal-api:8080')
|
||||
SIGNAL_SENDER = os.environ.get('SIGNAL_SENDER', '') # Your Signal number
|
||||
SIGNAL_RECIPIENTS = os.environ.get('SIGNAL_RECIPIENTS', '').split(',') # Comma-separated
|
||||
|
||||
def format_alert_message(alert_data):
    """Format an Alertmanager webhook payload into a readable message.

    Each entry of ``alert_data['alerts']`` becomes one paragraph of the form
    ``<emoji> [<STATUS>] <summary>`` followed, when present, by the alert's
    description on the next line. Paragraphs are separated by a blank line.

    Args:
        alert_data: Parsed webhook payload (dict). The top-level ``status``
            is used for alerts that carry no ``status`` of their own.

    Returns:
        The combined message text; an empty string when there are no alerts.
    """
    messages = []

    status = alert_data.get('status', 'unknown')

    # `or` fallbacks tolerate explicit nulls in the payload.
    for alert in alert_data.get('alerts') or []:
        alert_status = alert.get('status', status)
        labels = alert.get('labels') or {}
        annotations = alert.get('annotations') or {}

        severity = labels.get('severity', 'unknown')
        alertname = labels.get('alertname', 'Unknown Alert')

        # The alert name stands in for a missing summary.
        summary = annotations.get('summary', alertname)
        description = annotations.get('description', '')

        # Status emoji: "resolved" wins over severity; anything that is
        # neither resolved nor critical is reported as a warning.
        if alert_status == 'resolved':
            status_emoji = '✅'
            status_text = 'RESOLVED'
        elif severity == 'critical':
            status_emoji = '🚨'
            status_text = 'CRITICAL'
        else:
            status_emoji = '⚠️'
            status_text = 'WARNING'

        msg = f"{status_emoji} [{status_text}] {summary}"
        if description:
            msg += f"\n{description}"

        messages.append(msg)

    return "\n\n".join(messages)
|
||||
|
||||
def send_signal_message(message):
    """Send ``message`` to every configured recipient via the Signal API.

    One POST to ``{SIGNAL_API_URL}/v2/send`` is made per recipient so that a
    failure for one recipient does not prevent delivery to the others.

    Args:
        message: Text to send.

    Returns:
        True when every recipient was sent to successfully; False when the
        sender or recipients are not configured, or when any send failed.
    """
    # SIGNAL_RECIPIENTS comes from str.split(','), which yields [''] for an
    # empty setting. That list is truthy, so blank entries must be filtered
    # out before testing whether any recipient is configured.
    recipients = [entry.strip() for entry in SIGNAL_RECIPIENTS if entry.strip()]

    if not SIGNAL_SENDER or not recipients:
        app.logger.error("Signal sender or recipients not configured")
        return False

    success = True
    for recipient in recipients:
        try:
            payload = {
                "message": message,
                "number": SIGNAL_SENDER,
                "recipients": [recipient]
            }

            response = requests.post(
                f"{SIGNAL_API_URL}/v2/send",
                json=payload,
                timeout=30
            )

            if response.status_code in [200, 201]:
                app.logger.info(f"Message sent to {recipient}")
            else:
                app.logger.error(f"Failed to send to {recipient}: {response.status_code} - {response.text}")
                success = False

        except Exception as e:
            # Keep going: remaining recipients should still be attempted.
            app.logger.error(f"Error sending to {recipient}: {e}")
            success = False

    return success
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health():
    """Health-check endpoint: always answers HTTP 200 with a healthy status."""
    body = jsonify({"status": "healthy"})
    return body, 200
|
||||
|
||||
@app.route('/alert', methods=['POST'])
def receive_alert():
    """Receive alert from Alertmanager and forward to Signal.

    Returns:
        ``{"status": "sent"}`` (200) when the message went to all recipients,
        ``{"status": "partial_failure"}`` (207) when sending failed,
        ``{"error": ...}`` (400) when the request carries no usable JSON,
        ``{"error": ...}`` (500) when an unexpected exception is raised.
    """
    try:
        # silent=True makes a malformed or non-JSON body return None, so it
        # is answered with the 400 below instead of the parse error being
        # caught by the generic handler and reported as a 500.
        alert_data = request.get_json(silent=True)

        if not alert_data:
            return jsonify({"error": "No data received"}), 400

        app.logger.info(f"Received alert: {json.dumps(alert_data, indent=2)}")

        message = format_alert_message(alert_data)

        if send_signal_message(message):
            return jsonify({"status": "sent"}), 200
        else:
            return jsonify({"status": "partial_failure"}), 207

    except Exception as e:
        app.logger.error(f"Error processing alert: {e}")
        return jsonify({"error": str(e)}), 500
|
||||
|
||||
@app.route('/test', methods=['POST'])
def test_message():
    """Send a test message.

    The optional JSON body may carry a ``message`` field; a default text is
    used when it is absent or when the request has no JSON body.

    Returns:
        ``{"status": "sent"}`` (200) on success, ``{"status": "failed"}``
        (500) when sending failed.
    """
    # silent=True plus the `or {}` fallback: a POST without a JSON body uses
    # the default message instead of failing on a None payload.
    data = request.get_json(silent=True) or {}
    message = data.get('message', '🧪 Test alert from Signal Bridge')

    if send_signal_message(message):
        return jsonify({"status": "sent"}), 200
    else:
        return jsonify({"status": "failed"}), 500
|
||||
|
||||
# Direct execution: start Flask's built-in server on all interfaces, port 5000.
if __name__ == "__main__":
    app.run(port=5000, host='0.0.0.0')
|
||||
@@ -0,0 +1,366 @@
|
||||
{
|
||||
"uid": "infrastructure-overview-v2",
|
||||
"title": "Infrastructure Overview - All Devices",
|
||||
"tags": [
|
||||
"infrastructure",
|
||||
"node-exporter",
|
||||
"tailscale"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": "",
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info, job)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Host",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(node_uname_info, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Device Status",
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"textMode": "value_and_name",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by(job) (rate(node_cpu_seconds_total{mode=\"idle\", job=~\"$job\"}[5m])) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes{job=~\"$job\"} / node_memory_MemTotal_bytes{job=~\"$job\"})) * 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=~\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=~\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time_seconds{job=~\"$job\"} - node_boot_time_seconds{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Network Receive",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(job) (rate(node_network_receive_bytes_total{job=~\"$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "Network Transmit",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(job) (rate(node_network_transmit_bytes_total{job=~\"$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,936 @@
|
||||
{
|
||||
"uid": "node-details-v2",
|
||||
"title": "Node Details - Full Metrics",
|
||||
"tags": [
|
||||
"node-exporter",
|
||||
"detailed",
|
||||
"infrastructure"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "prometheus",
|
||||
"value": "prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info, job)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Host",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"options": [],
|
||||
"query": "label_values(node_uname_info, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info{job=\"$job\"}, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": "label_values(node_uname_info{job=\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udcca Quick Stats",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time_seconds{job=\"$job\",instance=\"$instance\"} - node_boot_time_seconds{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Uptime",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "CPU Cores",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 4,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"})",
|
||||
"legendFormat": "Cores",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Total RAM",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 7,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "purple",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "RAM",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "gauge",
|
||||
"title": "CPU",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 10,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "CPU",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "gauge",
|
||||
"title": "Memory",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 13,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"} / node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"})) * 100",
|
||||
"legendFormat": "Memory",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "gauge",
|
||||
"title": "Disk /",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 16,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)",
|
||||
"legendFormat": "Disk",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Load 1m",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 2,
|
||||
"x": 19,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "1m",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "stat",
|
||||
"title": "Load 5m",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 2,
|
||||
"x": 21,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load5{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "5m",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udda5\ufe0f CPU Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage Breakdown",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"fillOpacity": 50,
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"user\"}[5m])) * 100",
|
||||
"legendFormat": "User",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"system\"}[5m])) * 100",
|
||||
"legendFormat": "System",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"iowait\"}[5m])) * 100",
|
||||
"legendFormat": "IOWait",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"steal\"}[5m])) * 100",
|
||||
"legendFormat": "Steal",
|
||||
"refId": "D"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Per Core",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[5m]) * 100)",
|
||||
"legendFormat": "CPU {{cpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "row",
|
||||
"title": "\ud83e\udde0 Memory Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 15
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"fillOpacity": 30,
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"} - node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Used",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Buffers_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Buffers",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Cached_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Cached",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_MemFree_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Free",
|
||||
"refId": "D"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "timeseries",
|
||||
"title": "Swap Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 15
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_SwapTotal_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Total",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_SwapTotal_bytes{job=\"$job\",instance=\"$instance\"} - node_memory_SwapFree_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Used",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udcbe Disk Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 23
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "bargauge",
|
||||
"title": "Disk Space Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|overlay|squashfs\"}) * 100)",
|
||||
"legendFormat": "{{mountpoint}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "timeseries",
|
||||
"title": "Disk I/O",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*Write.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.transform",
|
||||
"value": "negative-Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|dm-.*\"}[5m])",
|
||||
"legendFormat": "{{device}} Read",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|dm-.*\"}[5m])",
|
||||
"legendFormat": "{{device}} Write",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"type": "row",
|
||||
"title": "\ud83c\udf10 Network Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 33
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*TX.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.transform",
|
||||
"value": "negative-Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m]) * 8",
|
||||
"legendFormat": "{{device}} RX",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m]) * 8",
|
||||
"legendFormat": "{{device}} TX",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Network Errors",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 33
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_errs_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m])",
|
||||
"legendFormat": "{{device}} RX Errors",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_errs_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m])",
|
||||
"legendFormat": "{{device}} TX Errors",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"id": null
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,351 @@
|
||||
{
|
||||
"uid": "synology-dashboard-v2",
|
||||
"title": "Synology NAS Monitoring",
|
||||
"tags": [
|
||||
"synology",
|
||||
"nas",
|
||||
"snmp"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": "",
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(ssCpuRawIdle, job)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "NAS",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(ssCpuRawIdle, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "NAS Status",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"textMode": "value_and_name",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((ssCpuRawIdle{job=~\"$job\"} / (ssCpuRawUser{job=~\"$job\"} + ssCpuRawSystem{job=~\"$job\"} + ssCpuRawIdle{job=~\"$job\"} + ssCpuRawWait{job=~\"$job\"})) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((memTotalReal{job=~\"$job\"} - memAvailReal{job=~\"$job\"}) / memTotalReal{job=~\"$job\"}) * 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Total Memory",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "decbytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "memTotalReal{job=~\"$job\"} * 1024",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Load Average",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"1\"}",
|
||||
"legendFormat": "{{job}} 1m",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"2\"}",
|
||||
"legendFormat": "{{job}} 5m",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"3\"}",
|
||||
"legendFormat": "{{job}} 15m",
|
||||
"refId": "C"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sysUpTime{job=~\"$job\"} / 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
# Prometheus & Grafana Monitoring Stack
|
||||
# Deployed on Homelab VM at ~/docker/monitoring
|
||||
#
|
||||
# Usage:
|
||||
# cd ~/docker/monitoring
|
||||
# docker-compose up -d
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.enable-lifecycle'
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana-oss:latest
|
||||
container_name: grafana
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "3300:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
# List-form environment entries are passed verbatim: surrounding quotes would become part of the password.
- GF_SECURITY_ADMIN_PASSWORD=REDACTED_PASSWORD
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
depends_on:
|
||||
- prometheus
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
node_exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node_exporter
|
||||
restart: unless-stopped
|
||||
network_mode: host
|
||||
pid: host
|
||||
user: nobody
|
||||
command:
|
||||
- '--path.rootfs=/host'
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/host:ro,rslave
|
||||
|
||||
volumes:
|
||||
prometheus-data:
|
||||
grafana-data:
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
@@ -0,0 +1,26 @@
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
node-exporter:
|
||||
image: quay.io/prometheus/node-exporter:latest
|
||||
container_name: node_exporter
|
||||
network_mode: host
|
||||
pid: host
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/rootfs'
|
||||
# mount-points-exclude is the current name of the deprecated ignored-mount-points flag (node_exporter >= 1.3); "$$" is Compose's escape for a literal "$".
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
restart: unless-stopped
|
||||
|
||||
snmp-exporter:
|
||||
image: quay.io/prometheus/snmp-exporter:latest
|
||||
container_name: snmp_exporter
|
||||
network_mode: host # important, so exporter can talk to DSM SNMP on localhost
|
||||
volumes:
|
||||
- /volume1/docker/snmp/snmp.yml:/etc/snmp_exporter/snmp.yml:ro
|
||||
restart: unless-stopped
|
||||
@@ -0,0 +1,26 @@
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
node-exporter:
|
||||
image: quay.io/prometheus/node-exporter:latest
|
||||
container_name: node_exporter
|
||||
network_mode: host
|
||||
pid: host
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/rootfs'
|
||||
# mount-points-exclude is the current name of the deprecated ignored-mount-points flag (node_exporter >= 1.3); "$$" is Compose's escape for a literal "$".
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
restart: unless-stopped
|
||||
|
||||
snmp-exporter:
|
||||
image: quay.io/prometheus/snmp-exporter:latest
|
||||
container_name: snmp_exporter
|
||||
network_mode: host
|
||||
volumes:
|
||||
- /volume1/docker/snmp/snmp.yml:/etc/snmp_exporter/snmp.yml:ro
|
||||
restart: unless-stopped
|
||||
@@ -0,0 +1,18 @@
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
node-exporter:
|
||||
image: quay.io/prometheus/node-exporter:latest
|
||||
container_name: node_exporter
|
||||
network_mode: host
|
||||
pid: host
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/rootfs'
|
||||
# mount-points-exclude is the current name of the deprecated ignored-mount-points flag (node_exporter >= 1.3); "$$" is Compose's escape for a literal "$".
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
restart: unless-stopped
|
||||
@@ -0,0 +1,18 @@
|
||||
version: "3.9"
|
||||
|
||||
services:
|
||||
node-exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node-exporter
|
||||
restart: unless-stopped
|
||||
network_mode: "host"
|
||||
pid: "host"
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/rootfs'
|
||||
# mount-points-exclude is the current name of the deprecated ignored-mount-points flag (node_exporter >= 1.3); "$$" is Compose's escape for a literal "$".
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
@@ -0,0 +1,26 @@
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
node-exporter:
|
||||
image: quay.io/prometheus/node-exporter:latest
|
||||
container_name: node_exporter
|
||||
network_mode: host
|
||||
pid: host
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.sysfs=/host/sys'
|
||||
- '--path.rootfs=/rootfs'
|
||||
# mount-points-exclude is the current name of the deprecated ignored-mount-points flag (node_exporter >= 1.3); "$$" is Compose's escape for a literal "$".
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||
restart: unless-stopped
|
||||
|
||||
snmp-exporter:
|
||||
image: quay.io/prometheus/snmp-exporter:latest
|
||||
container_name: snmp_exporter
|
||||
network_mode: host
|
||||
volumes:
|
||||
- /volume1/docker/snmp/snmp.yml:/etc/snmp_exporter/snmp.yml:ro
|
||||
restart: unless-stopped
|
||||
@@ -0,0 +1,98 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["prometheus:9090"]
|
||||
|
||||
- job_name: "homelab-node"
|
||||
static_configs:
|
||||
- targets: ["100.67.40.126:9100"]
|
||||
|
||||
- job_name: "raspberry-pis"
|
||||
static_configs:
|
||||
- targets: ["100.77.151.40:9100"] # pi-5
|
||||
- targets: ["100.123.246.75:9100"] # pi-5-kevin
|
||||
|
||||
- job_name: "setillo-node"
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9100"]
|
||||
|
||||
- job_name: "setillo-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.125.0.20"
|
||||
- target_label: __address__
|
||||
replacement: "100.125.0.20:9116"
|
||||
|
||||
- job_name: "calypso-node"
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9100"]
|
||||
|
||||
- job_name: "calypso-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.103.48.78"
|
||||
- target_label: __address__
|
||||
replacement: "100.103.48.78:9116"
|
||||
|
||||
- job_name: "atlantis-node"
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9100"]
|
||||
|
||||
- job_name: "atlantis-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.83.230.112"
|
||||
- target_label: __address__
|
||||
replacement: "100.83.230.112:9116"
|
||||
|
||||
- job_name: "concord-nuc-node"
|
||||
static_configs:
|
||||
- targets: ["100.72.55.21:9100"]
|
||||
|
||||
- job_name: "truenas-node"
|
||||
static_configs:
|
||||
- targets: ["100.75.252.64:9100"]
|
||||
|
||||
- job_name: "vmi2076105-node"
|
||||
static_configs:
|
||||
- targets: ["100.99.156.20:9100"]
|
||||
|
||||
- job_name: "proxmox-node"
|
||||
static_configs:
|
||||
- targets: ["100.87.12.28:9100"]
|
||||
@@ -0,0 +1,582 @@
|
||||
# Synology SNMP Exporter Configuration
|
||||
# Comprehensive config for monitoring Synology NAS devices
|
||||
# Includes: CPU, Memory, Load, Storage, Network, Disks, RAID, Temperature
|
||||
|
||||
auths:
|
||||
snmpv3:
|
||||
version: 3
|
||||
security_level: authPriv
|
||||
auth_protocol: MD5
|
||||
username: snmp-exporter
|
||||
password: "REDACTED_PASSWORD"
|
||||
priv_protocol: DES
|
||||
priv_password: "REDACTED_PASSWORD"
|
||||
|
||||
modules:
|
||||
synology:
|
||||
walk:
|
||||
# Standard MIBs
|
||||
- 1.3.6.1.2.1.1 # System info (sysDescr, sysUpTime, etc.)
|
||||
- 1.3.6.1.2.1.2 # Interfaces
|
||||
- 1.3.6.1.2.1.25.2 # hrStorage (disk/memory usage)
|
||||
- 1.3.6.1.2.1.25.3.3 # hrProcessorLoad
|
||||
- 1.3.6.1.2.1.31.1.1 # ifXTable (64-bit counters)
|
||||
|
||||
# UCD-SNMP-MIB (CPU, Memory, Load)
|
||||
- 1.3.6.1.4.1.2021.4 # Memory stats
|
||||
- 1.3.6.1.4.1.2021.10 # Load average
|
||||
- 1.3.6.1.4.1.2021.11 # CPU stats
|
||||
|
||||
# Synology-specific MIBs
|
||||
- 1.3.6.1.4.1.6574.1 # System status, temp, power, fans, model
|
||||
- 1.3.6.1.4.1.6574.2 # Disk information
|
||||
- 1.3.6.1.4.1.6574.3 # RAID status
|
||||
- 1.3.6.1.4.1.6574.4 # UPS status
|
||||
- 1.3.6.1.4.1.6574.5 # Disk SMART info
|
||||
- 1.3.6.1.4.1.6574.6 # Service users
|
||||
- 1.3.6.1.4.1.6574.101 # Storage IO
|
||||
- 1.3.6.1.4.1.6574.102 # Space IO
|
||||
- 1.3.6.1.4.1.6574.104 # GPU info (if available)
|
||||
|
||||
metrics:
|
||||
# ============================================
|
||||
# SYSTEM INFO
|
||||
# ============================================
|
||||
- name: sysDescr
|
||||
oid: 1.3.6.1.2.1.1.1
|
||||
type: DisplayString
|
||||
help: System description
|
||||
|
||||
- name: sysUpTime
|
||||
oid: 1.3.6.1.2.1.1.3
|
||||
type: gauge
|
||||
help: System uptime in hundredths of a second
|
||||
|
||||
- name: sysName
|
||||
oid: 1.3.6.1.2.1.1.5
|
||||
type: DisplayString
|
||||
help: System name
|
||||
|
||||
# ============================================
|
||||
# CPU METRICS (UCD-SNMP-MIB)
|
||||
# ============================================
|
||||
- name: ssCpuRawUser
|
||||
oid: 1.3.6.1.4.1.2021.11.50
|
||||
type: counter
|
||||
help: Raw CPU user time
|
||||
|
||||
- name: ssCpuRawNice
|
||||
oid: 1.3.6.1.4.1.2021.11.51
|
||||
type: counter
|
||||
help: Raw CPU nice time
|
||||
|
||||
- name: ssCpuRawSystem
|
||||
oid: 1.3.6.1.4.1.2021.11.52
|
||||
type: counter
|
||||
help: Raw CPU system time
|
||||
|
||||
- name: ssCpuRawIdle
|
||||
oid: 1.3.6.1.4.1.2021.11.53
|
||||
type: counter
|
||||
help: Raw CPU idle time
|
||||
|
||||
- name: ssCpuRawWait
|
||||
oid: 1.3.6.1.4.1.2021.11.54
|
||||
type: counter
|
||||
help: Raw CPU wait time
|
||||
|
||||
- name: ssCpuRawKernel
|
||||
oid: 1.3.6.1.4.1.2021.11.55
|
||||
type: counter
|
||||
help: Raw CPU kernel time
|
||||
|
||||
- name: ssCpuRawInterrupt
|
||||
oid: 1.3.6.1.4.1.2021.11.56
|
||||
type: counter
|
||||
help: Raw CPU interrupt time
|
||||
|
||||
# ============================================
|
||||
# MEMORY METRICS (UCD-SNMP-MIB)
|
||||
# ============================================
|
||||
- name: memTotalSwap
|
||||
oid: 1.3.6.1.4.1.2021.4.3
|
||||
type: gauge
|
||||
help: Total swap size in KB
|
||||
|
||||
- name: memAvailSwap
|
||||
oid: 1.3.6.1.4.1.2021.4.4
|
||||
type: gauge
|
||||
help: Available swap in KB
|
||||
|
||||
- name: memTotalReal
|
||||
oid: 1.3.6.1.4.1.2021.4.5
|
||||
type: gauge
|
||||
help: Total RAM in KB
|
||||
|
||||
- name: memAvailReal
|
||||
oid: 1.3.6.1.4.1.2021.4.6
|
||||
type: gauge
|
||||
help: Available RAM in KB
|
||||
|
||||
- name: memTotalFree
|
||||
oid: 1.3.6.1.4.1.2021.4.11
|
||||
type: gauge
|
||||
help: Total free memory in KB
|
||||
|
||||
- name: memShared
|
||||
oid: 1.3.6.1.4.1.2021.4.13
|
||||
type: gauge
|
||||
help: Shared memory in KB
|
||||
|
||||
- name: memBuffer
|
||||
oid: 1.3.6.1.4.1.2021.4.14
|
||||
type: gauge
|
||||
help: Buffer memory in KB
|
||||
|
||||
- name: memCached
|
||||
oid: 1.3.6.1.4.1.2021.4.15
|
||||
type: gauge
|
||||
help: Cached memory in KB
|
||||
|
||||
# ============================================
|
||||
# LOAD AVERAGE (UCD-SNMP-MIB)
|
||||
# ============================================
|
||||
- name: laLoad
|
||||
oid: 1.3.6.1.4.1.2021.10.1.3
|
||||
# NOTE(review): snmp_exporter appears to export DisplayString metrics as a constant 1 with the
# text carried in a label - confirm that dashboards plotting laLoad{laIndex=...} are not just
# drawing 1. laLoadInt (1.3.6.1.4.1.2021.10.1.5, value / 100) may be the numeric source wanted.
type: DisplayString
|
||||
help: Load average (1, 5, 15 min)
|
||||
indexes:
|
||||
- labelname: laIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [laIndex]
|
||||
labelname: laNames
|
||||
oid: 1.3.6.1.4.1.2021.10.1.2
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# HOST RESOURCES - STORAGE
|
||||
# ============================================
|
||||
- name: hrStorageDescr
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.3
|
||||
type: DisplayString
|
||||
help: Storage description
|
||||
indexes:
|
||||
- labelname: hrStorageIndex
|
||||
type: gauge
|
||||
|
||||
- name: hrStorageAllocationUnits
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.4
|
||||
type: gauge
|
||||
help: Storage allocation unit size in bytes
|
||||
indexes:
|
||||
- labelname: hrStorageIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [hrStorageIndex]
|
||||
labelname: hrStorageDescr
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.3
|
||||
type: DisplayString
|
||||
|
||||
- name: hrStorageSize
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.5
|
||||
type: gauge
|
||||
help: Storage size in allocation units
|
||||
indexes:
|
||||
- labelname: hrStorageIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [hrStorageIndex]
|
||||
labelname: hrStorageDescr
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.3
|
||||
type: DisplayString
|
||||
|
||||
- name: hrStorageUsed
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.6
|
||||
type: gauge
|
||||
help: Storage used in allocation units
|
||||
indexes:
|
||||
- labelname: hrStorageIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [hrStorageIndex]
|
||||
labelname: hrStorageDescr
|
||||
oid: 1.3.6.1.2.1.25.2.3.1.3
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# NETWORK INTERFACES
|
||||
# ============================================
|
||||
- name: ifNumber
|
||||
oid: 1.3.6.1.2.1.2.1
|
||||
type: gauge
|
||||
help: Number of network interfaces
|
||||
|
||||
- name: ifDescr
|
||||
oid: 1.3.6.1.2.1.2.2.1.2
|
||||
type: DisplayString
|
||||
help: Interface description
|
||||
indexes:
|
||||
- labelname: ifIndex
|
||||
type: gauge
|
||||
|
||||
- name: ifOperStatus
|
||||
oid: 1.3.6.1.2.1.2.2.1.8
|
||||
type: gauge
|
||||
help: Interface operational status (1=up, 2=down)
|
||||
indexes:
|
||||
- labelname: ifIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [ifIndex]
|
||||
labelname: ifDescr
|
||||
oid: 1.3.6.1.2.1.2.2.1.2
|
||||
type: DisplayString
|
||||
enum_values:
|
||||
1: up
|
||||
2: down
|
||||
3: testing
|
||||
|
||||
- name: ifHCInOctets
|
||||
oid: 1.3.6.1.2.1.31.1.1.1.6
|
||||
type: counter
|
||||
help: Total bytes received (64-bit)
|
||||
indexes:
|
||||
- labelname: ifIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [ifIndex]
|
||||
labelname: ifDescr
|
||||
oid: 1.3.6.1.2.1.2.2.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: ifHCOutOctets
|
||||
oid: 1.3.6.1.2.1.31.1.1.1.10
|
||||
type: counter
|
||||
help: Total bytes transmitted (64-bit)
|
||||
indexes:
|
||||
- labelname: ifIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [ifIndex]
|
||||
labelname: ifDescr
|
||||
oid: 1.3.6.1.2.1.2.2.1.2
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY SYSTEM STATUS
|
||||
# ============================================
|
||||
- name: systemStatus
|
||||
oid: 1.3.6.1.4.1.6574.1.1
|
||||
type: gauge
|
||||
help: System status (1=Normal, 2=Failed)
|
||||
|
||||
- name: temperature
|
||||
oid: 1.3.6.1.4.1.6574.1.2
|
||||
type: gauge
|
||||
help: System temperature in Celsius
|
||||
|
||||
- name: powerStatus
|
||||
oid: 1.3.6.1.4.1.6574.1.3
|
||||
type: gauge
|
||||
help: Power status (1=Normal, 2=Failed)
|
||||
|
||||
- name: systemFanStatus
|
||||
oid: 1.3.6.1.4.1.6574.1.4.1
|
||||
type: gauge
|
||||
help: System fan status (1=Normal, 2=Failed)
|
||||
|
||||
- name: cpuFanStatus
|
||||
oid: 1.3.6.1.4.1.6574.1.4.2
|
||||
type: gauge
|
||||
help: CPU fan status (1=Normal, 2=Failed)
|
||||
|
||||
- name: modelName
|
||||
oid: 1.3.6.1.4.1.6574.1.5.1
|
||||
type: DisplayString
|
||||
help: NAS model name
|
||||
|
||||
- name: serialNumber
|
||||
oid: 1.3.6.1.4.1.6574.1.5.2
|
||||
type: DisplayString
|
||||
help: NAS serial number
|
||||
|
||||
- name: version
|
||||
oid: 1.3.6.1.4.1.6574.1.5.3
|
||||
type: DisplayString
|
||||
help: DSM version
|
||||
|
||||
# Metric name restored after over-eager secret redaction; OID ...6574.1.5.4 is the DSM "upgrade available" object.
- name: upgradeAvailable
|
||||
oid: 1.3.6.1.4.1.6574.1.5.4
|
||||
type: gauge
|
||||
help: DSM upgrade available (1=available, 2=unavailable)
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY DISK INFO
|
||||
# ============================================
|
||||
- name: diskID
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.2
|
||||
type: DisplayString
|
||||
help: Disk ID
|
||||
indexes:
|
||||
- labelname: diskIndex
|
||||
type: gauge
|
||||
|
||||
- name: diskModel
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.3
|
||||
type: DisplayString
|
||||
help: Disk model
|
||||
indexes:
|
||||
- labelname: diskIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [diskIndex]
|
||||
labelname: diskID
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: diskType
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.4
|
||||
type: DisplayString
|
||||
help: Disk type (SATA, SSD, etc.)
|
||||
indexes:
|
||||
- labelname: diskIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [diskIndex]
|
||||
labelname: diskID
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: diskStatus
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.5
|
||||
type: gauge
|
||||
help: Disk status (1=Normal, 2=Initialized, 3=NotInitialized, 4=SystemPartitionFailed, 5=Crashed)
|
||||
indexes:
|
||||
- labelname: diskIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [diskIndex]
|
||||
labelname: diskID
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: diskTemperature
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.6
|
||||
type: gauge
|
||||
help: Disk temperature in Celsius
|
||||
indexes:
|
||||
- labelname: diskIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [diskIndex]
|
||||
labelname: diskID
|
||||
oid: 1.3.6.1.4.1.6574.2.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY RAID INFO
|
||||
# ============================================
|
||||
- name: raidName
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.2
|
||||
type: DisplayString
|
||||
help: RAID/Volume name
|
||||
indexes:
|
||||
- labelname: raidIndex
|
||||
type: gauge
|
||||
|
||||
- name: raidStatus
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.3
|
||||
type: gauge
|
||||
help: RAID status (1=Normal, 2=Repairing, 3=Migrating, 4=Expanding, 5=Deleting, 6=Creating, 7=RaidSyncing, 8=RaidParityChecking, 9=RaidAssembling, 10=Canceling, 11=Degrade, 12=Crashed, 13=DataScrubbing, 14=RaidDeploying, 15=RaidUnDeploying, 16=RaidMountCache, 17=RaidUnmountCache, 18=RaidExpandingUnfinishedSHR, 19=RaidConvertSHRToPool, 20=RaidMigrateSHR1ToSHR2, 21=RaidUnknownStatus)
|
||||
indexes:
|
||||
- labelname: raidIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [raidIndex]
|
||||
labelname: raidName
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: raidFreeSize
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.4
|
||||
type: gauge
|
||||
help: RAID free size in bytes
|
||||
indexes:
|
||||
- labelname: raidIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [raidIndex]
|
||||
labelname: raidName
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: raidTotalSize
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.5
|
||||
type: gauge
|
||||
help: RAID total size in bytes
|
||||
indexes:
|
||||
- labelname: raidIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [raidIndex]
|
||||
labelname: raidName
|
||||
oid: 1.3.6.1.4.1.6574.3.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY UPS INFO (if connected)
|
||||
# ============================================
|
||||
- name: upsModel
|
||||
oid: 1.3.6.1.4.1.6574.4.1.1
|
||||
type: DisplayString
|
||||
help: UPS model name
|
||||
|
||||
- name: upsSN
|
||||
oid: 1.3.6.1.4.1.6574.4.1.2
|
||||
type: DisplayString
|
||||
help: UPS serial number
|
||||
|
||||
- name: upsStatus
|
||||
oid: 1.3.6.1.4.1.6574.4.1.3
|
||||
type: DisplayString
|
||||
help: UPS status
|
||||
|
||||
- name: upsLoad
|
||||
oid: 1.3.6.1.4.1.6574.4.2.1
|
||||
type: gauge
|
||||
help: UPS load percentage
|
||||
|
||||
# NOTE(review): name restored after over-eager secret redaction. The Synology MIB object at this OID
# is upsBatteryChargeValue; confirm which spelling existing dashboards and alerts query.
- name: upsBatteryCharge
|
||||
oid: 1.3.6.1.4.1.6574.4.3.1.1
|
||||
type: gauge
|
||||
help: UPS battery charge percentage
|
||||
|
||||
- name: upsBatteryChargeWarning
|
||||
oid: 1.3.6.1.4.1.6574.4.3.1.2
|
||||
type: gauge
|
||||
help: UPS battery charge warning level
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY SERVICE USERS
|
||||
# ============================================
|
||||
- name: serviceName
|
||||
oid: 1.3.6.1.4.1.6574.6.1.1.2
|
||||
type: DisplayString
|
||||
help: Service name
|
||||
indexes:
|
||||
# Index label restored after over-eager secret redaction; it is the label the serviceUsers lookup keys on.
- labelname: serviceInfoIndex
|
||||
type: gauge
|
||||
|
||||
- name: serviceUsers
|
||||
oid: 1.3.6.1.4.1.6574.6.1.1.3
|
||||
type: gauge
|
||||
help: Number of users connected to service
|
||||
indexes:
|
||||
# Index label restored after over-eager secret redaction; it must match the lookup's "labels: [serviceInfoIndex]".
- labelname: serviceInfoIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [serviceInfoIndex]
|
||||
labelname: serviceName
|
||||
oid: 1.3.6.1.4.1.6574.6.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY STORAGE IO
|
||||
# ============================================
|
||||
- name: storageIODevice
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.2
|
||||
type: DisplayString
|
||||
help: Storage IO device name
|
||||
indexes:
|
||||
- labelname: storageIOIndex
|
||||
type: gauge
|
||||
|
||||
- name: storageIONReadX
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.12
|
||||
type: counter
|
||||
help: Total bytes read (64-bit)
|
||||
indexes:
|
||||
- labelname: storageIOIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [storageIOIndex]
|
||||
labelname: storageIODevice
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: storageIONWrittenX
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.13
|
||||
type: counter
|
||||
help: Total bytes written (64-bit)
|
||||
indexes:
|
||||
- labelname: storageIOIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [storageIOIndex]
|
||||
labelname: storageIODevice
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: storageIOLA
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.8
|
||||
type: gauge
|
||||
help: Storage IO load average
|
||||
indexes:
|
||||
- labelname: storageIOIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [storageIOIndex]
|
||||
labelname: storageIODevice
|
||||
oid: 1.3.6.1.4.1.6574.101.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
# ============================================
|
||||
# SYNOLOGY SPACE IO (Volume IO)
|
||||
# ============================================
|
||||
- name: spaceIODevice
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.2
|
||||
type: DisplayString
|
||||
help: Space/Volume IO device name
|
||||
indexes:
|
||||
- labelname: spaceIOIndex
|
||||
type: gauge
|
||||
|
||||
- name: spaceIONReadX
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.12
|
||||
type: counter
|
||||
help: Volume bytes read (64-bit)
|
||||
indexes:
|
||||
- labelname: spaceIOIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [spaceIOIndex]
|
||||
labelname: spaceIODevice
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
# Metric name restored after over-eager secret redaction; write-side counterpart of spaceIONReadX (OID ...102.1.1.13).
- name: spaceIONWrittenX
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.13
|
||||
type: counter
|
||||
help: Volume bytes written (64-bit)
|
||||
indexes:
|
||||
- labelname: spaceIOIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [spaceIOIndex]
|
||||
labelname: spaceIODevice
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.2
|
||||
type: DisplayString
|
||||
|
||||
- name: spaceIOLA
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.8
|
||||
type: gauge
|
||||
help: Volume IO load average
|
||||
indexes:
|
||||
- labelname: spaceIOIndex
|
||||
type: gauge
|
||||
lookups:
|
||||
- labels: [spaceIOIndex]
|
||||
labelname: spaceIODevice
|
||||
oid: 1.3.6.1.4.1.6574.102.1.1.2
|
||||
type: DisplayString
|
||||
@@ -0,0 +1 @@
|
||||
1-y71kjkcRGpoNXqSABU07nwduE0jUOrVXVfYOcSPdoZlPuFbKNG1gIPou74HcdqTr
|
||||
@@ -0,0 +1,62 @@
|
||||
# Prometheus + Grafana Monitoring Stack
|
||||
# Ports: 9090 (Prometheus), 3300 (Grafana)
|
||||
#
|
||||
# Config files are in prometheus/ and grafana/ subdirectories relative to this file
|
||||
# Dashboards provisioned: infrastructure-overview, node-details, node-exporter, synology-monitoring
|
||||
|
||||
services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus:/etc/prometheus
|
||||
- prometheus-data:/prometheus
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--web.enable-lifecycle"
|
||||
ports:
|
||||
- "9090:9090"
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana-oss:latest
|
||||
container_name: grafana
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
- GF_SECURITY_ADMIN_PASSWORD=REDACTED_PASSWORD
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
- ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
|
||||
- ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
|
||||
- ./grafana/dashboards:/etc/grafana/dashboards:ro
|
||||
ports:
|
||||
- "3300:3000"
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
- prometheus
|
||||
networks:
|
||||
- monitoring
|
||||
|
||||
node_exporter:
|
||||
image: prom/node-exporter:latest
|
||||
container_name: node_exporter
|
||||
network_mode: host
|
||||
pid: host
|
||||
volumes:
|
||||
- /:/host:ro,rslave
|
||||
- /sys:/host/sys:ro
|
||||
- /proc:/host/proc:ro
|
||||
command:
|
||||
- '--path.rootfs=/host'
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
prometheus-data:
|
||||
grafana-data:
|
||||
|
||||
networks:
|
||||
monitoring:
|
||||
driver: bridge
|
||||
@@ -0,0 +1,366 @@
|
||||
{
|
||||
"uid": "infrastructure-overview-v2",
|
||||
"title": "Infrastructure Overview - All Devices",
|
||||
"tags": [
|
||||
"infrastructure",
|
||||
"node-exporter",
|
||||
"tailscale"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": "",
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info, job)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Host",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(node_uname_info, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Device Status",
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"textMode": "value_and_name",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg by(job) (rate(node_cpu_seconds_total{mode=\"idle\", job=~\"$job\"}[5m])) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 5
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes{job=~\"$job\"} / node_memory_MemTotal_bytes{job=~\"$job\"})) * 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "bargauge",
|
||||
"title": "Root Disk Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 13
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=~\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=~\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 13
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time_seconds{job=~\"$job\"} - node_boot_time_seconds{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "timeseries",
|
||||
"title": "Network Receive",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 21
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(job) (rate(node_network_receive_bytes_total{job=~\"$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "timeseries",
|
||||
"title": "Network Transmit",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 21
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by(job) (rate(node_network_transmit_bytes_total{job=~\"$job\", device!~\"lo|docker.*|br-.*|veth.*\"}[5m]))",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,936 @@
|
||||
{
|
||||
"uid": "node-details-v2",
|
||||
"title": "Node Details - Full Metrics",
|
||||
"tags": [
|
||||
"node-exporter",
|
||||
"detailed",
|
||||
"infrastructure"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "prometheus",
|
||||
"value": "prometheus"
|
||||
},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info, job)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Host",
|
||||
"multi": false,
|
||||
"name": "job",
|
||||
"options": [],
|
||||
"query": "label_values(node_uname_info, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(node_uname_info{job=\"$job\"}, instance)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Instance",
|
||||
"multi": false,
|
||||
"name": "instance",
|
||||
"options": [],
|
||||
"query": "label_values(node_uname_info{job=\"$job\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udcca Quick Stats",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 4,
|
||||
"x": 0,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_time_seconds{job=\"$job\",instance=\"$instance\"} - node_boot_time_seconds{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Uptime",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "stat",
|
||||
"title": "CPU Cores",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 4,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "count(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"})",
|
||||
"legendFormat": "Cores",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Total RAM",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 7,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "purple",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "RAM",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "gauge",
|
||||
"title": "CPU",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 10,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "CPU",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "gauge",
|
||||
"title": "Memory",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 13,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(1 - (node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"} / node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"})) * 100",
|
||||
"legendFormat": "Memory",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"type": "gauge",
|
||||
"title": "Disk /",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 3,
|
||||
"x": 16,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",mountpoint=\"/\",fstype!=\"rootfs\"}) * 100)",
|
||||
"legendFormat": "Disk",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "stat",
|
||||
"title": "Load 1m",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 2,
|
||||
"x": 19,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load1{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "1m",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "stat",
|
||||
"title": "Load 5m",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 2,
|
||||
"x": 21,
|
||||
"y": 1
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"decimals": 2,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 2
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 4
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_load5{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "5m",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udda5\ufe0f CPU Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 5
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Usage Breakdown",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 6
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"custom": {
|
||||
"fillOpacity": 50,
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"user\"}[5m])) * 100",
|
||||
"legendFormat": "User",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"system\"}[5m])) * 100",
|
||||
"legendFormat": "System",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"iowait\"}[5m])) * 100",
|
||||
"legendFormat": "IOWait",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "avg(rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"steal\"}[5m])) * 100",
|
||||
"legendFormat": "Steal",
|
||||
"refId": "D"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"type": "timeseries",
|
||||
"title": "CPU Per Core",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 6
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - (rate(node_cpu_seconds_total{job=\"$job\",instance=\"$instance\",mode=\"idle\"}[5m]) * 100)",
|
||||
"legendFormat": "CPU {{cpu}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"type": "row",
|
||||
"title": "\ud83e\udde0 Memory Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"type": "timeseries",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 15
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes",
|
||||
"custom": {
|
||||
"fillOpacity": 30,
|
||||
"stacking": {
|
||||
"mode": "normal",
|
||||
"group": "A"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_MemTotal_bytes{job=\"$job\",instance=\"$instance\"} - node_memory_MemAvailable_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Used",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Buffers_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Buffers",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_Cached_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Cached",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_MemFree_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Free",
|
||||
"refId": "D"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"type": "timeseries",
|
||||
"title": "Swap Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 15
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "node_memory_SwapTotal_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Total",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "node_memory_SwapTotal_bytes{job=\"$job\",instance=\"$instance\"} - node_memory_SwapFree_bytes{job=\"$job\",instance=\"$instance\"}",
|
||||
"legendFormat": "Used",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 30,
|
||||
"type": "row",
|
||||
"title": "\ud83d\udcbe Disk Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 23
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 31,
|
||||
"type": "bargauge",
|
||||
"title": "Disk Space Usage",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 85
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"displayMode": "gradient",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((node_filesystem_avail_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{job=\"$job\",instance=\"$instance\",fstype!~\"tmpfs|overlay|squashfs\"}) * 100)",
|
||||
"legendFormat": "{{mountpoint}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 32,
|
||||
"type": "timeseries",
|
||||
"title": "Disk I/O",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*Write.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.transform",
|
||||
"value": "negative-Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_disk_read_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|dm-.*\"}[5m])",
|
||||
"legendFormat": "{{device}} Read",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_disk_written_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"loop.*|dm-.*\"}[5m])",
|
||||
"legendFormat": "{{device}} Write",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"type": "row",
|
||||
"title": "\ud83c\udf10 Network Details",
|
||||
"gridPos": {
|
||||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 32
|
||||
},
|
||||
"collapsed": false
|
||||
},
|
||||
{
|
||||
"id": 41,
|
||||
"type": "timeseries",
|
||||
"title": "Network Traffic",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 33
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bps"
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byRegexp",
|
||||
"options": ".*TX.*"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.transform",
|
||||
"value": "negative-Y"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean",
|
||||
"max"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m]) * 8",
|
||||
"legendFormat": "{{device}} RX",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_bytes_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m]) * 8",
|
||||
"legendFormat": "{{device}} TX",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 42,
|
||||
"type": "timeseries",
|
||||
"title": "Network Errors",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 33
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "pps"
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(node_network_receive_errs_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m])",
|
||||
"legendFormat": "{{device}} RX Errors",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(node_network_transmit_errs_total{job=\"$job\",instance=\"$instance\",device!~\"lo|docker.*|br-.*|veth.*\"}[5m])",
|
||||
"legendFormat": "{{device}} TX Errors",
|
||||
"refId": "B"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"id": null
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,351 @@
|
||||
{
|
||||
"uid": "synology-dashboard-v2",
|
||||
"title": "Synology NAS Monitoring",
|
||||
"tags": [
|
||||
"synology",
|
||||
"nas",
|
||||
"snmp"
|
||||
],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 38,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": {},
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Data Source",
|
||||
"multi": false,
|
||||
"name": "datasource",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"type": "datasource"
|
||||
},
|
||||
{
|
||||
"allValue": "",
|
||||
"current": {},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"definition": "label_values(ssCpuRawIdle, job)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "NAS",
|
||||
"multi": true,
|
||||
"name": "job",
|
||||
"query": "label_values(ssCpuRawIdle, job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "NAS Status",
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"mappings": [
|
||||
{
|
||||
"type": "value",
|
||||
"options": {
|
||||
"0": {
|
||||
"text": "DOWN",
|
||||
"color": "red"
|
||||
},
|
||||
"1": {
|
||||
"text": "UP",
|
||||
"color": "green"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"textMode": "value_and_name",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "up{job=~\"$job\"}",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "gauge",
|
||||
"title": "CPU Usage",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 60
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 80
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "100 - ((rate(ssCpuRawIdle{job=~\"$job\"}[5m]) / (rate(ssCpuRawUser{job=~\"$job\"}[5m]) + rate(ssCpuRawSystem{job=~\"$job\"}[5m]) + rate(ssCpuRawIdle{job=~\"$job\"}[5m]) + rate(ssCpuRawWait{job=~\"$job\"}[5m]))) * 100)",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "gauge",
|
||||
"title": "Memory Usage",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 8,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "yellow",
|
||||
"value": 70
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
"value": 90
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "((memTotalReal{job=~\"$job\"} - memAvailReal{job=~\"$job\"}) / memTotalReal{job=~\"$job\"}) * 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "stat",
|
||||
"title": "Total Memory",
|
||||
"gridPos": {
|
||||
"h": 6,
|
||||
"w": 8,
|
||||
"x": 16,
|
||||
"y": 4
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "decbytes",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "blue",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "none",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "memTotalReal{job=~\"$job\"} * 1024",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Load Average",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "table",
|
||||
"placement": "right",
|
||||
"calcs": [
|
||||
"mean"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"1\"}",
|
||||
"legendFormat": "{{job}} 1m",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"2\"}",
|
||||
"legendFormat": "{{job}} 5m",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "laLoad{job=~\"$job\", laIndex=\"3\"}",
|
||||
"legendFormat": "{{job}} 15m",
|
||||
"refId": "C"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"type": "stat",
|
||||
"title": "Uptime",
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "eeyq1w1zddtkwb"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
]
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sysUpTime{job=~\"$job\"} / 100",
|
||||
"legendFormat": "{{job}}",
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,13 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Homelab Dashboards'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
folderUid: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 30
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/dashboards
|
||||
@@ -0,0 +1,9 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
uid: eeyq1w1zddtkwb
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
@@ -0,0 +1,98 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["prometheus:9090"]
|
||||
|
||||
- job_name: "homelab-node"
|
||||
static_configs:
|
||||
- targets: ["100.67.40.126:9100"]
|
||||
|
||||
- job_name: "raspberry-pis"
|
||||
static_configs:
|
||||
- targets: ["100.77.151.40:9100"] # pi-5
|
||||
- targets: ["100.123.246.75:9100"] # pi-5-kevin
|
||||
|
||||
- job_name: "setillo-node"
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9100"]
|
||||
|
||||
- job_name: "setillo-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.125.0.20:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.125.0.20"
|
||||
- target_label: __address__
|
||||
replacement: "100.125.0.20:9116"
|
||||
|
||||
- job_name: "calypso-node"
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9100"]
|
||||
|
||||
- job_name: "calypso-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.103.48.78:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.103.48.78"
|
||||
- target_label: __address__
|
||||
replacement: "100.103.48.78:9116"
|
||||
|
||||
- job_name: "atlantis-node"
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9100"]
|
||||
|
||||
- job_name: "atlantis-snmp"
|
||||
metrics_path: /snmp
|
||||
params:
|
||||
module: [synology]
|
||||
auth: [snmpv3]
|
||||
target: ["127.0.0.1"]
|
||||
static_configs:
|
||||
- targets: ["100.83.230.112:9116"]
|
||||
relabel_configs:
|
||||
- source_labels: [__address__]
|
||||
target_label: __param_target
|
||||
replacement: "127.0.0.1"
|
||||
- source_labels: [__param_target]
|
||||
target_label: instance
|
||||
replacement: "100.83.230.112"
|
||||
- target_label: __address__
|
||||
replacement: "100.83.230.112:9116"
|
||||
|
||||
- job_name: "concord-nuc-node"
|
||||
static_configs:
|
||||
- targets: ["100.72.55.21:9100"]
|
||||
|
||||
- job_name: "truenas-node"
|
||||
static_configs:
|
||||
- targets: ["100.75.252.64:9100"]
|
||||
|
||||
- job_name: "vmi2076105-node"
|
||||
static_configs:
|
||||
- targets: ["100.99.156.20:9100"]
|
||||
|
||||
- job_name: "proxmox-node"
|
||||
static_configs:
|
||||
- targets: ["100.87.12.28:9100"]
|
||||
Reference in New Issue
Block a user