Files
homelab-optimized/hosts/synology/atlantis/grafana_prometheus/monitoring-stack.yaml
Gitea Mirror Bot bd82e850ee
Some checks failed
Documentation / Deploy to GitHub Pages (push) Has been cancelled
Documentation / Build Docusaurus (push) Has started running
Sanitized mirror from private repository - 2026-03-21 11:14:37 UTC
2026-03-21 11:14:37 +00:00

279 lines
9.4 KiB
YAML

# =============================================================================
# HOMELAB MONITORING STACK - CRITICAL INFRASTRUCTURE VISIBILITY
# =============================================================================
#
# SERVICE OVERVIEW:
# - Complete monitoring solution for homelab infrastructure
# - Grafana: Visualization and dashboards
# - Prometheus: Metrics collection and storage
# - Node Exporter: System metrics (CPU, memory, disk, network)
# - SNMP Exporter: Network device monitoring (router, switches)
# - cAdvisor: Container metrics and resource usage
# - Blackbox Exporter: Service availability and response times
# - Speedtest Exporter: Internet connection monitoring
#
# DISASTER RECOVERY PRIORITY: HIGH
# - Essential for infrastructure visibility during outages
# - Contains historical performance data
# - Critical for troubleshooting and capacity planning
#
# RECOVERY TIME OBJECTIVE (RTO): 30 minutes
# RECOVERY POINT OBJECTIVE (RPO): 4 hours of recent metrics (Prometheus itself retains 60 days on disk)
#
# DEPENDENCIES:
# - Volume2 for data persistence (separate from Volume1)
# - Network access to all monitored systems
# - SNMP access to network devices
# - Docker socket access for container monitoring
#
# =============================================================================
# Compose file format declaration, followed by the service definitions.
# NOTE(review): recent Docker Compose releases reportedly ignore the
# top-level `version` key - confirm before relying on it.
version: "3"
services:
# ==========================================================================
# GRAFANA - Visualization and Dashboard Platform
# ==========================================================================
grafana:
  # Grafana: visualization and dashboards on top of the Prometheus data.
  #
  # IMAGE:
  # - Tracks the floating `latest` tag (auto-updated by Watchtower).
  # - Consider pinning a release for production, e.g. grafana/grafana:10.2.0,
  #   and watch for breaking changes on update.
  image: grafana/grafana:latest
  # IDENTIFICATION:
  # - container_name: name shown in logs and container listings.
  # - hostname: internal name used for service-to-service communication.
  container_name: Grafana
  hostname: grafana
  # NETWORK:
  # - grafana-net is shared with Prometheus so Grafana can query it.
  networks:
    - grafana-net
  # RESOURCES:
  # - 512 MB memory cap; CPU weight 512 (lower than Prometheus at 768).
  mem_limit: 512m
  cpu_shares: 512
  # SECURITY:
  # - no-new-privileges blocks privilege escalation inside the container.
  #   The `=` separator is used to match the other services in this file.
  # - user 1026:100 is the Synology user/group; it must match the NAS
  #   permissions on the data directory below. Quoted so it is always
  #   read as a string.
  security_opt:
    - no-new-privileges=true
  user: '1026:100'
  # HEALTH CHECK:
  # - Probes Grafana's built-in /api/health endpoint.
  # - `|| exit 1` collapses any wget failure code to 1, the exit status
  #   Docker health checks use for "unhealthy".
  healthcheck:
    test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1
  # PORTS:
  # - Host port 7099 -> container port 3000 (Grafana web interface).
  # - Quoted so the mapping is always parsed as a string.
  ports:
    - '7099:3000'
  # DATA PERSISTENCE (BACKUP CRITICAL):
  # - Holds dashboards, data sources, users, alerts and plugins.
  # - Lives on Volume2, separate from Volume1, for redundancy.
  volumes:
    - /volume2/metadata/docker/grafana/data:/var/lib/grafana:rw
  environment:
    # Timezone used for logs and dashboard timestamps; should match the
    # system timezone.
    TZ: America/Los_Angeles
    # Comma-separated plugins installed automatically on container start:
    # clock panel, simple JSON data source, discrete panel, pie chart panel.
    GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,natel-discrete-panel,grafana-piechart-panel
  # RESTART POLICY:
  # - Restart up to 5 times on failure; the cap prevents endless loops.
  restart: on-failure:5
# ==========================================================================
# PROMETHEUS - Metrics Collection and Time Series Database
# ==========================================================================
prometheus:
  # Prometheus: scrapes the exporters and stores the time series that
  # Grafana queries.
  #
  # No tag is given, so the image resolves to the default tag; consider
  # pinning a release such as prom/prometheus:v2.47.0.
  image: prom/prometheus
  container_name: Prometheus
  # Internal name used by other services to reach this container.
  hostname: prometheus-server
  restart: on-failure:5
  # Startup flags:
  # - retention: keep metrics for 60 days (storage use vs. history).
  # - config.file: path of the configuration file mounted below.
  command:
    - --storage.tsdb.retention.time=60d
    - --config.file=/etc/prometheus/prometheus.yml
  # Dual-homed: grafana-net for queries from Grafana, prometheus-net for
  # reaching the exporters and other scrape targets.
  networks:
    - grafana-net
    - prometheus-net
  # 1 GB memory cap and the highest CPU weight in this stack (768); memory
  # use grows with the number of metrics and the retention period.
  mem_limit: 1g
  cpu_shares: 768
  # Runs as the Synology user/group so it can write to the data directory,
  # with privilege escalation disabled.
  user: '1026:100'
  security_opt:
    - no-new-privileges=true
  # Probes the web UI on the standard Prometheus port 9090.
  healthcheck:
    test: wget --no-verbose --tries=1 --spider http://localhost:9090/ || exit 1
  # Persistent state (BACKUP IMPORTANT):
  # - first mount: time series database storage (historical metrics).
  # - second mount: read-only configuration defining scrape targets/rules.
  volumes:
    - /volume2/metadata/docker/grafana/prometheus:/prometheus:rw
    - /volume2/metadata/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml:ro
node-exporter:
  # Node Exporter: system metrics (CPU, memory, disk, load) for Prometheus.
  image: prom/node-exporter:latest
  # The default collector set is disabled; only the collectors listed
  # after --collector.disable-defaults are switched on.
  command:
    - --collector.disable-defaults
    - --collector.stat
    - --collector.time
    - --collector.cpu
    - --collector.loadavg
    - --collector.hwmon
    - --collector.meminfo
    - --collector.diskstats
  container_name: Prometheus-Node
  hostname: prometheus-node
  # Only attached to the exporter network; no host ports are published.
  networks:
    - prometheus-net
  # 256 MB hard cap with a 64 MB reservation; medium CPU weight.
  mem_limit: 256m
  mem_reservation: 64m
  cpu_shares: 512
  # Hardening: no privilege escalation, read-only root filesystem, and the
  # same Synology user/group as the rest of the stack (quoted so it is
  # always read as a string).
  security_opt:
    - no-new-privileges=true
  read_only: true
  user: '1026:100'
  # Probes the exporter's HTTP port. `|| exit 1` collapses any wget
  # failure code to 1, the exit status Docker health checks use for
  # "unhealthy", matching the Prometheus and SNMP exporter checks.
  healthcheck:
    test: wget --no-verbose --tries=1 --spider http://localhost:9100/ || exit 1
  restart: on-failure:5
snmp-exporter:
  # SNMP Exporter: network device monitoring (router, switches).
  image: prom/snmp-exporter:latest
  # Loads its configuration from the read-only mount defined below.
  command:
    - '--config.file=/etc/snmp_exporter/snmp.yml'
  container_name: Prometheus-SNMP
  hostname: prometheus-snmp
  networks:
    - prometheus-net
  # 256 MB hard cap with a 64 MB reservation; medium CPU weight.
  mem_limit: 256m
  mem_reservation: 64m
  cpu_shares: 512
  # Hardening: no privilege escalation (written with the `=` separator, as
  # in the other exporters), read-only root filesystem, Synology
  # user/group (quoted so it is always read as a string).
  security_opt:
    - no-new-privileges=true
  read_only: true
  user: '1026:100'
  # Probes the exporter's HTTP port; any wget failure maps to exit code 1.
  healthcheck:
    test: wget --no-verbose --tries=1 --spider http://localhost:9116/ || exit 1
  # Host directory containing snmp.yml, mounted read-only.
  volumes:
    - /volume2/metadata/docker/grafana/snmp:/etc/snmp_exporter/:ro
  restart: on-failure:5
cadvisor:
  # cAdvisor: container metrics and resource usage.
  image: gcr.io/cadvisor/cadvisor:latest
  container_name: Prometheus-cAdvisor
  hostname: prometheus-cadvisor
  restart: on-failure:5
  command:
    - --docker_only=true
  networks:
    - prometheus-net
  # Same resource envelope as the other exporters.
  cpu_shares: 512
  mem_reservation: 64m
  mem_limit: 256m
  # Read-only root filesystem with privilege escalation disabled. No
  # `user` override is set for this service.
  read_only: true
  security_opt:
    - no-new-privileges=true
  # Host paths exposed read-only, including the Docker socket for
  # container monitoring.
  # NOTE(review): /var/run is already mounted, so the explicit docker.sock
  # mount looks redundant - kept as in the original; confirm before
  # removing.
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/run/docker.sock:/var/run/docker.sock:ro
blackbox-exporter:
  # Blackbox Exporter: service availability and response-time probing.
  # No tag is given, so the image resolves to the default tag.
  image: prom/blackbox-exporter
  container_name: blackbox-exporter
  # Unlike the other services in this file, this one restarts
  # unconditionally unless it was explicitly stopped.
  restart: unless-stopped
  networks:
    - prometheus-net
  # Publishes the exporter on host port 9115.
  ports:
    - '9115:9115'
speedtest-exporter:
  # Speedtest Exporter: internet connection monitoring.
  # No tag is given, so the image resolves to the default tag.
  image: miguelndecarvalho/speedtest-exporter
  container_name: speedtest-exporter
  # Restarts unconditionally unless explicitly stopped.
  restart: unless-stopped
  networks:
    - prometheus-net
  # Publishes the exporter on host port 9798.
  ports:
    - '9798:9798'
# Two named networks with fixed subnets:
# - grafana-net: Grafana <-> Prometheus.
# - prometheus-net: Prometheus <-> exporters.
networks:
  grafana-net:
    name: grafana-net
    ipam:
      config:
        - subnet: '192.168.50.0/24'
  prometheus-net:
    name: prometheus-net
    ipam:
      config:
        - subnet: '192.168.51.0/24'