# =============================================================================
# HOMELAB MONITORING STACK - CRITICAL INFRASTRUCTURE VISIBILITY
# =============================================================================
#
# SERVICE OVERVIEW:
# - Complete monitoring solution for homelab infrastructure
# - Grafana: Visualization and dashboards
# - Prometheus: Metrics collection and storage
# - Node Exporter: System metrics (CPU, memory, disk, network)
# - SNMP Exporter: Network device monitoring (router, switches)
# - cAdvisor: Container metrics and resource usage
# - Blackbox Exporter: Service availability and response times
# - Speedtest Exporter: Internet connection monitoring
#
# DISASTER RECOVERY PRIORITY: HIGH
# - Essential for infrastructure visibility during outages
# - Contains historical performance data
# - Critical for troubleshooting and capacity planning
#
# RECOVERY TIME OBJECTIVE (RTO): 30 minutes
# RECOVERY POINT OBJECTIVE (RPO): 4 hours (metrics retention)
#
# DEPENDENCIES:
# - Volume2 for data persistence (separate from Volume1)
# - Network access to all monitored systems
# - SNMP access to network devices
# - Docker socket access for container monitoring
#
# =============================================================================
---
# COMPOSE FILE FORMAT:
# - 2.4: The service-level keys used below (mem_limit, mem_reservation,
#   cpu_shares) and named networks are defined by the 2.x file format;
#   format 3.0 does not define them and legacy docker-compose rejects them
# - Compose V2 ("docker compose") treats this field as informational only
version: '2.4'

services:
  # ==========================================================================
  # GRAFANA - Visualization and Dashboard Platform
  # ==========================================================================
  grafana:
    # CONTAINER IMAGE:
    # - grafana/grafana:latest: Official Grafana image
    # - Consider pinning version for production: grafana/grafana:10.2.0
    # - Auto-updates with Watchtower (monitor for breaking changes)
    image: grafana/grafana:latest

    # CONTAINER IDENTIFICATION:
    # - Grafana: Clear identification for monitoring and logs
    # - grafana: Internal hostname for service communication
    container_name: Grafana
    hostname: grafana

    # NETWORK CONFIGURATION:
    # - grafana-net: Isolated network for Grafana and data sources
    # - Allows secure communication with Prometheus
    # - Prevents unauthorized access to monitoring data
    networks:
      - grafana-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 512MB (sufficient for dashboards and queries)
    # - cpu_shares: 512 (medium priority, less than Prometheus)
    # - Grafana is lightweight but needs memory for dashboard rendering
    mem_limit: 512m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation attacks
    # - user: 1026:100 (Synology user/group for file permissions)
    # - CRITICAL: Must match NAS permissions for data access
    # - user is quoted so the UID:GID pair is always read as a string
    security_opt:
      - no-new-privileges:true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget: Tests Grafana API health endpoint
    # - /api/health: Built-in Grafana health check
    # - Ensures web interface is responsive
    # - "|| exit 1": Maps any wget failure code to exit status 1 (unhealthy)
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1

    # NETWORK PORTS:
    # - 7099:3000: External port 7099 maps to internal Grafana port 3000
    # - Port 7099: Accessible via reverse proxy or direct access
    # - Port 3000: Standard Grafana web interface port
    # - Quoted so the HOST:CONTAINER pair is always read as a string
    ports:
      - '7099:3000'

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/data: Grafana configuration and data
    # - Contains: Dashboards, data sources, users, alerts, plugins
    # - BACKUP CRITICAL: Contains all dashboard configurations
    # - Volume2: Separate from Volume1 for redundancy
    volumes:
      - /volume2/metadata/docker/grafana/data:/var/lib/grafana:rw

    environment:
      # TIMEZONE CONFIGURATION:
      # - TZ: Timezone for logs and dashboard timestamps
      # - Must match system timezone for accurate time series data
      TZ: America/Los_Angeles

      # PLUGIN INSTALLATION:
      # - GF_INSTALL_PLUGINS: Comma-separated list of plugins to install
      # - grafana-clock-panel: Clock widget for dashboards
      # - grafana-simple-json-datasource: JSON data source support
      # - natel-discrete-panel: Discrete value visualization
      # - grafana-piechart-panel: Pie chart visualizations
      # - Plugins installed automatically on container start
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,natel-discrete-panel,grafana-piechart-panel

    # RESTART POLICY:
    # - on-failure:5: Restart up to 5 times on failure
    # - Critical for maintaining monitoring visibility
    # - Prevents infinite restart loops
    restart: on-failure:5

  # ==========================================================================
  # PROMETHEUS - Metrics Collection and Time Series Database
  # ==========================================================================
  prometheus:
    # CONTAINER IMAGE:
    # - prom/prometheus: Official Prometheus image
    # - Latest stable version with security updates
    # - Consider version pinning: prom/prometheus:v2.47.0
    image: prom/prometheus

    # PROMETHEUS CONFIGURATION:
    # - --storage.tsdb.retention.time=60d: Keep metrics for 60 days
    # - --config.file: Path to Prometheus configuration file
    # - Retention period balances storage usage vs. historical data
    command:
      - '--storage.tsdb.retention.time=60d'
      - '--config.file=/etc/prometheus/prometheus.yml'

    # CONTAINER IDENTIFICATION:
    # - Prometheus: Clear identification for monitoring
    # - prometheus-server: Internal hostname for service communication
    container_name: Prometheus
    hostname: prometheus-server

    # NETWORK CONFIGURATION:
    # - grafana-net: Communication with Grafana for data queries
    # - prometheus-net: Communication with exporters and targets
    # - Dual network setup for security and organization
    networks:
      - grafana-net
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 1GB (metrics database requires significant memory)
    # - cpu_shares: 768 (high priority for metrics collection)
    # - Memory usage scales with number of metrics and retention period
    mem_limit: 1g
    cpu_shares: 768

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation
    # - user: 1026:100 (Synology permissions for data storage)
    security_opt:
      - no-new-privileges:true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget: Tests Prometheus web interface availability
    # - Port 9090: Standard Prometheus web UI port
    # - Ensures metrics collection is operational
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9090/ || exit 1

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/prometheus: Time series database storage
    # - /volume2/metadata/docker/grafana/prometheus.yml: Configuration file
    # - BACKUP IMPORTANT: Contains historical metrics data
    # - Configuration file defines scrape targets and rules
    volumes:
      - /volume2/metadata/docker/grafana/prometheus:/prometheus:rw
      - /volume2/metadata/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml:ro

    # RESTART POLICY:
    # - on-failure:5: Restart on failure to maintain metrics collection
    # - Critical for continuous monitoring and alerting
    restart: on-failure:5

  # ==========================================================================
  # NODE EXPORTER - System Metrics
  # ==========================================================================
  node-exporter:
    image: prom/node-exporter:latest

    # COLLECTOR SELECTION:
    # - --collector.disable-defaults: Turns off the default collector set
    # - Only the collectors named below are then enabled explicitly
    # - NOTE(review): no host /proc, /sys or rootfs mounts are defined for
    #   this service, so the collectors read the container's own view of
    #   those paths - confirm this is intended
    command:
      - '--collector.disable-defaults'
      - '--collector.stat'
      - '--collector.time'
      - '--collector.cpu'
      - '--collector.loadavg'
      - '--collector.hwmon'
      - '--collector.meminfo'
      - '--collector.diskstats'

    container_name: Prometheus-Node
    hostname: prometheus-node

    # NETWORK CONFIGURATION:
    # - prometheus-net only: shares a network with Prometheus, not Grafana
    networks:
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 256MB hard cap, mem_reservation: 64MB soft reservation
    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges, read-only root filesystem, non-root user
    security_opt:
      - no-new-privileges:true
    read_only: true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget probe against port 9100 inside the container
    # - "|| exit 1": Maps any wget failure code to exit status 1 (unhealthy)
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9100/ || exit 1

    restart: on-failure:5

  # ==========================================================================
  # SNMP EXPORTER - Network Device Monitoring
  # ==========================================================================
  snmp-exporter:
    image: prom/snmp-exporter:latest

    # CONFIGURATION:
    # - --config.file: Points at snmp.yml inside the read-only mount below
    command:
      - '--config.file=/etc/snmp_exporter/snmp.yml'

    container_name: Prometheus-SNMP
    hostname: prometheus-snmp

    networks:
      - prometheus-net

    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges, read-only root filesystem, non-root user
    security_opt:
      - no-new-privileges:true
    read_only: true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget probe against port 9116 inside the container
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9116/ || exit 1

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/snmp: Mounted read-only as the
    #   exporter's configuration directory
    volumes:
      - /volume2/metadata/docker/grafana/snmp:/etc/snmp_exporter/:ro

    restart: on-failure:5

  # ==========================================================================
  # CADVISOR - Container Metrics
  # ==========================================================================
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest

    command:
      - '--docker_only=true'

    container_name: Prometheus-cAdvisor
    hostname: prometheus-cadvisor

    networks:
      - prometheus-net

    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges and read-only root filesystem
    # - No "user" override is set for this service
    security_opt:
      - no-new-privileges:true
    read_only: true

    # HOST ACCESS:
    # - Host root filesystem, /var/run, /sys and the Docker socket are all
    #   mounted read-only
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro

    restart: on-failure:5

  # ==========================================================================
  # BLACKBOX EXPORTER - Service Availability and Response Times
  # ==========================================================================
  # NOTE(review): unlike the services above, no resource limits, security
  # options or healthcheck are defined here, and the restart policy is
  # unless-stopped rather than on-failure:5 - confirm this is intended
  blackbox-exporter:
    image: prom/blackbox-exporter
    container_name: blackbox-exporter

    networks:
      - prometheus-net

    # NETWORK PORTS:
    # - 9115:9115: Publishes container port 9115 on the same host port
    ports:
      - '9115:9115'

    restart: unless-stopped

  # ==========================================================================
  # SPEEDTEST EXPORTER - Internet Connection Monitoring
  # ==========================================================================
  # NOTE(review): same reduced configuration as blackbox-exporter (no
  # limits, security options or healthcheck; restart: unless-stopped)
  speedtest-exporter:
    image: miguelndecarvalho/speedtest-exporter
    container_name: speedtest-exporter

    networks:
      - prometheus-net

    # NETWORK PORTS:
    # - 9798:9798: Publishes container port 9798 on the same host port
    ports:
      - '9798:9798'

    restart: unless-stopped

# =============================================================================
# NETWORKS
# =============================================================================
# - Both networks get a fixed name and an explicit /24 subnet
# - grafana-net: Joined by Grafana and Prometheus
# - prometheus-net: Joined by Prometheus and all exporters
networks:
  grafana-net:
    name: grafana-net
    ipam:
      config:
        - subnet: 192.168.50.0/24

  prometheus-net:
    name: prometheus-net
    ipam:
      config:
        - subnet: 192.168.51.0/24