# =============================================================================
# HOMELAB MONITORING STACK - CRITICAL INFRASTRUCTURE VISIBILITY
# =============================================================================
#
# SERVICE OVERVIEW:
# - Complete monitoring solution for homelab infrastructure
# - Grafana: Visualization and dashboards
# - Prometheus: Metrics collection and storage
# - Node Exporter: System metrics (CPU, memory, disk, network)
# - SNMP Exporter: Network device monitoring (router, switches)
# - cAdvisor: Container metrics and resource usage
# - Blackbox Exporter: Service availability and response times
# - Speedtest Exporter: Internet connection monitoring
#
# DISASTER RECOVERY PRIORITY: HIGH
# - Essential for infrastructure visibility during outages
# - Contains historical performance data
# - Critical for troubleshooting and capacity planning
#
# RECOVERY TIME OBJECTIVE (RTO): 30 minutes
# RECOVERY POINT OBJECTIVE (RPO): 4 hours (metrics retention)
#
# DEPENDENCIES:
# - Volume2 for data persistence (separate from Volume1)
# - Network access to all monitored systems
# - SNMP access to network devices
# - Docker socket access for container monitoring
#
# =============================================================================

# COMPOSE FILE FORMAT:
# - Legacy docker-compose (1.x) validates the file against this version.
# - The services in this file use mem_limit and cpu_shares, which are
#   version 2 options that the legacy 3.x format rejects, so 2.4 is declared.
# - Compose V2 (Compose Specification) treats this key as informational only.
version: '2.4'

services:
  # ==========================================================================
  # GRAFANA - Visualization and Dashboard Platform
  # ==========================================================================
  grafana:
    # CONTAINER IMAGE:
    # - grafana/grafana:latest: Official Grafana image (floating tag)
    # - Consider pinning version for production: grafana/grafana:10.2.0
    # - Auto-updates with Watchtower (monitor for breaking changes)
    image: grafana/grafana:latest

    # CONTAINER IDENTIFICATION:
    # - Grafana: Clear identification for monitoring and logs
    # - grafana: Internal hostname for service communication
    container_name: Grafana
    hostname: grafana

    # NETWORK CONFIGURATION:
    # - grafana-net: Network shared with Prometheus (the data source)
    # - Grafana is not attached to prometheus-net, so it cannot reach the
    #   exporters directly
    networks:
      - grafana-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 512MB (sufficient for dashboards and queries)
    # - cpu_shares: 512 (medium priority, less than Prometheus at 768)
    mem_limit: 512m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation attacks
    #   ('=' separator; the ':' form is deprecated by Docker)
    # - user: 1026:100 (Synology user/group for file permissions)
    # - CRITICAL: Must match NAS permissions for data access
    # - Quoted so the UID:GID pair is always read as a string
    security_opt:
      - no-new-privileges=true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget: Tests Grafana API health endpoint
    # - /api/health: Built-in Grafana health check
    # - '|| exit 1': wget can exit with codes 2-8; Docker health checks
    #   should only return 0 (healthy) or 1 (unhealthy)
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1

    # NETWORK PORTS:
    # - 7099:3000: External port 7099 maps to internal Grafana port 3000
    # - Port 7099: Accessible via reverse proxy or direct access
    # - Quoted so YAML never interprets HOST:CONTAINER as a number
    ports:
      - '7099:3000'

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/data: Grafana configuration and data
    # - Contains: Dashboards, data sources, users, alerts, plugins
    # - BACKUP CRITICAL: Contains all dashboard configurations
    # - Volume2: Separate from Volume1 for redundancy
    volumes:
      - /volume2/metadata/docker/grafana/data:/var/lib/grafana:rw

    environment:
      # TIMEZONE CONFIGURATION:
      # - TZ: Timezone for logs and dashboard timestamps
      # - Must match system timezone for accurate time series data
      TZ: America/Los_Angeles

      # PLUGIN INSTALLATION:
      # - GF_INSTALL_PLUGINS: Comma-separated list of plugins to install
      # - grafana-clock-panel: Clock widget for dashboards
      # - grafana-simple-json-datasource: JSON data source support
      # - natel-discrete-panel: Discrete value visualization
      # - grafana-piechart-panel: Pie chart visualizations
      # - Plugins installed automatically on container start
      # - Kept on one line: folding would insert spaces into the list
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,natel-discrete-panel,grafana-piechart-panel

    # RESTART POLICY:
    # - on-failure:5: Restart up to 5 times on failure
    # - Prevents infinite restart loops
    restart: on-failure:5

  # ==========================================================================
  # PROMETHEUS - Metrics Collection and Time Series Database
  # ==========================================================================
  prometheus:
    # CONTAINER IMAGE:
    # - prom/prometheus: Official Prometheus image (no tag, so "latest")
    # - Consider version pinning: prom/prometheus:v2.47.0
    image: prom/prometheus

    # PROMETHEUS CONFIGURATION:
    # - --storage.tsdb.retention.time=60d: Keep metrics for 60 days
    # - --config.file: Path to Prometheus configuration file
    # - This list replaces the image's default arguments entirely
    # - NOTE(review): no --storage.tsdb.path is passed, so the TSDB location
    #   falls back to the Prometheus default relative to the image's working
    #   directory; confirm it lands inside the /prometheus volume below
    #   before changing this list.
    command:
      - '--storage.tsdb.retention.time=60d'
      - '--config.file=/etc/prometheus/prometheus.yml'

    # CONTAINER IDENTIFICATION:
    # - Prometheus: Clear identification for monitoring
    # - prometheus-server: Internal hostname for service communication
    container_name: Prometheus
    hostname: prometheus-server

    # NETWORK CONFIGURATION:
    # - grafana-net: Communication with Grafana for data queries
    # - prometheus-net: Communication with exporters and targets
    # - Prometheus is the only service attached to both networks
    networks:
      - grafana-net
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 1GB (metrics database requires significant memory)
    # - cpu_shares: 768 (highest CPU weight in this stack)
    # - Memory usage scales with number of metrics and retention period
    mem_limit: 1g
    cpu_shares: 768

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation
    # - user: 1026:100 (Synology permissions for data storage)
    security_opt:
      - no-new-privileges=true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget: Tests Prometheus web interface availability
    # - Port 9090: Standard Prometheus web UI port
    # - No host port is published; the UI is reachable only from the
    #   attached Docker networks
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9090/ || exit 1

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/prometheus: Time series database storage
    # - /volume2/metadata/docker/grafana/prometheus.yml: Configuration file,
    #   mounted read-only
    # - BACKUP IMPORTANT: Contains historical metrics data
    volumes:
      - /volume2/metadata/docker/grafana/prometheus:/prometheus:rw
      - /volume2/metadata/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml:ro

    # RESTART POLICY:
    # - on-failure:5: Restart on failure to maintain metrics collection
    restart: on-failure:5

  # ==========================================================================
  # NODE EXPORTER - System Metrics
  # ==========================================================================
  node-exporter:
    image: prom/node-exporter:latest

    # COLLECTORS:
    # - --collector.disable-defaults turns every default collector off;
    #   only the collectors named after it are enabled.
    # - NOTE(review): no host /proc, /sys or / is mounted into this
    #   container, so these collectors presumably report the container's
    #   view rather than the NAS itself - confirm this is intended.
    command:
      - '--collector.disable-defaults'
      - '--collector.stat'
      - '--collector.time'
      - '--collector.cpu'
      - '--collector.loadavg'
      - '--collector.hwmon'
      - '--collector.meminfo'
      - '--collector.diskstats'

    container_name: Prometheus-Node
    hostname: prometheus-node

    # Reachable only from prometheus-net; no host port is published.
    networks:
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - mem_limit: hard cap; mem_reservation: soft reservation
    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges, read-only root filesystem, non-root UID:GID
    security_opt:
      - no-new-privileges=true
    read_only: true
    user: '1026:100'

    # HEALTH MONITORING:
    # - Probes the exporter's HTTP listener on port 9100
    # - '|| exit 1' maps any wget failure code to Docker's "unhealthy" (1)
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9100/ || exit 1

    restart: on-failure:5

  # ==========================================================================
  # SNMP EXPORTER - Network Device Monitoring
  # ==========================================================================
  snmp-exporter:
    image: prom/snmp-exporter:latest

    # Loads snmp.yml from the read-only config directory mounted below.
    command:
      - '--config.file=/etc/snmp_exporter/snmp.yml'

    container_name: Prometheus-SNMP
    hostname: prometheus-snmp

    # Reachable only from prometheus-net; no host port is published.
    networks:
      - prometheus-net

    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - Same hardening as node-exporter ('=' separator, not deprecated ':')
    security_opt:
      - no-new-privileges=true
    read_only: true
    user: '1026:100'

    # HEALTH MONITORING:
    # - Probes the exporter's HTTP listener on port 9116
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9116/ || exit 1

    # CONFIGURATION:
    # - Host directory containing snmp.yml, mounted read-only
    volumes:
      - /volume2/metadata/docker/grafana/snmp:/etc/snmp_exporter/:ro

    restart: on-failure:5

  # ==========================================================================
  # CADVISOR - Container Metrics
  # ==========================================================================
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest

    # --docker_only=true: passed through to cAdvisor unchanged.
    command:
      - '--docker_only=true'

    container_name: Prometheus-cAdvisor
    hostname: prometheus-cadvisor

    # Reachable only from prometheus-net; no host port is published.
    networks:
      - prometheus-net

    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # No "user" override here, unlike the other hardened services.
    security_opt:
      - no-new-privileges=true
    read_only: true

    # HOST ACCESS (all read-only):
    # - /, /var/run and /sys from the host, plus the Docker socket
    # - The socket path already sits inside the /var/run mount; the
    #   explicit entry is kept as-is.
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro

    restart: on-failure:5

  # ==========================================================================
  # BLACKBOX EXPORTER - Service Availability Probes
  # ==========================================================================
  # NOTE(review): unlike the services above, this one sets no memory/CPU
  # limits, no security_opt and no healthcheck, and restarts with
  # unless-stopped instead of on-failure:5 - confirm that is deliberate.
  blackbox-exporter:
    image: prom/blackbox-exporter
    container_name: blackbox-exporter

    networks:
      - prometheus-net

    # Published on the host as well as on prometheus-net.
    # Quoted so YAML never interprets HOST:CONTAINER as a number.
    ports:
      - '9115:9115'

    restart: unless-stopped

  # ==========================================================================
  # SPEEDTEST EXPORTER - Internet Connection Monitoring
  # ==========================================================================
  # NOTE(review): same gaps as blackbox-exporter (no limits, no
  # security_opt, no healthcheck) - confirm that is deliberate.
  speedtest-exporter:
    image: miguelndecarvalho/speedtest-exporter
    container_name: speedtest-exporter

    networks:
      - prometheus-net

    # Published on the host as well as on prometheus-net.
    ports:
      - '9798:9798'

    restart: unless-stopped

# =============================================================================
# NETWORKS - one per traffic path, each with an explicit name and subnet
# =============================================================================
# - "name" fixes the Docker network name instead of letting Compose derive
#   it from the project name.
# - NOTE(review): both subnets are in 192.168.0.0/16 - confirm neither
#   overlaps a real LAN or VPN range reachable from this host.
networks:
  # Joined by grafana and prometheus.
  grafana-net:
    name: grafana-net
    ipam:
      config:
        - subnet: 192.168.50.0/24

  # Joined by prometheus and every exporter.
  prometheus-net:
    name: prometheus-net
    ipam:
      config:
        - subnet: 192.168.51.0/24