Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC
This commit is contained in:
278
hosts/synology/atlantis/grafana_prometheus/monitoring-stack.yaml
Normal file
278
hosts/synology/atlantis/grafana_prometheus/monitoring-stack.yaml
Normal file
@@ -0,0 +1,278 @@
|
||||
# =============================================================================
# HOMELAB MONITORING STACK - CRITICAL INFRASTRUCTURE VISIBILITY
# =============================================================================
#
# SERVICE OVERVIEW:
# - Complete monitoring solution for homelab infrastructure
# - Grafana: Visualization and dashboards
# - Prometheus: Metrics collection and storage
# - Node Exporter: System metrics (CPU, memory, disk, network)
# - SNMP Exporter: Network device monitoring (router, switches)
# - cAdvisor: Container metrics and resource usage
# - Blackbox Exporter: Service availability and response times
# - Speedtest Exporter: Internet connection monitoring
#
# DISASTER RECOVERY PRIORITY: HIGH
# - Essential for infrastructure visibility during outages
# - Contains historical performance data
# - Critical for troubleshooting and capacity planning
#
# RECOVERY TIME OBJECTIVE (RTO): 30 minutes
# RECOVERY POINT OBJECTIVE (RPO): 4 hours (metrics retention)
#
# DEPENDENCIES:
# - Volume2 for data persistence (separate from Volume1)
# - Network access to all monitored systems
# - SNMP access to network devices
# - Docker socket access for container monitoring
#
# =============================================================================

# NOTE(review): the services in this file use mem_limit / mem_reservation /
# cpu_shares and explicitly named networks. Those keys are not part of the
# legacy 3.0 file schema; confirm the Compose release used for deployment
# treats `version` as informational before relying on them.
version: '3'

services:
  # ==========================================================================
  # GRAFANA - Visualization and Dashboard Platform
  # ==========================================================================
  grafana:
    # CONTAINER IMAGE:
    # - grafana/grafana:latest: Official Grafana image
    # - Consider pinning version for production: grafana/grafana:10.2.0
    # - Auto-updates with Watchtower (monitor for breaking changes)
    image: grafana/grafana:latest

    # CONTAINER IDENTIFICATION:
    # - Grafana: Clear identification for monitoring and logs
    # - grafana: Internal hostname for service communication
    container_name: Grafana
    hostname: grafana

    # NETWORK CONFIGURATION:
    # - grafana-net: Isolated network for Grafana and data sources
    # - Allows secure communication with Prometheus
    # - Prevents unauthorized access to monitoring data
    networks:
      - grafana-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 512MB (sufficient for dashboards and queries)
    # - cpu_shares: 512 (medium priority, less than Prometheus)
    # - Grafana is lightweight but needs memory for dashboard rendering
    mem_limit: 512m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation attacks
    # - user: 1026:100 (Synology user/group for file permissions)
    # - CRITICAL: Must match NAS permissions for data access
    # - Quoted so the uid:gid pair is always read as a string
    security_opt:
      - no-new-privileges=true
    user: "1026:100"

    # HEALTH MONITORING:
    # - wget: Tests Grafana API health endpoint
    # - /api/health: Built-in Grafana health check
    # - Ensures web interface is responsive
    # - "|| exit 1" maps any wget failure code to a plain failure status,
    #   matching the Prometheus and SNMP exporter checks in this file
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1

    # NETWORK PORTS:
    # - 7099:3000: External port 7099 maps to internal Grafana port 3000
    # - Port 7099: Accessible via reverse proxy or direct access
    # - Port 3000: Standard Grafana web interface port
    ports:
      - "7099:3000"

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/data: Grafana configuration and data
    # - Contains: Dashboards, data sources, users, alerts, plugins
    # - BACKUP CRITICAL: Contains all dashboard configurations
    # - Volume2: Separate from Volume1 for redundancy
    volumes:
      - /volume2/metadata/docker/grafana/data:/var/lib/grafana:rw

    environment:
      # TIMEZONE CONFIGURATION:
      # - TZ: Timezone for logs and dashboard timestamps
      # - Must match system timezone for accurate time series data
      TZ: America/Los_Angeles

      # PLUGIN INSTALLATION:
      # - GF_INSTALL_PLUGINS: Comma-separated list of plugins to install
      # - grafana-clock-panel: Clock widget for dashboards
      # - grafana-simple-json-datasource: JSON data source support
      # - natel-discrete-panel: Discrete value visualization
      # - grafana-piechart-panel: Pie chart visualizations
      # - Plugins installed automatically on container start
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,natel-discrete-panel,grafana-piechart-panel

    # RESTART POLICY:
    # - on-failure:5: Restart up to 5 times on failure
    # - Critical for maintaining monitoring visibility
    # - Prevents infinite restart loops
    restart: on-failure:5

  # ==========================================================================
  # PROMETHEUS - Metrics Collection and Time Series Database
  # ==========================================================================
  prometheus:
    # CONTAINER IMAGE:
    # - prom/prometheus: Official Prometheus image
    # - No tag given, so the default (latest) tag is pulled
    # - Consider version pinning: prom/prometheus:v2.47.0
    image: prom/prometheus

    # PROMETHEUS CONFIGURATION:
    # - --storage.tsdb.retention.time=60d: Keep metrics for 60 days
    # - --config.file: Path to Prometheus configuration file
    # - Retention period balances storage usage vs. historical data
    command:
      - '--storage.tsdb.retention.time=60d'
      - '--config.file=/etc/prometheus/prometheus.yml'

    # CONTAINER IDENTIFICATION:
    # - Prometheus: Clear identification for monitoring
    # - prometheus-server: Internal hostname for service communication
    container_name: Prometheus
    hostname: prometheus-server

    # NETWORK CONFIGURATION:
    # - grafana-net: Communication with Grafana for data queries
    # - prometheus-net: Communication with exporters and targets
    # - Dual network setup for security and organization
    networks:
      - grafana-net
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 1GB (metrics database requires significant memory)
    # - cpu_shares: 768 (high priority for metrics collection)
    # - Memory usage scales with number of metrics and retention period
    mem_limit: 1g
    cpu_shares: 768

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation
    # - user: 1026:100 (Synology permissions for data storage)
    security_opt:
      - no-new-privileges=true
    user: "1026:100"

    # HEALTH MONITORING:
    # - wget: Tests Prometheus web interface availability
    # - Port 9090: Standard Prometheus web UI port
    # - Ensures metrics collection is operational
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9090/ || exit 1

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/prometheus: Time series database storage
    # - /volume2/metadata/docker/grafana/prometheus.yml: Configuration file
    # - BACKUP IMPORTANT: Contains historical metrics data
    # - Configuration file defines scrape targets and rules
    volumes:
      - /volume2/metadata/docker/grafana/prometheus:/prometheus:rw
      - /volume2/metadata/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml:ro

    # RESTART POLICY:
    # - on-failure:5: Restart on failure to maintain metrics collection
    # - Critical for continuous monitoring and alerting
    restart: on-failure:5

  # ==========================================================================
  # NODE EXPORTER - System Metrics
  # ==========================================================================
  node-exporter:
    image: prom/node-exporter:latest

    # COLLECTORS:
    # - --collector.disable-defaults switches off the default collector set,
    #   so only the collectors named explicitly below are active
    command:
      - '--collector.disable-defaults'
      - '--collector.stat'
      - '--collector.time'
      - '--collector.cpu'
      - '--collector.loadavg'
      - '--collector.hwmon'
      - '--collector.meminfo'
      - '--collector.diskstats'

    container_name: Prometheus-Node
    hostname: prometheus-node

    # Reachable only on prometheus-net; no host port is published
    networks:
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - 256MB hard limit, 64MB reservation, same cpu_shares as Grafana
    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges plus a read-only root filesystem
    # - Runs as the same 1026:100 user as Grafana and Prometheus
    security_opt:
      - no-new-privileges=true
    read_only: true
    user: "1026:100"

    # HEALTH MONITORING:
    # - Probes the exporter's HTTP endpoint on port 9100
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9100/ || exit 1

    restart: on-failure:5

  # ==========================================================================
  # SNMP EXPORTER - Network Device Monitoring
  # ==========================================================================
  snmp-exporter:
    image: prom/snmp-exporter:latest

    # Configuration is read from the read-only bind mount declared below
    command:
      - '--config.file=/etc/snmp_exporter/snmp.yml'

    container_name: Prometheus-SNMP
    hostname: prometheus-snmp

    # Reachable only on prometheus-net; no host port is published
    networks:
      - prometheus-net

    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges plus a read-only root filesystem
    security_opt:
      - no-new-privileges=true
    read_only: true
    user: "1026:100"

    # HEALTH MONITORING:
    # - Probes the exporter's HTTP endpoint on port 9116
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9116/ || exit 1

    # Host directory holding snmp.yml, mounted read-only
    volumes:
      - /volume2/metadata/docker/grafana/snmp:/etc/snmp_exporter/:ro

    restart: on-failure:5

  # ==========================================================================
  # CADVISOR - Container Metrics and Resource Usage
  # ==========================================================================
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest

    command:
      - '--docker_only=true'

    container_name: Prometheus-cAdvisor
    hostname: prometheus-cadvisor

    # Reachable only on prometheus-net; no host port is published
    networks:
      - prometheus-net

    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges plus a read-only root filesystem
    # - No `user` override is set for this service
    security_opt:
      - no-new-privileges=true
    read_only: true

    # HOST ACCESS:
    # - Host root, /var/run, /sys and the Docker socket, all mounted read-only
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro

    restart: on-failure:5

  # ==========================================================================
  # BLACKBOX EXPORTER - Service Availability and Response Times
  # ==========================================================================
  blackbox-exporter:
    # No tag given, so the default (latest) tag is pulled
    image: prom/blackbox-exporter
    container_name: blackbox-exporter
    networks:
      - prometheus-net

    # Published on the host as well as on prometheus-net
    ports:
      - "9115:9115"

    # Unlike the services above, restarts indefinitely unless stopped manually
    restart: unless-stopped

  # ==========================================================================
  # SPEEDTEST EXPORTER - Internet Connection Monitoring
  # ==========================================================================
  speedtest-exporter:
    # No tag given, so the default (latest) tag is pulled
    image: miguelndecarvalho/speedtest-exporter
    container_name: speedtest-exporter
    networks:
      - prometheus-net

    # Published on the host as well as on prometheus-net
    ports:
      - "9798:9798"

    # Unlike the services above, restarts indefinitely unless stopped manually
    restart: unless-stopped

# =============================================================================
# NETWORKS
# - Each network gets an explicit name and a fixed /24 subnet
# =============================================================================
networks:
  # Joined by Grafana and Prometheus
  grafana-net:
    name: grafana-net
    ipam:
      config:
        - subnet: 192.168.50.0/24

  # Joined by Prometheus and all exporters
  prometheus-net:
    name: prometheus-net
    ipam:
      config:
        - subnet: 192.168.51.0/24
Reference in New Issue
Block a user