# homelab-optimized/hosts/synology/atlantis/grafana_prometheus/monitoring-stack.yaml

# =============================================================================
# HOMELAB MONITORING STACK - CRITICAL INFRASTRUCTURE VISIBILITY
# =============================================================================
#
# SERVICE OVERVIEW:
# - Complete monitoring solution for homelab infrastructure
# - Grafana: Visualization and dashboards
# - Prometheus: Metrics collection and storage
# - Node Exporter: System metrics (CPU, memory, disk, network)
# - SNMP Exporter: Network device monitoring (router, switches)
# - cAdvisor: Container metrics and resource usage
# - Blackbox Exporter: Service availability and response times
# - Speedtest Exporter: Internet connection monitoring
#
# DISASTER RECOVERY PRIORITY: HIGH
# - Essential for infrastructure visibility during outages
# - Contains historical performance data
# - Critical for troubleshooting and capacity planning
#
# RECOVERY TIME OBJECTIVE (RTO): 30 minutes
# RECOVERY POINT OBJECTIVE (RPO): 4 hours (metrics retention)
#
# DEPENDENCIES:
# - Volume2 for data persistence (separate from Volume1)
# - Network access to all monitored systems
# - SNMP access to network devices
# - Docker socket access for container monitoring
#
# =============================================================================

# Compose file format declaration.
# NOTE(review): the services below set mem_limit and cpu_shares; confirm the
# Compose release running on the NAS accepts those keys under format '3'.
version: '3'

services:
  # ==========================================================================
  # GRAFANA - Visualization and Dashboard Platform
  # ==========================================================================
  grafana:
    # CONTAINER IMAGE:
    # - grafana/grafana:latest: Official Grafana image (floating tag)
    # - Consider pinning a version for production, e.g. grafana/grafana:10.2.0
    # - Auto-updates with Watchtower (monitor for breaking changes)
    image: grafana/grafana:latest

    # CONTAINER IDENTIFICATION:
    # - Grafana: Container name shown in monitoring and logs
    # - grafana: Internal hostname for service communication
    container_name: Grafana
    hostname: grafana

    # NETWORK CONFIGURATION:
    # - grafana-net: Network shared with Prometheus, the data source in this
    #   stack; the exporters are attached to prometheus-net instead
    networks:
      - grafana-net

    # RESOURCE ALLOCATION:
    # - mem_limit: 512 MB cap for dashboards and queries
    # - cpu_shares: 512 (relative weight, lower than Prometheus at 768)
    mem_limit: 512m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges=true: Prevents privilege escalation. Written with
    #   the `=` separator, the same form the Prometheus, Node Exporter and
    #   cAdvisor services in this file use.
    # - user: 1026:100 (Synology user/group for file permissions)
    # - CRITICAL: Must match NAS permissions on the data directory below
    # - Quoted so the colon-separated digits are always read as a string
    security_opt:
      - no-new-privileges=true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget --spider: Probes the Grafana health endpoint without downloading
    # - /api/health: Built-in Grafana health check
    # - || exit 1: Maps every wget failure code to 1, matching the Prometheus
    #   and SNMP exporter healthchecks
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1

    # NETWORK PORTS:
    # - 7099:3000: Host port 7099 maps to container port 3000 (Grafana web UI)
    # - Port 7099: Accessible via reverse proxy or direct access
    # - Quoted, as recommended for HOST:CONTAINER mappings in YAML
    ports:
      - '7099:3000'

    # DATA PERSISTENCE:
    # - /volume2/metadata/docker/grafana/data: Grafana configuration and data
    # - Contains: Dashboards, data sources, users, alerts, plugins
    # - BACKUP CRITICAL: Contains all dashboard configurations
    # - Volume2: Separate from Volume1 for redundancy
    volumes:
      - /volume2/metadata/docker/grafana/data:/var/lib/grafana:rw

    environment:
      # TIMEZONE CONFIGURATION:
      # - TZ: Timezone for logs and dashboard timestamps
      # - Keep in sync with the system timezone
      TZ: America/Los_Angeles

      # PLUGIN INSTALLATION:
      # - GF_INSTALL_PLUGINS: Comma-separated list of plugins to install
      # - grafana-clock-panel: Clock widget for dashboards
      # - grafana-simple-json-datasource: JSON data source support
      # - natel-discrete-panel: Discrete value visualization
      # - grafana-piechart-panel: Pie chart visualizations
      # - Plugins are installed automatically on container start
      GF_INSTALL_PLUGINS: grafana-clock-panel,grafana-simple-json-datasource,natel-discrete-panel,grafana-piechart-panel

    # RESTART POLICY:
    # - on-failure:5: Restart up to 5 times on failure
    # - Bounded retry count prevents infinite restart loops
    restart: on-failure:5

  # ==========================================================================
  # PROMETHEUS - Metrics Collection and Time Series Database
  # ==========================================================================
  prometheus:
    # IDENTITY:
    # - Container name "Prometheus", internal hostname "prometheus-server"
    container_name: Prometheus
    hostname: prometheus-server

    # IMAGE:
    # - prom/prometheus with no tag, i.e. whatever the default tag resolves to
    # - Consider version pinning, e.g. prom/prometheus:v2.47.0
    image: prom/prometheus

    # STARTUP FLAGS:
    # - 60 day TSDB retention (trade-off between disk usage and history)
    # - Configuration is read from the file bind-mounted under volumes
    command:
      - --storage.tsdb.retention.time=60d
      - --config.file=/etc/prometheus/prometheus.yml

    # NETWORKS:
    # - grafana-net: Lets Grafana query this server
    # - prometheus-net: Lets this server reach the exporters
    networks:
      - grafana-net
      - prometheus-net

    # STORAGE:
    # - First mount: time series database directory (read-write)
    # - Second mount: prometheus.yml with scrape targets and rules (read-only)
    # - BACKUP IMPORTANT: the database directory holds historical metrics
    volumes:
      - /volume2/metadata/docker/grafana/prometheus:/prometheus:rw
      - /volume2/metadata/docker/grafana/prometheus.yml:/etc/prometheus/prometheus.yml:ro

    # RESOURCES:
    # - 1 GB memory cap; usage grows with series count and retention
    # - CPU weight 768, the highest in this stack
    mem_limit: 1g
    cpu_shares: 768

    # HARDENING:
    # - Runs as the Synology user/group 1026:100 that owns the data directory
    # - no-new-privileges blocks privilege escalation inside the container
    user: '1026:100'
    security_opt:
      - no-new-privileges=true

    # HEALTHCHECK:
    # - Shell probe of the web UI on port 9090; any wget failure becomes exit 1
    # - Written in explicit CMD-SHELL list form
    healthcheck:
      test:
        - CMD-SHELL
        - wget --no-verbose --tries=1 --spider http://localhost:9090/ || exit 1

    # RESTART:
    # - Up to 5 automatic restarts after a failure
    restart: on-failure:5

  # ==========================================================================
  # NODE EXPORTER - System Metrics (CPU, memory, disk)
  # ==========================================================================
  node-exporter:
    # CONTAINER IMAGE:
    # - prom/node-exporter:latest: floating tag; consider pinning a version
    image: prom/node-exporter:latest

    # COLLECTOR SELECTION:
    # - --collector.disable-defaults: start from an empty collector set
    # - The remaining flags opt in to exactly the collectors listed
    # NOTE(review): no host /proc or /sys bind mounts are defined for this
    # service; confirm the reported values reflect the NAS host as intended.
    command:
      - --collector.disable-defaults
      - --collector.stat
      - --collector.time
      - --collector.cpu
      - --collector.loadavg
      - --collector.hwmon
      - --collector.meminfo
      - --collector.diskstats

    # CONTAINER IDENTIFICATION:
    container_name: Prometheus-Node
    hostname: prometheus-node

    # NETWORK CONFIGURATION:
    # - prometheus-net only: reachable by Prometheus, no host port published
    networks:
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - 256 MB hard cap, 64 MB reservation, CPU weight 512
    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges: Prevents privilege escalation
    # - read_only: Container root filesystem is mounted read-only
    # - user: 1026:100 (same Synology user/group as the other services);
    #   quoted so the colon-separated digits are always read as a string
    security_opt:
      - no-new-privileges=true
    read_only: true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget --spider: Probes the exporter's HTTP listener on port 9100
    # - || exit 1: Maps every wget failure code to 1, matching the Prometheus
    #   and SNMP exporter healthchecks
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9100/ || exit 1

    # RESTART POLICY:
    # - on-failure:5: Restart up to 5 times on failure
    restart: on-failure:5

  # ==========================================================================
  # SNMP EXPORTER - Network Device Monitoring
  # ==========================================================================
  snmp-exporter:
    # CONTAINER IMAGE:
    # - prom/snmp-exporter:latest: floating tag; consider pinning a version
    image: prom/snmp-exporter:latest

    # EXPORTER CONFIGURATION:
    # - --config.file: snmp.yml is read from the directory bind-mounted below
    command:
      - '--config.file=/etc/snmp_exporter/snmp.yml'

    # CONTAINER IDENTIFICATION:
    container_name: Prometheus-SNMP
    hostname: prometheus-snmp

    # NETWORK CONFIGURATION:
    # - prometheus-net only: reachable by Prometheus, no host port published
    networks:
      - prometheus-net

    # RESOURCE ALLOCATION:
    # - 256 MB hard cap, 64 MB reservation, CPU weight 512
    mem_limit: 256m
    mem_reservation: 64m
    cpu_shares: 512

    # SECURITY CONFIGURATION:
    # - no-new-privileges=true: Prevents privilege escalation. Written with
    #   the `=` separator, the same form the Prometheus, Node Exporter and
    #   cAdvisor services in this file use.
    # - read_only: Container root filesystem is mounted read-only
    # - user: 1026:100 (Synology user/group that can read the config mount);
    #   quoted so the colon-separated digits are always read as a string
    security_opt:
      - no-new-privileges=true
    read_only: true
    user: '1026:100'

    # HEALTH MONITORING:
    # - wget --spider: Probes the exporter's HTTP listener on port 9116
    healthcheck:
      test: wget --no-verbose --tries=1 --spider http://localhost:9116/ || exit 1

    # CONFIGURATION MOUNT:
    # - Host directory with snmp.yml, mounted read-only
    volumes:
      - /volume2/metadata/docker/grafana/snmp:/etc/snmp_exporter/:ro

    # RESTART POLICY:
    # - on-failure:5: Restart up to 5 times on failure
    restart: on-failure:5

  # ==========================================================================
  # CADVISOR - Container Metrics and Resource Usage
  # ==========================================================================
  cadvisor:
    # IDENTITY:
    container_name: Prometheus-cAdvisor
    hostname: prometheus-cadvisor

    # IMAGE:
    # - Floating "latest" tag from gcr.io; consider pinning a version
    image: gcr.io/cadvisor/cadvisor:latest

    # STARTUP FLAGS:
    # - docker_only=true: presumably limits reporting to Docker containers;
    #   confirm against the cAdvisor flag reference
    command:
      - --docker_only=true

    # NETWORKS:
    # - prometheus-net only; no host port is published
    networks:
      - prometheus-net

    # HOST ACCESS (all read-only):
    # - Host root filesystem, /var/run, /sys and the Docker socket
    # - No `user` is set for this service, unlike the other exporters
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro

    # RESOURCES:
    # - 256 MB hard cap, 64 MB reservation, CPU weight 512
    cpu_shares: 512
    mem_limit: 256m
    mem_reservation: 64m

    # HARDENING:
    # - Read-only root filesystem, privilege escalation blocked
    read_only: true
    security_opt:
      - no-new-privileges=true

    # RESTART:
    # - Up to 5 automatic restarts after a failure
    restart: on-failure:5

  # ==========================================================================
  # BLACKBOX EXPORTER - Service Availability and Response Times
  # ==========================================================================
  blackbox-exporter:
    container_name: blackbox-exporter
    # Untagged image reference; consider pinning a version.
    image: prom/blackbox-exporter
    # Restarted automatically unless it was stopped on purpose.
    restart: unless-stopped
    # Publishes 9115 on the host in addition to joining prometheus-net.
    ports:
      - '9115:9115'
    networks:
      - prometheus-net

  # ==========================================================================
  # SPEEDTEST EXPORTER - Internet Connection Monitoring
  # ==========================================================================
  speedtest-exporter:
    container_name: speedtest-exporter
    # Untagged image reference; consider pinning a version.
    image: miguelndecarvalho/speedtest-exporter
    # Restarted automatically unless it was stopped on purpose.
    restart: unless-stopped
    # Publishes 9798 on the host in addition to joining prometheus-net.
    ports:
      - '9798:9798'
    networks:
      - prometheus-net

# =============================================================================
# NETWORKS
# - Each network gets an explicit name and a fixed /24 subnet.
# =============================================================================
networks:
  # Shared by Grafana and Prometheus.
  grafana-net:
    name: grafana-net
    ipam:
      config:
        - subnet: '192.168.50.0/24'

  # Shared by Prometheus and all exporters.
  prometheus-net:
    name: prometheus-net
    ipam:
      config:
        - subnet: '192.168.51.0/24'