Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC

2026-04-18 11:19:59 +00:00
commit fb00a325d1
1418 changed files with 359990 additions and 0 deletions
--- a/ansible/automation/playbooks/service_health_deep.yml
+++ b/ansible/automation/playbooks/service_health_deep.yml
@@ -0,0 +1,524 @@
+---
+# Deep Service Health Check Playbook
+# Comprehensive health monitoring for all homelab services
+# Usage: ansible-playbook playbooks/service_health_deep.yml
+# Usage: ansible-playbook playbooks/service_health_deep.yml -e "include_performance=true"
+# Usage: ansible-playbook playbooks/service_health_deep.yml -e "alert_on_issues=true"
+
+- name: Deep Service Health Check  # probes Docker, containers, endpoints, dependencies and logs on each host
+  hosts: "{{ host_target | default('all') }}"  # narrow the run with -e host_target=<pattern>
+  gather_facts: true  # ansible_date_time, used in report paths, comes from gathered facts
+  vars:  # literal defaults; -e extra vars outrank play vars, so command-line overrides keep working
+    include_performance: true  # not templated from its own name: that form fails as a recursive template
+    alert_on_issues: false
+    health_check_timeout: 30  # seconds, handed to curl --max-time
+    report_dir: "/tmp/health_reports"
+
+    # Containers to probe, keyed by inventory hostname (hosts without a key get no service checks)
+    service_health_checks:
+      atlantis:
+        - name: 'plex'
+          container: 'plex'
+          health_url: 'http://localhost:32400/web'
+          expected_status: 200
+          critical: true
+        - name: 'immich-server'
+          container: 'immich-server'
+          health_url: 'http://localhost:2283/api/server-info/ping'
+          expected_status: 200
+          critical: true
+        - name: 'vaultwarden'
+          container: 'vaultwarden'
+          health_url: 'http://localhost:80/alive'
+          expected_status: 200
+          critical: true
+        - name: 'sonarr'
+          container: 'sonarr'
+          health_url: 'http://localhost:8989/api/v3/system/status'
+          expected_status: 200
+          critical: false
+        - name: 'radarr'
+          container: 'radarr'
+          health_url: 'http://localhost:7878/api/v3/system/status'
+          expected_status: 200
+          critical: false
+      calypso:
+        - name: 'authentik-server'
+          container: 'authentik-server'
+          health_url: 'http://localhost:9000/-/health/live/'
+          expected_status: 200
+          critical: true
+        - name: 'paperless-webserver'
+          container: 'paperless-webserver'
+          health_url: 'http://localhost:8000'
+          expected_status: 200
+          critical: false
+      homelab_vm:
+        - name: 'grafana'
+          container: 'grafana'
+          health_url: 'http://localhost:3000/api/health'
+          expected_status: 200
+          critical: true
+        - name: 'prometheus'
+          container: 'prometheus'
+          health_url: 'http://localhost:9090/-/healthy'
+          expected_status: 200
+          critical: true
+
+  tasks:
+    - name: Create health report directory
+      delegate_to: localhost  # the directory is created on the control node, not on the inspected host
+      file:
+        state: directory
+        mode: "0755"
+        path: "{{ [report_dir, ansible_date_time.date] | join('/') }}"
+
+    - name: Get current service health checks for this host
+      set_fact:  # hosts without an entry in service_health_checks end up with an empty list
+        current_health_checks: "{{ service_health_checks[inventory_hostname] | default([]) }}"
+
+    - name: Display health check plan
+      debug:  # prints the effective settings and the services selected for this host
+        msg: |
+          🏥 DEEP HEALTH CHECK PLAN
+          =========================
+          🖥️  Host: {{ inventory_hostname }}
+          📅 Date: {{ ansible_date_time.date }}
+          🔍 Services to check: {{ current_health_checks | count }}
+          📊 Include Performance: {{ include_performance }}
+          🚨 Alert on Issues: {{ alert_on_issues }}
+          ⏱️  Timeout: {{ health_check_timeout }}s
+
+          📋 Services:
+          {% for svc in current_health_checks %}
+          - {{ svc.name }} ({{ svc.critical | ternary('Critical', 'Non-critical') }})
+          {% endfor %}
+
+    - name: Check Docker daemon health  # read-only probe; its output is registered as docker_health
+      shell: |
+        echo "=== DOCKER DAEMON HEALTH ==="
+
+        # Check Docker daemon status
+        if systemctl is-active --quiet docker; then
+          echo "✅ Docker daemon: Running"
+
+          # Check Docker daemon responsiveness
+          if timeout 10 docker version >/dev/null 2>&1; then
+            echo "✅ Docker API: Responsive"
+          else
+            echo "❌ Docker API: Unresponsive"
+          fi
+
+          # Check Docker disk usage. The Go template is wrapped in raw tags so that
+          # Jinja2 hands the double braces to docker instead of trying to evaluate them.
+          docker_usage=$(docker system df --format "table {% raw %}{{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}{% endraw %}")
+          echo "📊 Docker Usage:"
+          echo "$docker_usage"
+        else
+          echo "❌ Docker daemon: Not running"
+        fi
+      register: docker_health
+      changed_when: false
+
+    - name: Check container health status  # per service: running state, Docker health status, one resource sample
+      shell: |
+        echo "=== CONTAINER HEALTH STATUS ==="
+
+        health_issues=()
+        total_containers=0
+        healthy_containers=0
+
+        {% for service in current_health_checks %}
+        echo "🔍 Checking {{ service.name }}..."
+        total_containers=$((total_containers + 1))
+        # Check if container exists and is running (docker's Go-template braces sit in raw tags so Jinja2 skips them)
+        if docker ps --filter "name={{ service.container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ service.container }}"; then
+          echo "  ✅ Container running: {{ service.container }}"
+          # Check container health if health check is configured
+          health_status=$(docker inspect {{ service.container }} --format='{% raw %}{{.State.Health.Status}}{% endraw %}' 2>/dev/null || echo "none")
+          if [ "$health_status" != "none" ]; then
+            if [ "$health_status" = "healthy" ]; then
+              echo "  ✅ Health check: $health_status"
+              healthy_containers=$((healthy_containers + 1))
+            else
+              echo "  ❌ Health check: $health_status"
+              health_issues+=("{{ service.name }}:health_check_failed")
+            fi
+          else
+            echo "  ℹ️  No health check configured"
+            healthy_containers=$((healthy_containers + 1))  # Assume healthy if no health check
+          fi
+
+          # Check container resource usage
+          container_stats=$(docker stats {{ service.container }} --no-stream --format "{% raw %}CPU: {{.CPUPerc}}, Memory: {{.MemUsage}}{% endraw %}" 2>/dev/null || echo "Stats unavailable")
+          echo "  📊 Resources: $container_stats"
+        else
+          echo "  ❌ Container not running: {{ service.container }}"
+          health_issues+=("{{ service.name }}:container_down")
+        fi
+        echo ""
+        {% endfor %}
+        {% raw %}
+        echo "📊 CONTAINER SUMMARY:"
+        echo "Total containers checked: $total_containers"
+        echo "Healthy containers: $healthy_containers"
+        echo "Issues found: ${#health_issues[@]}"
+
+        if [ ${#health_issues[@]} -gt 0 ]; then
+          echo "🚨 ISSUES:"
+          for issue in "${health_issues[@]}"; do
+            echo "  - $issue"
+          done
+        fi
+        {% endraw %}
+      args:  # the summary sits in a raw block: bash's array-length syntax would otherwise open a Jinja2 comment
+        executable: /bin/bash  # the script uses bash arrays, which a plain POSIX sh such as dash rejects
+      register: container_health
+      changed_when: false
+
+    - name: Test service endpoints  # one HTTP request per configured health_url, compared with expected_status
+      shell: |
+        echo "=== SERVICE ENDPOINT HEALTH ==="
+
+        endpoint_issues=()
+        total_endpoints=0
+        healthy_endpoints=0
+
+        {% for service in current_health_checks %}
+        {% if service.health_url is defined %}
+        echo "🌐 Testing {{ service.name }} endpoint..."
+        total_endpoints=$((total_endpoints + 1))
+        # Single request for status and timing; curl prints 000 itself on failure, so only an empty result gets a fallback
+        curl_result=$(curl -s -o /dev/null -w "%{http_code} %{time_total}" --max-time {{ health_check_timeout }} "{{ service.health_url }}" 2>/dev/null) || curl_result=${curl_result:-000 timeout}
+        read -r response_code response_time <<< "$curl_result"
+        if [ "$response_code" = "{{ service.expected_status }}" ]; then
+          echo "  ✅ HTTP $response_code (${response_time}s): {{ service.health_url }}"
+          healthy_endpoints=$((healthy_endpoints + 1))
+        else
+          echo "  ❌ HTTP $response_code (expected {{ service.expected_status }}): {{ service.health_url }}"
+          endpoint_issues+=("{{ service.name }}:http_$response_code")
+        fi
+        {% endif %}
+        {% endfor %}
+        {% raw %}
+        echo ""
+        echo "📊 ENDPOINT SUMMARY:"
+        echo "Total endpoints tested: $total_endpoints"
+        echo "Healthy endpoints: $healthy_endpoints"
+        echo "Issues found: ${#endpoint_issues[@]}"
+        if [ ${#endpoint_issues[@]} -gt 0 ]; then
+          echo "🚨 ENDPOINT ISSUES:"
+          for issue in "${endpoint_issues[@]}"; do
+            echo "  - $issue"
+          done
+        fi
+        {% endraw %}
+      args:  # the summary sits in a raw block: bash's array-length syntax would otherwise open a Jinja2 comment
+        executable: /bin/bash  # arrays and the here-string need bash rather than a plain POSIX sh
+      register: endpoint_health
+      changed_when: false
+
+    - name: Check system resources and performance  # skipped unless include_performance is truthy
+      shell: |
+        echo "=== SYSTEM PERFORMANCE ==="
+
+        # CPU usage
+        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
+        echo "🖥️  CPU Usage: ${cpu_usage}%"
+
+        # Memory usage: percentage from raw counts, because the -h columns can carry different unit suffixes
+        mem_pct=$(free | awk 'NR==2{printf "%.1f", $3*100/$2}')
+        memory_info=$(free -h | awk -v pct="$mem_pct" 'NR==2{printf "Used: %s/%s (%s%%)", $3, $2, pct}')
+        echo "💾 Memory: $memory_info"
+
+        # Disk usage for critical paths
+        echo "💿 Disk Usage:"
+        df -h / | tail -1 | awk '{printf "  Root: %s used (%s)\n", $5, $4}'
+        {% if inventory_hostname in ['atlantis', 'calypso'] %}
+        # Synology specific checks
+        if [ -d "/volume1" ]; then
+          df -h /volume1 | tail -1 | awk '{printf "  Volume1: %s used (%s)\n", $5, $4}'
+        fi
+        {% endif %}
+
+        # Load average
+        load_avg=$(uptime | awk -F'load average:' '{print $2}')
+        echo "⚖️  Load Average:$load_avg"
+
+        # Network connectivity
+        echo "🌐 Network:"
+        if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
+          echo "  ✅ Internet connectivity"
+        else
+          echo "  ❌ Internet connectivity failed"
+        fi
+
+        # Tailscale status (reported only where the tailscale CLI is installed)
+        if command -v tailscale >/dev/null 2>&1; then
+          tailscale_status=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null || echo "unknown")
+          if [ "$tailscale_status" = "true" ]; then
+            echo "  ✅ Tailscale connected"
+          else
+            echo "  ❌ Tailscale status: $tailscale_status"
+          fi
+        fi
+      register: system_performance
+      when: include_performance | bool
+      changed_when: false
+
+    - name: Check critical service dependencies  # database and Redis side containers of the configured services
+      shell: |
+        echo "=== SERVICE DEPENDENCIES ==="
+
+        dependency_issues=()
+
+        # Check database connections for services that need them
+        {% for service in current_health_checks %}
+        {% if service.name in ['immich-server', 'vaultwarden', 'authentik-server', 'paperless-webserver'] %}
+        echo "🔍 Checking {{ service.name }} database dependency..."
+        # Try to find associated database container
+        db_container=""
+        case "{{ service.name }}" in
+          "immich-server") db_container="immich-db" ;;
+          "vaultwarden") db_container="vaultwarden-db" ;;
+          "authentik-server") db_container="authentik-db" ;;
+          "paperless-webserver") db_container="paperless-db" ;;
+        esac
+
+        if [ -n "$db_container" ]; then
+          if docker ps --filter "name=$db_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "$db_container"; then
+            echo "  ✅ Database container running: $db_container"
+
+            # Test database connection (pg_isready is the PostgreSQL readiness probe)
+            if docker exec "$db_container" pg_isready >/dev/null 2>&1; then
+              echo "  ✅ Database accepting connections"
+            else
+              echo "  ❌ Database not accepting connections"
+              dependency_issues+=("{{ service.name }}:database_connection")
+            fi
+          else
+            echo "  ❌ Database container not running: $db_container"
+            dependency_issues+=("{{ service.name }}:database_down")
+          fi
+        fi
+        {% endif %}
+        {% endfor %}
+
+        # Check Redis dependencies
+        {% for service in current_health_checks %}
+        {% if service.name in ['immich-server'] %}
+        echo "🔍 Checking {{ service.name }} Redis dependency..."
+        redis_container=""
+        case "{{ service.name }}" in
+          "immich-server") redis_container="immich-redis" ;;
+        esac
+
+        if [ -n "$redis_container" ]; then
+          if docker ps --filter "name=$redis_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "$redis_container"; then
+            echo "  ✅ Redis container running: $redis_container"
+
+            # Test Redis connection
+            if docker exec "$redis_container" redis-cli ping | grep -q "PONG"; then
+              echo "  ✅ Redis responding to ping"
+            else
+              echo "  ❌ Redis not responding"
+              dependency_issues+=("{{ service.name }}:redis_connection")
+            fi
+          else
+            echo "  ❌ Redis container not running: $redis_container"
+            dependency_issues+=("{{ service.name }}:redis_down")
+          fi
+        fi
+        {% endif %}
+        {% endfor %}
+        {% raw %}
+        echo ""
+        echo "📊 DEPENDENCY SUMMARY:"
+        echo "Issues found: ${#dependency_issues[@]}"
+        if [ ${#dependency_issues[@]} -gt 0 ]; then
+          echo "🚨 DEPENDENCY ISSUES:"
+          for issue in "${dependency_issues[@]}"; do
+            echo "  - $issue"
+          done
+        fi
+        {% endraw %}
+      args:  # docker's Go-template braces and bash's array-length syntax are raw-quoted so Jinja2 leaves them alone
+        executable: /bin/bash  # the script uses bash arrays, which a plain POSIX sh such as dash rejects
+      register: dependency_health
+      changed_when: false
+
+    - name: Analyze service logs for errors  # counts error and warning lines in the last hour of each container's logs
+      shell: |
+        echo "=== SERVICE LOG ANALYSIS ==="
+
+        log_issues=()
+
+        {% for service in current_health_checks %}
+        echo "📝 Analyzing {{ service.name }} logs..."
+        if docker ps --filter "name={{ service.container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ service.container }}"; then
+          # Get recent logs and check for errors (grep -c counts matching lines directly)
+          error_count=$(docker logs {{ service.container }} --since=1h 2>&1 | grep -ciE "(error|exception|failed|fatal|panic)")
+          warn_count=$(docker logs {{ service.container }} --since=1h 2>&1 | grep -ciE "(warn|warning)")
+          echo "  Errors (1h): $error_count"
+          echo "  Warnings (1h): $warn_count"
+
+          if [ "$error_count" -gt 10 ]; then
+            echo "  ⚠️  High error count detected"
+            log_issues+=("{{ service.name }}:high_error_count:$error_count")
+          elif [ "$error_count" -gt 0 ]; then
+            echo "  ℹ️  Some errors detected"
+          else
+            echo "  ✅ No errors in recent logs"
+          fi
+
+          # Show recent critical errors
+          if [ "$error_count" -gt 0 ]; then
+            echo "  Recent errors:"
+            docker logs {{ service.container }} --since=1h 2>&1 | grep -i -E "(error|exception|failed|fatal|panic)" | tail -3 | sed 's/^/    /'
+          fi
+        else
+          echo "  ❌ Container not running"
+        fi
+        echo ""
+        {% endfor %}
+        {% raw %}
+        echo "📊 LOG ANALYSIS SUMMARY:"
+        echo "Issues found: ${#log_issues[@]}"
+        if [ ${#log_issues[@]} -gt 0 ]; then
+          echo "🚨 LOG ISSUES:"
+          for issue in "${log_issues[@]}"; do
+            echo "  - $issue"
+          done
+        fi
+        {% endraw %}
+      args:  # docker's Go-template braces and bash's array-length syntax are raw-quoted so Jinja2 leaves them alone
+        executable: /bin/bash  # the script uses bash arrays, which a plain POSIX sh such as dash rejects
+      register: log_analysis
+      changed_when: false
+
+    - name: Generate comprehensive health report  # plain-text report assembled from the registered check outputs
+      copy:  # critical status is read from the "- <name>:<reason>" issue lines that the container check prints
+        content: |
+          🏥 DEEP SERVICE HEALTH REPORT - {{ inventory_hostname }}
+          =====================================================
+
+          📅 Health Check Date: {{ ansible_date_time.iso8601 }}
+          🖥️  Host: {{ inventory_hostname }}
+          📊 Services Checked: {{ current_health_checks | length }}
+          ⏱️  Check Timeout: {{ health_check_timeout }}s
+
+          🐳 DOCKER DAEMON HEALTH:
+          {{ docker_health.stdout }}
+
+          📦 CONTAINER HEALTH:
+          {{ container_health.stdout }}
+
+          🌐 ENDPOINT HEALTH:
+          {{ endpoint_health.stdout }}
+
+          {% if include_performance | bool %}
+          📊 SYSTEM PERFORMANCE:
+          {{ system_performance.stdout | default('(performance checks were skipped)') }}
+          {% endif %}
+
+          🔗 SERVICE DEPENDENCIES:
+          {{ dependency_health.stdout }}
+
+          📝 LOG ANALYSIS:
+          {{ log_analysis.stdout }}
+
+          🎯 CRITICAL SERVICES STATUS:
+          {% for service in current_health_checks %}
+          {% if service.critical %}
+          - {{ service.name }}: {% if container_health.stdout is search('- ' ~ (service.name | regex_escape) ~ ':') %}❌ Issues{% else %}✅ Running{% endif %}
+          {% endif %}
+          {% endfor %}
+
+          💡 RECOMMENDATIONS:
+          {% if 'Issues found: 0' not in container_health.stdout %}
+          - 🚨 Address container issues immediately
+          {% endif %}
+          {% if 'Issues found: 0' not in endpoint_health.stdout %}
+          - 🌐 Check service endpoint connectivity
+          {% endif %}
+          {% if 'Issues found: 0' not in dependency_health.stdout %}
+          - 🔗 Resolve service dependency issues
+          {% endif %}
+          - 📊 Monitor resource usage trends
+          - 🔄 Schedule regular health checks
+          - 📝 Set up log monitoring alerts
+
+          ✅ HEALTH CHECK COMPLETE
+
+        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt"
+      delegate_to: localhost  # the report file is written on the control node
+
+    - name: Create health status JSON for automation  # machine-readable summary; the alert task reads overall_status
+      copy:  # per-service status needs the exact "Container running: <name>" line, since the bare name is echoed in every branch
+        content: |
+          {
+            "timestamp": "{{ ansible_date_time.iso8601 }}",
+            "hostname": "{{ inventory_hostname }}",
+            "health_check_summary": {
+              "total_services": {{ current_health_checks | length }},
+              "critical_services": {{ current_health_checks | selectattr('critical', 'equalto', true) | list | length }},
+              "docker_healthy": {{ 'true' if 'Docker daemon: Running' in docker_health.stdout else 'false' }},
+              "overall_status": "{% if 'Issues found: 0' in container_health.stdout and 'Issues found: 0' in endpoint_health.stdout and 'Issues found: 0' in dependency_health.stdout %}HEALTHY{% else %}ISSUES_DETECTED{% endif %}"
+            },
+            "services": [
+              {% for service in current_health_checks %}
+              {
+                "name": "{{ service.name }}",
+                "container": "{{ service.container }}",
+                "critical": {{ service.critical | lower }},
+                "status": "{% if ('✅ Container running: ' ~ service.container) in (container_health.stdout_lines | map('trim') | list) %}running{% else %}down{% endif %}"
+              }{% if not loop.last %},{% endif %}
+              {% endfor %}
+            ]
+          }
+        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
+      delegate_to: localhost  # the status file is written on the control node
+
+    - name: Display health check summary  # console recap of the registered check results
+      debug:  # a critical service counts as OK only when the container check printed no "- <name>:<reason>" issue line
+        msg: |
+
+          🏥 DEEP HEALTH CHECK COMPLETE - {{ inventory_hostname }}
+          ===============================================
+
+          📅 Date: {{ ansible_date_time.date }}
+          📊 Services: {{ current_health_checks | length }}
+
+          🎯 CRITICAL SERVICES:
+          {% for service in current_health_checks %}
+          {% if service.critical %}
+          - {{ service.name }}: {% if container_health.stdout is search('- ' ~ (service.name | regex_escape) ~ ':') %}❌ ISSUES{% else %}✅ OK{% endif %}
+          {% endif %}
+          {% endfor %}
+
+          📊 SUMMARY:
+          - Docker: {{ '✅ Healthy' if 'Docker daemon: Running' in docker_health.stdout else '❌ Issues' }}
+          - Containers: {{ '✅ All OK' if 'Issues found: 0' in container_health.stdout else '⚠️ Issues Found' }}
+          - Endpoints: {{ '✅ All OK' if 'Issues found: 0' in endpoint_health.stdout else '⚠️ Issues Found' }}
+          - Dependencies: {{ '✅ All OK' if 'Issues found: 0' in dependency_health.stdout else '⚠️ Issues Found' }}
+
+          📄 Reports:
+          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt
+          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json
+
+          🔍 Next Steps:
+          - Review detailed report for specific issues
+          - Address any critical service problems
+          - Schedule regular health monitoring
+
+          ===============================================
+
+    - name: Send health alerts (if issues detected)
+      when:  # both conditions must hold; the file lookup reads the status JSON on the control node
+        - alert_on_issues | bool
+        - "'ISSUES_DETECTED' in lookup('file', [report_dir, ansible_date_time.date, inventory_hostname ~ '_health_status.json'] | join('/'))"
+      debug:
+        msg: |
+          🚨 HEALTH ALERT - {{ inventory_hostname }}
+          Critical issues detected in service health check!
+          Check the detailed report immediately.