525 lines
20 KiB
YAML
525 lines
20 KiB
YAML
---
# Deep Service Health Check Playbook
# Comprehensive health monitoring for all homelab services
# Usage: ansible-playbook playbooks/service_health_deep.yml
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "include_performance=true"
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "alert_on_issues=true"
|
||
|
||
- name: Deep Service Health Check
  # Target hosts can be narrowed with -e "host_target=<host or group>".
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true

  vars:
    # Directory that receives the per-host report files.
    report_dir: "/tmp/health_reports"

    # Service health check configurations, keyed by inventory hostname.
    # Each entry names the container to inspect, the HTTP URL to probe,
    # the HTTP status that counts as healthy, and whether the service is
    # listed in the "critical services" sections of the reports.
    service_health_checks:
      atlantis:
        - name: "plex"
          container: "plex"
          health_url: "http://localhost:32400/web"
          expected_status: 200
          critical: true
        - name: "immich-server"
          container: "immich-server"
          health_url: "http://localhost:2283/api/server-info/ping"
          expected_status: 200
          critical: true
        - name: "vaultwarden"
          container: "vaultwarden"
          health_url: "http://localhost:80/alive"
          expected_status: 200
          critical: true
        - name: "sonarr"
          container: "sonarr"
          health_url: "http://localhost:8989/api/v3/system/status"
          expected_status: 200
          critical: false
        - name: "radarr"
          container: "radarr"
          health_url: "http://localhost:7878/api/v3/system/status"
          expected_status: 200
          critical: false
      calypso:
        - name: "authentik-server"
          container: "authentik-server"
          health_url: "http://localhost:9000/-/health/live/"
          expected_status: 200
          critical: true
        - name: "paperless-webserver"
          container: "paperless-webserver"
          health_url: "http://localhost:8000"
          expected_status: 200
          critical: false
      homelab_vm:
        - name: "grafana"
          container: "grafana"
          health_url: "http://localhost:3000/api/health"
          expected_status: 200
          critical: true
        - name: "prometheus"
          container: "prometheus"
          health_url: "http://localhost:9090/-/healthy"
          expected_status: 200
          critical: true

  pre_tasks:
    # A play var defined in terms of itself (foo: "{{ foo | default(x) }}")
    # recurses forever when the extra var is not supplied. set_fact is
    # evaluated once at task time, so the same expression is safe here.
    # Values passed with -e still take precedence over these facts.
    - name: Apply defaults for optional extra vars
      set_fact:
        include_performance: "{{ include_performance | default(true) }}"
        alert_on_issues: "{{ alert_on_issues | default(false) }}"
        health_check_timeout: "{{ health_check_timeout | default(30) }}"
|
||
|
||
tasks:
|
||
- name: Create health report directory
  # Delegated to localhost: one dated folder per run day.
  delegate_to: localhost
  file:
    state: directory
    path: "{{ report_dir }}/{{ ansible_date_time.date }}"
    mode: "0755"
|
||
|
||
- name: Get current service health checks for this host
  # Hosts without an entry in service_health_checks get an empty list.
  set_fact:
    current_health_checks: "{{ service_health_checks[inventory_hostname] | default([]) }}"
|
||
|
||
- name: Display health check plan
  # Prints the effective settings and the services selected for this host.
  debug:
    msg: |
      🏥 DEEP HEALTH CHECK PLAN
      =========================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔍 Services to check: {{ current_health_checks | length }}
      📊 Include Performance: {{ include_performance }}
      🚨 Alert on Issues: {{ alert_on_issues }}
      ⏱️ Timeout: {{ health_check_timeout }}s

      📋 Services:
      {% for svc in current_health_checks %}
      - {{ svc.name }} ({{ 'Non-critical' if not svc.critical else 'Critical' }})
      {% endfor %}
|
||
|
||
- name: Check Docker daemon health
  # The script contains Docker Go-template placeholders, which use the same
  # double-brace syntax as Jinja2. It needs no Ansible variables, so the
  # whole body is wrapped in raw/endraw to pass through untouched.
  shell: |
    {% raw %}
    echo "=== DOCKER DAEMON HEALTH ==="

    # Check Docker daemon status
    if systemctl is-active --quiet docker; then
      echo "✅ Docker daemon: Running"

      # Check Docker daemon responsiveness
      if timeout 10 docker version >/dev/null 2>&1; then
        echo "✅ Docker API: Responsive"
      else
        echo "❌ Docker API: Unresponsive"
      fi

      # Check Docker disk usage
      docker_usage=$(docker system df --format "table {{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}")
      echo "📊 Docker Usage:"
      echo "$docker_usage"

    else
      echo "❌ Docker daemon: Not running"
    fi
    {% endraw %}
  register: docker_health
  changed_when: false
|
||
|
||
- name: Check container health status
  # Pure-shell sections are wrapped in raw/endraw: Docker Go templates and
  # the bash array-length expansion both collide with Jinja2 delimiters.
  # Only the loop that emits one check_container call per service is
  # templated. Bash is forced because the script uses arrays.
  shell: |
    {% raw %}
    echo "=== CONTAINER HEALTH STATUS ==="

    health_issues=()
    total_containers=0
    healthy_containers=0

    # check_container NAME CONTAINER
    # Reports run state, health-check state and resource usage of one
    # container and updates the global counters / issue list.
    check_container() {
      local name="$1" container="$2"
      local health_status container_stats

      echo "🔍 Checking ${name}..."
      total_containers=$((total_containers + 1))

      # Check if container exists and is running. grep -Fx demands an exact
      # name match because the docker name filter matches substrings.
      if docker ps --filter "name=${container}" --format '{{.Names}}' | grep -Fxq -- "${container}"; then
        echo " ✅ Container running: ${container}"

        # Check container health if health check is configured
        health_status=$(docker inspect "${container}" --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' 2>/dev/null || echo "none")
        if [ "${health_status}" != "none" ]; then
          if [ "${health_status}" = "healthy" ]; then
            echo " ✅ Health check: ${health_status}"
            healthy_containers=$((healthy_containers + 1))
          else
            echo " ❌ Health check: ${health_status}"
            health_issues+=("${name}:health_check_failed")
          fi
        else
          echo " ℹ️ No health check configured"
          healthy_containers=$((healthy_containers + 1)) # Assume healthy if no health check
        fi

        # Check container resource usage
        container_stats=$(docker stats "${container}" --no-stream --format 'CPU: {{.CPUPerc}}, Memory: {{.MemUsage}}' 2>/dev/null || echo "Stats unavailable")
        echo " 📊 Resources: ${container_stats}"

      else
        echo " ❌ Container not running: ${container}"
        health_issues+=("${name}:container_down")
      fi
      echo ""
    }
    {% endraw %}

    {% for service in current_health_checks %}
    check_container {{ service.name | quote }} {{ service.container | quote }}
    {% endfor %}

    {% raw %}
    echo "📊 CONTAINER SUMMARY:"
    echo "Total containers checked: $total_containers"
    echo "Healthy containers: $healthy_containers"
    echo "Issues found: ${#health_issues[@]}"

    if [ ${#health_issues[@]} -gt 0 ]; then
      echo "🚨 ISSUES:"
      for issue in "${health_issues[@]}"; do
        echo " - $issue"
      done
    fi
    {% endraw %}
  args:
    executable: /bin/bash
  register: container_health
  changed_when: false
|
||
|
||
- name: Test service endpoints
  # Probes each configured health_url with curl and compares the HTTP status
  # with expected_status. Shell logic lives in raw/endraw blocks because the
  # bash array-length expansion would otherwise open a Jinja2 comment.
  # Bash is forced because the script uses arrays.
  shell: |
    {% raw %}
    echo "=== SERVICE ENDPOINT HEALTH ==="

    endpoint_issues=()
    total_endpoints=0
    healthy_endpoints=0

    # check_endpoint NAME URL EXPECTED_STATUS TIMEOUT_SECONDS
    check_endpoint() {
      local name="$1" url="$2" expected="$3" timeout_s="$4"
      local result response_code response_time

      echo "🌐 Testing ${name} endpoint..."
      total_endpoints=$((total_endpoints + 1))

      # Test HTTP endpoint. A single request yields both the status code and
      # the timing. curl already prints 000 as the code when the connection
      # fails, so no fallback echo is appended to its output.
      result=$(curl -s -o /dev/null -w '%{http_code} %{time_total}' --max-time "${timeout_s}" "${url}" 2>/dev/null)
      response_code=${result%% *}
      response_time=${result#* }
      # Empty output means curl itself could not run.
      response_code=${response_code:-000}
      response_time=${response_time:-timeout}

      if [ "${response_code}" = "${expected}" ]; then
        echo " ✅ HTTP ${response_code} (${response_time}s): ${url}"
        healthy_endpoints=$((healthy_endpoints + 1))
      else
        echo " ❌ HTTP ${response_code} (expected ${expected}): ${url}"
        endpoint_issues+=("${name}:http_${response_code}")
      fi
    }
    {% endraw %}

    {% for service in current_health_checks %}
    {% if service.health_url is defined %}
    check_endpoint {{ service.name | quote }} {{ service.health_url | quote }} {{ service.expected_status | quote }} {{ health_check_timeout | quote }}
    {% endif %}
    {% endfor %}

    {% raw %}
    echo ""
    echo "📊 ENDPOINT SUMMARY:"
    echo "Total endpoints tested: $total_endpoints"
    echo "Healthy endpoints: $healthy_endpoints"
    echo "Issues found: ${#endpoint_issues[@]}"

    if [ ${#endpoint_issues[@]} -gt 0 ]; then
      echo "🚨 ENDPOINT ISSUES:"
      for issue in "${endpoint_issues[@]}"; do
        echo " - $issue"
      done
    fi
    {% endraw %}
  args:
    executable: /bin/bash
  register: endpoint_health
  changed_when: false
|
||
|
||
- name: Check system resources and performance
  # Collects CPU, memory, disk, load, internet and Tailscale status.
  # Skipped entirely when include_performance is false.
  shell: |
    echo "=== SYSTEM PERFORMANCE ==="

    # CPU usage
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
    echo "🖥️ CPU Usage: ${cpu_usage}%"

    # Memory usage. Figures come from "free -m" so used and total share one
    # unit; the "-h" columns carry mixed suffixes (Mi/Gi) and dividing them
    # gives a meaningless percentage.
    memory_info=$(free -m | awk 'NR==2 && $2 > 0 {printf "Used: %sMi/%sMi (%.1f%%)", $3, $2, $3*100/$2}')
    echo "💾 Memory: $memory_info"

    # Disk usage for critical paths
    echo "💿 Disk Usage:"
    df -h / | tail -1 | awk '{printf " Root: %s used (%s)\n", $5, $4}'

    {% if inventory_hostname in ['atlantis', 'calypso'] %}
    # Synology specific checks
    if [ -d "/volume1" ]; then
      df -h /volume1 | tail -1 | awk '{printf " Volume1: %s used (%s)\n", $5, $4}'
    fi
    {% endif %}

    # Load average
    load_avg=$(uptime | awk -F'load average:' '{print $2}')
    echo "⚖️ Load Average:$load_avg"

    # Network connectivity
    echo "🌐 Network:"
    if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
      echo " ✅ Internet connectivity"
    else
      echo " ❌ Internet connectivity failed"
    fi

    # Tailscale status (only when the CLI is installed)
    if command -v tailscale >/dev/null 2>&1; then
      tailscale_status=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null || echo "unknown")
      if [ "$tailscale_status" = "true" ]; then
        echo " ✅ Tailscale connected"
      else
        echo " ❌ Tailscale status: $tailscale_status"
      fi
    fi
  register: system_performance
  when: include_performance | bool
  changed_when: false
|
||
|
||
- name: Check critical service dependencies
  # Verifies the database / Redis side-containers of selected services.
  # Shell logic sits in raw/endraw blocks (Docker Go templates and the bash
  # array-length expansion collide with Jinja2 syntax); only the loops that
  # emit the function calls are templated. Bash is forced because the
  # script uses arrays.
  shell: |
    {% raw %}
    echo "=== SERVICE DEPENDENCIES ==="

    dependency_issues=()

    # container_running CONTAINER: succeeds when a running container has
    # exactly this name (the docker name filter alone matches substrings).
    container_running() {
      docker ps --filter "name=$1" --format '{{.Names}}' | grep -Fxq -- "$1"
    }

    # check_database SERVICE: probe the database container of a service.
    check_database() {
      local name="$1" db_container=""

      echo "🔍 Checking ${name} database dependency..."

      # Try to find associated database container
      case "${name}" in
        "immich-server") db_container="immich-db" ;;
        "vaultwarden") db_container="vaultwarden-db" ;;
        "authentik-server") db_container="authentik-db" ;;
        "paperless-webserver") db_container="paperless-db" ;;
      esac

      if [ -n "${db_container}" ]; then
        if container_running "${db_container}"; then
          echo " ✅ Database container running: ${db_container}"

          # Test database connection
          if docker exec "${db_container}" pg_isready >/dev/null 2>&1; then
            echo " ✅ Database accepting connections"
          else
            echo " ❌ Database not accepting connections"
            dependency_issues+=("${name}:database_connection")
          fi
        else
          echo " ❌ Database container not running: ${db_container}"
          dependency_issues+=("${name}:database_down")
        fi
      fi
    }

    # check_redis SERVICE: probe the Redis container of a service.
    check_redis() {
      local name="$1" redis_container=""

      echo "🔍 Checking ${name} Redis dependency..."

      case "${name}" in
        "immich-server") redis_container="immich-redis" ;;
      esac

      if [ -n "${redis_container}" ]; then
        if container_running "${redis_container}"; then
          echo " ✅ Redis container running: ${redis_container}"

          # Test Redis connection
          if docker exec "${redis_container}" redis-cli ping 2>/dev/null | grep -q "PONG"; then
            echo " ✅ Redis responding to ping"
          else
            echo " ❌ Redis not responding"
            dependency_issues+=("${name}:redis_connection")
          fi
        else
          echo " ❌ Redis container not running: ${redis_container}"
          dependency_issues+=("${name}:redis_down")
        fi
      fi
    }
    {% endraw %}

    # Check database connections for services that need them
    {% for service in current_health_checks %}
    {% if service.name in ['immich-server', 'vaultwarden', 'authentik-server', 'paperless-webserver'] %}
    check_database {{ service.name | quote }}
    {% endif %}
    {% endfor %}

    # Check Redis dependencies
    {% for service in current_health_checks %}
    {% if service.name in ['immich-server'] %}
    check_redis {{ service.name | quote }}
    {% endif %}
    {% endfor %}

    {% raw %}
    echo ""
    echo "📊 DEPENDENCY SUMMARY:"
    echo "Issues found: ${#dependency_issues[@]}"

    if [ ${#dependency_issues[@]} -gt 0 ]; then
      echo "🚨 DEPENDENCY ISSUES:"
      for issue in "${dependency_issues[@]}"; do
        echo " - $issue"
      done
    fi
    {% endraw %}
  args:
    executable: /bin/bash
  register: dependency_health
  changed_when: false
|
||
|
||
- name: Analyze service logs for errors
  # Counts error/warning lines in the last hour of each container's logs.
  # Shell logic sits in raw/endraw blocks (Docker Go templates and the bash
  # array-length expansion collide with Jinja2 syntax). Bash is forced
  # because the script uses arrays and here-strings.
  shell: |
    {% raw %}
    echo "=== SERVICE LOG ANALYSIS ==="

    log_issues=()

    # analyze_logs NAME CONTAINER
    analyze_logs() {
      local name="$1" container="$2"
      local recent_logs error_lines error_count warn_count
      local error_pattern='(error|exception|failed|fatal|panic)'

      echo "📝 Analyzing ${name} logs..."

      # Exact name match; the docker name filter alone matches substrings.
      if docker ps --filter "name=${container}" --format '{{.Names}}' | grep -Fxq -- "${container}"; then
        # Fetch the last hour of logs once and reuse it for every count, so
        # all figures describe the same snapshot.
        recent_logs=$(docker logs "${container}" --since=1h 2>&1)
        error_lines=$(grep -i -E "${error_pattern}" <<<"${recent_logs}")
        error_count=$(grep -c -i -E "${error_pattern}" <<<"${recent_logs}")
        warn_count=$(grep -c -i -E '(warn|warning)' <<<"${recent_logs}")

        echo " Errors (1h): ${error_count}"
        echo " Warnings (1h): ${warn_count}"

        if [ "${error_count}" -gt 10 ]; then
          echo " ⚠️ High error count detected"
          log_issues+=("${name}:high_error_count:${error_count}")
        elif [ "${error_count}" -gt 0 ]; then
          echo " ℹ️ Some errors detected"
        else
          echo " ✅ No errors in recent logs"
        fi

        # Show recent critical errors
        if [ "${error_count}" -gt 0 ]; then
          echo " Recent errors:"
          tail -3 <<<"${error_lines}" | sed 's/^/ /'
        fi
      else
        echo " ❌ Container not running"
      fi
      echo ""
    }
    {% endraw %}

    {% for service in current_health_checks %}
    analyze_logs {{ service.name | quote }} {{ service.container | quote }}
    {% endfor %}

    {% raw %}
    echo "📊 LOG ANALYSIS SUMMARY:"
    echo "Issues found: ${#log_issues[@]}"

    if [ ${#log_issues[@]} -gt 0 ]; then
      echo "🚨 LOG ISSUES:"
      for issue in "${log_issues[@]}"; do
        echo " - $issue"
      done
    fi
    {% endraw %}
  args:
    executable: /bin/bash
  register: log_analysis
  changed_when: false
|
||
|
||
- name: Generate comprehensive health report
  # Writes the human-readable report for this host on the controller.
  copy:
    content: |
      🏥 DEEP SERVICE HEALTH REPORT - {{ inventory_hostname }}
      =====================================================

      📅 Health Check Date: {{ ansible_date_time.iso8601 }}
      🖥️ Host: {{ inventory_hostname }}
      📊 Services Checked: {{ current_health_checks | length }}
      ⏱️ Check Timeout: {{ health_check_timeout }}s

      🐳 DOCKER DAEMON HEALTH:
      {{ docker_health.stdout }}

      📦 CONTAINER HEALTH:
      {{ container_health.stdout }}

      🌐 ENDPOINT HEALTH:
      {{ endpoint_health.stdout }}

      {# "| bool" matters: -e passes strings and "false" is truthy. The #}
      {# default() covers the skipped task, which registers no stdout. #}
      {% if include_performance | bool %}
      📊 SYSTEM PERFORMANCE:
      {{ system_performance.stdout | default('') }}
      {% endif %}

      🔗 SERVICE DEPENDENCIES:
      {{ dependency_health.stdout }}

      📝 LOG ANALYSIS:
      {{ log_analysis.stdout }}

      🎯 CRITICAL SERVICES STATUS:
      {# Match the explicit "Container running: <name>" line; the bare #}
      {# container name is echoed even when the container is down. #}
      {% for service in current_health_checks %}
      {% if service.critical %}
      - {{ service.name }}: {{ '✅ Running' if container_health.stdout is search('Container running: ' ~ (service.container | regex_escape) ~ '$', multiline=True) else '❌ Issues' }}
      {% endif %}
      {% endfor %}

      💡 RECOMMENDATIONS:
      {% if 'Issues found: 0' not in container_health.stdout %}
      - 🚨 Address container issues immediately
      {% endif %}
      {% if 'Issues found: 0' not in endpoint_health.stdout %}
      - 🌐 Check service endpoint connectivity
      {% endif %}
      {% if 'Issues found: 0' not in dependency_health.stdout %}
      - 🔗 Resolve service dependency issues
      {% endif %}
      - 📊 Monitor resource usage trends
      - 🔄 Schedule regular health checks
      - 📝 Set up log monitoring alerts

      ✅ HEALTH CHECK COMPLETE

    dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt"
  delegate_to: localhost
|
||
|
||
- name: Create health status JSON for automation
  # Machine-readable status for this host. String values go through to_json
  # so quotes or backslashes in names cannot produce invalid JSON.
  copy:
    content: |
      {
        "timestamp": {{ ansible_date_time.iso8601 | to_json }},
        "hostname": {{ inventory_hostname | to_json }},
        "health_check_summary": {
          "total_services": {{ current_health_checks | length }},
          "critical_services": {{ current_health_checks | selectattr('critical', 'equalto', true) | list | length }},
          "docker_healthy": {{ 'true' if 'Docker daemon: Running' in docker_health.stdout else 'false' }},
          "overall_status": "{{ 'HEALTHY' if ('Issues found: 0' in container_health.stdout and 'Issues found: 0' in endpoint_health.stdout) else 'ISSUES_DETECTED' }}"
        },
        "services": [
      {# "running" requires the explicit "Container running: <name>" line; #}
      {# the bare container name is echoed even when the container is down. #}
      {% for service in current_health_checks %}
          {
            "name": {{ service.name | to_json }},
            "container": {{ service.container | to_json }},
            "critical": {{ service.critical | bool | to_json }},
            "status": "{{ 'running' if container_health.stdout is search('Container running: ' ~ (service.container | regex_escape) ~ '$', multiline=True) else 'down' }}"
          }{{ '' if loop.last else ',' }}
      {% endfor %}
        ]
      }
    dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
  delegate_to: localhost
|
||
|
||
- name: Display health check summary
  # Console summary of the run, with pointers to the generated report files.
  debug:
    msg: |

      🏥 DEEP HEALTH CHECK COMPLETE - {{ inventory_hostname }}
      ===============================================

      📅 Date: {{ ansible_date_time.date }}
      📊 Services: {{ current_health_checks | length }}

      🎯 CRITICAL SERVICES:
      {# Match the explicit "Container running: <name>" line; the bare #}
      {# container name is echoed even when the container is down. #}
      {% for service in current_health_checks %}
      {% if service.critical %}
      - {{ service.name }}: {{ '✅ OK' if container_health.stdout is search('Container running: ' ~ (service.container | regex_escape) ~ '$', multiline=True) else '❌ ISSUES' }}
      {% endif %}
      {% endfor %}

      📊 SUMMARY:
      - Docker: {{ '✅ Healthy' if 'Docker daemon: Running' in docker_health.stdout else '❌ Issues' }}
      - Containers: {{ '✅ All OK' if 'Issues found: 0' in container_health.stdout else '⚠️ Issues Found' }}
      - Endpoints: {{ '✅ All OK' if 'Issues found: 0' in endpoint_health.stdout else '⚠️ Issues Found' }}
      - Dependencies: {{ '✅ All OK' if 'Issues found: 0' in dependency_health.stdout else '⚠️ Issues Found' }}

      📄 Reports:
      - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt
      - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json

      🔍 Next Steps:
      - Review detailed report for specific issues
      - Address any critical service problems
      - Schedule regular health monitoring

      ===============================================
|
||
|
||
- name: Send health alerts (if issues detected)
  # Fires only when alerting is enabled and the JSON status file written
  # for this host contains ISSUES_DETECTED.
  vars:
    status_file: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
  when:
    - alert_on_issues | bool
    - "'ISSUES_DETECTED' in lookup('file', status_file)"
  debug:
    msg: |
      🚨 HEALTH ALERT - {{ inventory_hostname }}
      Critical issues detected in service health check!
      Check the detailed report immediately.
|