Files
homelab-optimized/ansible/automation/playbooks/service_health_deep.yml
Gitea Mirror Bot 5cbaedc119
Some checks failed
Documentation / Build Docusaurus (push) Failing after 17m43s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-03-31 12:23:18 UTC
2026-03-31 12:23:18 +00:00

525 lines
20 KiB
YAML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
---
# Deep Service Health Check Playbook
# Comprehensive health monitoring for all homelab services
# Usage: ansible-playbook playbooks/service_health_deep.yml
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "include_performance=true"
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "alert_on_issues=true"
- name: Deep Service Health Check
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true

  vars:
    # Plain defaults. Extra vars (-e "include_performance=false", etc.) take
    # precedence over play vars, so the documented CLI overrides still work.
    # A self-referencing "{{ x | default(...) }}" here would trigger a
    # recursive-template error whenever the variable is not passed with -e.
    include_performance: true
    alert_on_issues: false
    health_check_timeout: 30
    report_dir: "/tmp/health_reports"

    # Service health check configurations, keyed by inventory hostname.
    # Each entry names a container, the HTTP URL probed on the host itself,
    # the status code that counts as healthy, and whether it is critical.
    service_health_checks:
      atlantis:
        - name: "plex"
          container: "plex"
          health_url: "http://localhost:32400/web"
          expected_status: 200
          critical: true
        - name: "immich-server"
          container: "immich-server"
          health_url: "http://localhost:2283/api/server-info/ping"
          expected_status: 200
          critical: true
        - name: "vaultwarden"
          container: "vaultwarden"
          health_url: "http://localhost:80/alive"
          expected_status: 200
          critical: true
        - name: "sonarr"
          container: "sonarr"
          health_url: "http://localhost:8989/api/v3/system/status"
          expected_status: 200
          critical: false
        - name: "radarr"
          container: "radarr"
          health_url: "http://localhost:7878/api/v3/system/status"
          expected_status: 200
          critical: false
      calypso:
        - name: "authentik-server"
          container: "authentik-server"
          health_url: "http://localhost:9000/-/health/live/"
          expected_status: 200
          critical: true
        - name: "paperless-webserver"
          container: "paperless-webserver"
          health_url: "http://localhost:8000"
          expected_status: 200
          critical: false
      homelab_vm:
        - name: "grafana"
          container: "grafana"
          health_url: "http://localhost:3000/api/health"
          expected_status: 200
          critical: true
        - name: "prometheus"
          container: "prometheus"
          health_url: "http://localhost:9090/-/healthy"
          expected_status: 200
          critical: true

  tasks:
    # Reports are written on the controller, one directory per day.
    - name: Create health report directory
      file:
        path: "{{ report_dir }}/{{ ansible_date_time.date }}"
        state: directory
        mode: '0755'
      delegate_to: localhost

    # Hosts without an entry in service_health_checks get an empty list, so
    # every later loop simply renders nothing for them.
    - name: Get current service health checks for this host
      set_fact:
        current_health_checks: "{{ service_health_checks.get(inventory_hostname, []) }}"

    - name: Display health check plan
      debug:
        msg: |
          🏥 DEEP HEALTH CHECK PLAN
          =========================
          🖥️ Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔍 Services to check: {{ current_health_checks | length }}
          📊 Include Performance: {{ include_performance }}
          🚨 Alert on Issues: {{ alert_on_issues }}
          ⏱️ Timeout: {{ health_check_timeout }}s
          📋 Services:
          {% for service in current_health_checks %}
          - {{ service.name }} ({{ 'Critical' if service.critical else 'Non-critical' }})
          {% endfor %}

    # NOTE: Docker's Go-template format strings ({{.Field}}) must be wrapped
    # in {% raw %} ... {% endraw %}; otherwise Ansible's Jinja2 pass tries to
    # evaluate them and the task fails with a template error.
    - name: Check Docker daemon health
      shell: |
        echo "=== DOCKER DAEMON HEALTH ==="
        # Check Docker daemon status
        if systemctl is-active --quiet docker; then
          echo "✅ Docker daemon: Running"
          # Check Docker daemon responsiveness
          if timeout 10 docker version >/dev/null 2>&1; then
            echo "✅ Docker API: Responsive"
          else
            echo "❌ Docker API: Unresponsive"
          fi
          # Check Docker disk usage
          docker_usage=$(docker system df --format "table {% raw %}{{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}{% endraw %}")
          echo "📊 Docker Usage:"
          echo "$docker_usage"
        else
          echo "❌ Docker daemon: Not running"
        fi
      args:
        executable: /bin/bash
      register: docker_health
      changed_when: false

    # The script uses bash arrays, so it must run under bash rather than the
    # shell module's default /bin/sh. "${#array[@]}" contains "{#", which
    # Jinja2 reads as a comment opener; it is therefore evaluated once inside
    # a raw block and reused through a plain shell variable.
    - name: Check container health status
      shell: |
        echo "=== CONTAINER HEALTH STATUS ==="
        health_issues=()
        total_containers=0
        healthy_containers=0
        {% for service in current_health_checks %}
        echo "🔍 Checking {{ service.name }}..."
        total_containers=$((total_containers + 1))
        # Check if container exists and is running
        if docker ps --filter "name={{ service.container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ service.container }}"; then
          echo " ✅ Container running: {{ service.container }}"
          # Check container health if health check is configured.
          # The {{if .State.Health}} guard yields "none" for containers
          # that define no health check instead of a template error.
          health_status=$(docker inspect {{ service.container }} --format='{% raw %}{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}{% endraw %}' 2>/dev/null || echo "none")
          if [ "$health_status" != "none" ]; then
            if [ "$health_status" = "healthy" ]; then
              echo " ✅ Health check: $health_status"
              healthy_containers=$((healthy_containers + 1))
            else
              echo " ❌ Health check: $health_status"
              health_issues+=("{{ service.name }}:health_check_failed")
            fi
          else
            echo " No health check configured"
            healthy_containers=$((healthy_containers + 1)) # Assume healthy if no health check
          fi
          # Check container resource usage
          container_stats=$(docker stats {{ service.container }} --no-stream --format "{% raw %}CPU: {{.CPUPerc}}, Memory: {{.MemUsage}}{% endraw %}" 2>/dev/null || echo "Stats unavailable")
          echo " 📊 Resources: $container_stats"
        else
          echo " ❌ Container not running: {{ service.container }}"
          health_issues+=("{{ service.name }}:container_down")
        fi
        echo ""
        {% endfor %}
        issue_count="{% raw %}${#health_issues[@]}{% endraw %}"
        echo "📊 CONTAINER SUMMARY:"
        echo "Total containers checked: $total_containers"
        echo "Healthy containers: $healthy_containers"
        echo "Issues found: $issue_count"
        if [ "$issue_count" -gt 0 ]; then
          echo "🚨 ISSUES:"
          for issue in "${health_issues[@]}"; do
            echo " - $issue"
          done
        fi
      args:
        executable: /bin/bash
      register: container_health
      changed_when: false

    # One curl call yields both the status code and the total time. curl
    # itself prints 000 when the request fails, so a fallback is applied
    # only when nothing was captured at all (e.g. curl is missing).
    - name: Test service endpoints
      shell: |
        echo "=== SERVICE ENDPOINT HEALTH ==="
        endpoint_issues=()
        total_endpoints=0
        healthy_endpoints=0
        {% for service in current_health_checks %}
        {% if service.health_url is defined %}
        echo "🌐 Testing {{ service.name }} endpoint..."
        total_endpoints=$((total_endpoints + 1))
        # Test HTTP endpoint
        curl_result=$(curl -s -o /dev/null -w "%{http_code} %{time_total}" --max-time {{ health_check_timeout }} "{{ service.health_url }}" 2>/dev/null)
        read -r response_code response_time <<< "$curl_result"
        response_code="${response_code:-000}"
        response_time="${response_time:-timeout}"
        if [ "$response_code" = "{{ service.expected_status }}" ]; then
          echo " ✅ HTTP $response_code (${response_time}s): {{ service.health_url }}"
          healthy_endpoints=$((healthy_endpoints + 1))
        else
          echo " ❌ HTTP $response_code (expected {{ service.expected_status }}): {{ service.health_url }}"
          endpoint_issues+=("{{ service.name }}:http_$response_code")
        fi
        {% endif %}
        {% endfor %}
        issue_count="{% raw %}${#endpoint_issues[@]}{% endraw %}"
        echo ""
        echo "📊 ENDPOINT SUMMARY:"
        echo "Total endpoints tested: $total_endpoints"
        echo "Healthy endpoints: $healthy_endpoints"
        echo "Issues found: $issue_count"
        if [ "$issue_count" -gt 0 ]; then
          echo "🚨 ENDPOINT ISSUES:"
          for issue in "${endpoint_issues[@]}"; do
            echo " - $issue"
          done
        fi
      args:
        executable: /bin/bash
      register: endpoint_health
      changed_when: false

    # Skipped entirely when include_performance is false.
    - name: Check system resources and performance
      shell: |
        echo "=== SYSTEM PERFORMANCE ==="
        # CPU usage
        cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
        echo "🖥️ CPU Usage: ${cpu_usage}%"
        # Memory usage. "free -m" reports both columns in MiB, so the
        # percentage is computed on like units (with -h the two columns
        # can carry different suffixes, e.g. Mi vs Gi).
        memory_info=$(free -m | awk 'NR==2{printf "Used: %sMi/%sMi (%.1f%%)", $3, $2, $3*100/$2}')
        echo "💾 Memory: $memory_info"
        # Disk usage for critical paths
        echo "💿 Disk Usage:"
        df -h / | tail -1 | awk '{printf " Root: %s used (%s)\n", $5, $4}'
        {% if inventory_hostname in ['atlantis', 'calypso'] %}
        # Synology specific checks
        if [ -d "/volume1" ]; then
          df -h /volume1 | tail -1 | awk '{printf " Volume1: %s used (%s)\n", $5, $4}'
        fi
        {% endif %}
        # Load average
        load_avg=$(uptime | awk -F'load average:' '{print $2}')
        echo "⚖️ Load Average:$load_avg"
        # Network connectivity
        echo "🌐 Network:"
        if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
          echo " ✅ Internet connectivity"
        else
          echo " ❌ Internet connectivity failed"
        fi
        # Tailscale status
        if command -v tailscale >/dev/null 2>&1; then
          tailscale_status=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null || echo "unknown")
          if [ "$tailscale_status" = "true" ]; then
            echo " ✅ Tailscale connected"
          else
            echo " ❌ Tailscale status: $tailscale_status"
          fi
        fi
      args:
        executable: /bin/bash
      register: system_performance
      when: include_performance | bool
      changed_when: false

    # Database and Redis companions are looked up by fixed container names;
    # the database probe uses pg_isready inside the database container.
    - name: Check critical service dependencies
      shell: |
        echo "=== SERVICE DEPENDENCIES ==="
        dependency_issues=()
        # Check database connections for services that need them
        {% for service in current_health_checks %}
        {% if service.name in ['immich-server', 'vaultwarden', 'authentik-server', 'paperless-webserver'] %}
        echo "🔍 Checking {{ service.name }} database dependency..."
        # Try to find associated database container
        db_container=""
        case "{{ service.name }}" in
          "immich-server") db_container="immich-db" ;;
          "vaultwarden") db_container="vaultwarden-db" ;;
          "authentik-server") db_container="authentik-db" ;;
          "paperless-webserver") db_container="paperless-db" ;;
        esac
        if [ -n "$db_container" ]; then
          if docker ps --filter "name=$db_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "$db_container"; then
            echo " ✅ Database container running: $db_container"
            # Test database connection
            if docker exec "$db_container" pg_isready >/dev/null 2>&1; then
              echo " ✅ Database accepting connections"
            else
              echo " ❌ Database not accepting connections"
              dependency_issues+=("{{ service.name }}:database_connection")
            fi
          else
            echo " ❌ Database container not running: $db_container"
            dependency_issues+=("{{ service.name }}:database_down")
          fi
        fi
        {% endif %}
        {% endfor %}
        # Check Redis dependencies
        {% for service in current_health_checks %}
        {% if service.name in ['immich-server'] %}
        echo "🔍 Checking {{ service.name }} Redis dependency..."
        redis_container=""
        case "{{ service.name }}" in
          "immich-server") redis_container="immich-redis" ;;
        esac
        if [ -n "$redis_container" ]; then
          if docker ps --filter "name=$redis_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "$redis_container"; then
            echo " ✅ Redis container running: $redis_container"
            # Test Redis connection
            if docker exec "$redis_container" redis-cli ping 2>/dev/null | grep -q "PONG"; then
              echo " ✅ Redis responding to ping"
            else
              echo " ❌ Redis not responding"
              dependency_issues+=("{{ service.name }}:redis_connection")
            fi
          else
            echo " ❌ Redis container not running: $redis_container"
            dependency_issues+=("{{ service.name }}:redis_down")
          fi
        fi
        {% endif %}
        {% endfor %}
        issue_count="{% raw %}${#dependency_issues[@]}{% endraw %}"
        echo ""
        echo "📊 DEPENDENCY SUMMARY:"
        echo "Issues found: $issue_count"
        if [ "$issue_count" -gt 0 ]; then
          echo "🚨 DEPENDENCY ISSUES:"
          for issue in "${dependency_issues[@]}"; do
            echo " - $issue"
          done
        fi
      args:
        executable: /bin/bash
      register: dependency_health
      changed_when: false

    # The last hour of logs is fetched once per container and reused for
    # both counts and the excerpt, so all three views see the same data.
    - name: Analyze service logs for errors
      shell: |
        echo "=== SERVICE LOG ANALYSIS ==="
        log_issues=()
        {% for service in current_health_checks %}
        echo "📝 Analyzing {{ service.name }} logs..."
        if docker ps --filter "name={{ service.container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ service.container }}"; then
          # Get recent logs and check for errors
          recent_logs=$(docker logs {{ service.container }} --since=1h 2>&1)
          error_count=$(grep -c -i -E "(error|exception|failed|fatal|panic)" <<< "$recent_logs")
          warn_count=$(grep -c -i -E "(warn|warning)" <<< "$recent_logs")
          echo " Errors (1h): $error_count"
          echo " Warnings (1h): $warn_count"
          if [ "$error_count" -gt 10 ]; then
            echo " ⚠️ High error count detected"
            log_issues+=("{{ service.name }}:high_error_count:$error_count")
          elif [ "$error_count" -gt 0 ]; then
            echo " Some errors detected"
          else
            echo " ✅ No errors in recent logs"
          fi
          # Show recent critical errors
          if [ "$error_count" -gt 0 ]; then
            echo " Recent errors:"
            grep -i -E "(error|exception|failed|fatal|panic)" <<< "$recent_logs" | tail -3 | sed 's/^/ /'
          fi
        else
          echo " ❌ Container not running"
        fi
        echo ""
        {% endfor %}
        issue_count="{% raw %}${#log_issues[@]}{% endraw %}"
        echo "📊 LOG ANALYSIS SUMMARY:"
        echo "Issues found: $issue_count"
        if [ "$issue_count" -gt 0 ]; then
          echo "🚨 LOG ISSUES:"
          for issue in "${log_issues[@]}"; do
            echo " - $issue"
          done
        fi
      args:
        executable: /bin/bash
      register: log_analysis
      changed_when: false

    # A container name appears in the health output whether it is up or
    # down ("Checking X...", "Container not running: X"), so a plain
    # substring test cannot tell the two apart. Extract the names from the
    # "Container running: X" lines instead, and compute the overall verdict
    # once so the JSON file and the alert task agree.
    - name: Summarize check results
      set_fact:
        running_containers: >-
          {{ container_health.stdout_lines
             | select('search', 'Container running: ')
             | map('regex_replace', '^.*Container running: ', '')
             | map('trim')
             | list }}
        overall_health_status: >-
          {{ 'HEALTHY'
             if ('Issues found: 0' in container_health.stdout
                 and 'Issues found: 0' in endpoint_health.stdout
                 and 'Issues found: 0' in dependency_health.stdout)
             else 'ISSUES_DETECTED' }}

    # Lines that must keep their newline end in an expression ({{ ... }}),
    # not a block tag: Ansible strips the newline following {% ... %}.
    - name: Generate comprehensive health report
      copy:
        content: |
          🏥 DEEP SERVICE HEALTH REPORT - {{ inventory_hostname }}
          =====================================================
          📅 Health Check Date: {{ ansible_date_time.iso8601 }}
          🖥️ Host: {{ inventory_hostname }}
          📊 Services Checked: {{ current_health_checks | length }}
          ⏱️ Check Timeout: {{ health_check_timeout }}s
          🐳 DOCKER DAEMON HEALTH:
          {{ docker_health.stdout }}
          📦 CONTAINER HEALTH:
          {{ container_health.stdout }}
          🌐 ENDPOINT HEALTH:
          {{ endpoint_health.stdout }}
          {% if include_performance | bool %}
          📊 SYSTEM PERFORMANCE:
          {{ system_performance.stdout | default('') }}
          {% endif %}
          🔗 SERVICE DEPENDENCIES:
          {{ dependency_health.stdout }}
          📝 LOG ANALYSIS:
          {{ log_analysis.stdout }}
          🎯 CRITICAL SERVICES STATUS:
          {% for service in current_health_checks %}
          {% if service.critical %}
          - {{ service.name }}: {{ '✅ Running' if service.container in running_containers else '❌ Issues' }}
          {% endif %}
          {% endfor %}
          💡 RECOMMENDATIONS:
          {% if 'Issues found: 0' not in container_health.stdout %}
          - 🚨 Address container issues immediately
          {% endif %}
          {% if 'Issues found: 0' not in endpoint_health.stdout %}
          - 🌐 Check service endpoint connectivity
          {% endif %}
          {% if 'Issues found: 0' not in dependency_health.stdout %}
          - 🔗 Resolve service dependency issues
          {% endif %}
          - 📊 Monitor resource usage trends
          - 🔄 Schedule regular health checks
          - 📝 Set up log monitoring alerts
          ✅ HEALTH CHECK COMPLETE
        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt"
        mode: '0644'
      delegate_to: localhost

    - name: Create health status JSON for automation
      copy:
        content: |
          {
            "timestamp": "{{ ansible_date_time.iso8601 }}",
            "hostname": "{{ inventory_hostname }}",
            "health_check_summary": {
              "total_services": {{ current_health_checks | length }},
              "critical_services": {{ current_health_checks | selectattr('critical', 'equalto', true) | list | length }},
              "docker_healthy": {{ 'true' if 'Docker daemon: Running' in docker_health.stdout else 'false' }},
              "overall_status": "{{ overall_health_status }}"
            },
            "services": [
          {% for service in current_health_checks %}
              {
                "name": "{{ service.name }}",
                "container": "{{ service.container }}",
                "critical": {{ service.critical | bool | to_json }},
                "status": "{{ 'running' if service.container in running_containers else 'down' }}"
              }{{ '' if loop.last else ',' }}
          {% endfor %}
            ]
          }
        dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
        mode: '0644'
      delegate_to: localhost

    - name: Display health check summary
      debug:
        msg: |
          🏥 DEEP HEALTH CHECK COMPLETE - {{ inventory_hostname }}
          ===============================================
          📅 Date: {{ ansible_date_time.date }}
          📊 Services: {{ current_health_checks | length }}
          🎯 CRITICAL SERVICES:
          {% for service in current_health_checks %}
          {% if service.critical %}
          - {{ service.name }}: {{ '✅ OK' if service.container in running_containers else '❌ ISSUES' }}
          {% endif %}
          {% endfor %}
          📊 SUMMARY:
          - Docker: {{ '✅ Healthy' if 'Docker daemon: Running' in docker_health.stdout else '❌ Issues' }}
          - Containers: {{ '✅ All OK' if 'Issues found: 0' in container_health.stdout else '⚠️ Issues Found' }}
          - Endpoints: {{ '✅ All OK' if 'Issues found: 0' in endpoint_health.stdout else '⚠️ Issues Found' }}
          - Dependencies: {{ '✅ All OK' if 'Issues found: 0' in dependency_health.stdout else '⚠️ Issues Found' }}
          📄 Reports:
          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt
          - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json
          🔍 Next Steps:
          - Review detailed report for specific issues
          - Address any critical service problems
          - Schedule regular health monitoring
          ===============================================

    # Uses the verdict computed in "Summarize check results" rather than
    # re-reading the JSON file that was just written.
    - name: Send health alerts (if issues detected)
      debug:
        msg: |
          🚨 HEALTH ALERT - {{ inventory_hostname }}
          Critical issues detected in service health check!
          Check the detailed report immediately.
      when:
        - alert_on_issues | bool
        - overall_health_status == 'ISSUES_DETECTED'