Sanitized mirror from private repository - 2026-04-18 11:19:59 UTC
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m14s
Documentation / Deploy to GitHub Pages (push) Has been skipped

This commit is contained in:
Gitea Mirror Bot
2026-04-18 11:19:59 +00:00
commit fb00a325d1
1418 changed files with 359990 additions and 0 deletions

View File

@@ -0,0 +1,524 @@
---
# Deep Service Health Check Playbook
# Comprehensive health monitoring for all homelab services
# Usage: ansible-playbook playbooks/service_health_deep.yml
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "include_performance=true"
# Usage: ansible-playbook playbooks/service_health_deep.yml -e "alert_on_issues=true"
# Play: deep health check of the Docker services that run on each host.
#
# Tunables (override with -e / --extra-vars):
#   host_target           inventory pattern to run against (default: all)
#   include_performance   also collect system performance data
#   alert_on_issues       emit an alert message when issues are detected
#   health_check_timeout  per-endpoint HTTP timeout, in seconds
- name: Deep Service Health Check
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true
  vars:
    # Plain literals on purpose: a play var defined in terms of itself
    # ("{{ x | default(...) }}") fails with "recursive loop detected" as soon
    # as it is templated without an extra-var. Extra-vars already outrank
    # play vars, so -e overrides keep working.
    include_performance: true
    alert_on_issues: false
    health_check_timeout: 30
    report_dir: "/tmp/health_reports"
    # Service health check configurations, keyed by inventory hostname.
    # Each entry gives the container to inspect, the URL to probe, the HTTP
    # status that counts as healthy, and whether the service is listed in the
    # "critical services" sections of the reports.
    service_health_checks:
      atlantis:
        - name: "plex"
          container: "plex"
          health_url: "http://localhost:32400/web"
          expected_status: 200
          critical: true
        - name: "immich-server"
          container: "immich-server"
          health_url: "http://localhost:2283/api/server-info/ping"
          expected_status: 200
          critical: true
        - name: "vaultwarden"
          container: "vaultwarden"
          health_url: "http://localhost:80/alive"
          expected_status: 200
          critical: true
        - name: "sonarr"
          container: "sonarr"
          health_url: "http://localhost:8989/api/v3/system/status"
          expected_status: 200
          critical: false
        - name: "radarr"
          container: "radarr"
          health_url: "http://localhost:7878/api/v3/system/status"
          expected_status: 200
          critical: false
      calypso:
        - name: "authentik-server"
          container: "authentik-server"
          health_url: "http://localhost:9000/-/health/live/"
          expected_status: 200
          critical: true
        - name: "paperless-webserver"
          container: "paperless-webserver"
          health_url: "http://localhost:8000"
          expected_status: 200
          critical: false
      homelab_vm:
        - name: "grafana"
          container: "grafana"
          health_url: "http://localhost:3000/api/health"
          expected_status: 200
          critical: true
        - name: "prometheus"
          container: "prometheus"
          health_url: "http://localhost:9090/-/healthy"
          expected_status: 200
          critical: true
  tasks:
# Reports are written on the controller (delegate_to: localhost), in one
# sub-directory per date as reported by the target host's gathered facts.
- name: Create health report directory
  delegate_to: localhost
  file:
    state: directory
    path: "{{ report_dir }}/{{ ansible_date_time.date }}"
    mode: "0755"
# Pick this host's entry out of service_health_checks; hosts that have no
# entry end up with an empty list, so the later loops simply do nothing.
- name: Get current service health checks for this host
  set_fact:
    current_health_checks: "{{ service_health_checks[inventory_hostname] | default([]) }}"
# Print what is about to be checked on this host: the effective settings and
# one line per configured service, tagged Critical / Non-critical.
- name: Display health check plan
  debug:
    msg: |
      🏥 DEEP HEALTH CHECK PLAN
      =========================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔍 Services to check: {{ current_health_checks | length }}
      📊 Include Performance: {{ include_performance }}
      🚨 Alert on Issues: {{ alert_on_issues }}
      ⏱️ Timeout: {{ health_check_timeout }}s
      📋 Services:
      {% for check in current_health_checks %}
      - {{ check.name }} ({% if check.critical %}Critical{% else %}Non-critical{% endif %})
      {% endfor %}
# Report whether the Docker daemon is up, whether its API answers within 10s,
# and its disk usage. Read-only, so it never reports "changed".
#
# The Go template handed to `docker system df --format` is wrapped in a Jinja2
# raw block: Ansible templates the whole script first, and an unescaped
# {{.Type}} is a Jinja2 syntax error that aborts the task before it runs.
#
# Later tasks look for the literal text "Docker daemon: Running" in
# docker_health.stdout, so that wording must stay as it is.
- name: Check Docker daemon health
  shell: |
    echo "=== DOCKER DAEMON HEALTH ==="
    # Check Docker daemon status. Prefer the systemd unit; fall back to asking
    # the daemon directly for hosts where Docker is not managed by a unit
    # named "docker" (NOTE(review): presumably the Synology hosts - confirm).
    if systemctl is-active --quiet docker 2>/dev/null || timeout 10 docker info >/dev/null 2>&1; then
      echo "✅ Docker daemon: Running"
      # Check Docker daemon responsiveness
      if timeout 10 docker version >/dev/null 2>&1; then
        echo "✅ Docker API: Responsive"
      else
        echo "❌ Docker API: Unresponsive"
      fi
      # Check Docker disk usage
      docker_usage=$(docker system df --format "table {% raw %}{{.Type}}\t{{.TotalCount}}\t{{.Size}}\t{{.Reclaimable}}{% endraw %}" 2>/dev/null || echo "unavailable")
      echo "📊 Docker Usage:"
      echo "$docker_usage"
    else
      echo "❌ Docker daemon: Not running"
    fi
  register: docker_health
  changed_when: false
# For every service configured for this host: confirm its container is
# running, read the Docker health-check state if one is configured, and sample
# CPU/memory usage. Ends with a summary whose "Issues found: N" line and
# "Container running: <name>" lines are parsed by later tasks - keep them.
#
# Templating/shell notes:
#   * Go templates ({{.Names}} ...) and the bash array-length expansion (whose
#     "{#" looks like the start of a Jinja2 comment) sit inside raw blocks,
#     otherwise Ansible fails to template the script.
#   * The script uses bash arrays, so it must run under bash rather than the
#     default /bin/sh.
#   * Container names are compared as whole fixed strings; `docker ps --filter
#     name=` alone is a substring match.
- name: Check container health status
  shell: |
    echo "=== CONTAINER HEALTH STATUS ==="
    health_issues=()
    total_containers=0
    healthy_containers=0
    {% for service in current_health_checks %}
    echo "🔍 Checking {{ service.name }}..."
    total_containers=$((total_containers + 1))
    # Check if container exists and is running (exact name match)
    if docker ps --filter "name={{ service.container }}" --format '{% raw %}{{.Names}}{% endraw %}' | grep -qxF "{{ service.container }}"; then
      echo " ✅ Container running: {{ service.container }}"
      # Health state, or "none" when the container defines no health check
      health_status=$(docker inspect --format '{% raw %}{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}{% endraw %}' "{{ service.container }}" 2>/dev/null || echo "none")
      if [ "$health_status" != "none" ]; then
        if [ "$health_status" = "healthy" ]; then
          echo " ✅ Health check: $health_status"
          healthy_containers=$((healthy_containers + 1))
        else
          echo " ❌ Health check: $health_status"
          health_issues+=("{{ service.name }}:health_check_failed")
        fi
      else
        echo " No health check configured"
        healthy_containers=$((healthy_containers + 1)) # Assume healthy if no health check
      fi
      # Check container resource usage
      container_stats=$(docker stats --no-stream --format '{% raw %}CPU: {{.CPUPerc}}, Memory: {{.MemUsage}}{% endraw %}' "{{ service.container }}" 2>/dev/null || echo "Stats unavailable")
      echo " 📊 Resources: $container_stats"
    else
      echo " ❌ Container not running: {{ service.container }}"
      health_issues+=("{{ service.name }}:container_down")
    fi
    echo ""
    {% endfor %}
    issue_count="{% raw %}${#health_issues[@]}{% endraw %}"
    echo "📊 CONTAINER SUMMARY:"
    echo "Total containers checked: $total_containers"
    echo "Healthy containers: $healthy_containers"
    echo "Issues found: $issue_count"
    if [ "$issue_count" -gt 0 ]; then
      echo "🚨 ISSUES:"
      for issue in "${health_issues[@]}"; do
        echo " - $issue"
      done
    fi
  args:
    executable: /bin/bash
  register: container_health
  changed_when: false
# Probe each configured health_url with curl and compare the HTTP status with
# expected_status. Ends with a summary whose "Issues found: N" line is parsed
# by later tasks.
#
# Changes worth knowing about:
#   * One curl call per endpoint returns both status code and total time; the
#     original issued two requests, and on connection failure concatenated
#     curl's own "000" with a fallback "000".
#   * The bash array-length expansion sits inside a raw block (its "{#" would
#     otherwise open a Jinja2 comment) and the script runs under bash because
#     it uses arrays.
- name: Test service endpoints
  shell: |
    echo "=== SERVICE ENDPOINT HEALTH ==="
    endpoint_issues=()
    total_endpoints=0
    healthy_endpoints=0
    {% for service in current_health_checks %}
    {% if service.health_url is defined %}
    echo "🌐 Testing {{ service.name }} endpoint..."
    total_endpoints=$((total_endpoints + 1))
    # Test HTTP endpoint: prints "<status> <seconds>"; status is 000 when no
    # HTTP response was received.
    probe=$(curl -s -o /dev/null -w "%{http_code} %{time_total}" --max-time {{ health_check_timeout }} "{{ service.health_url }}" 2>/dev/null)
    response_code=${probe%% *}
    response_time=${probe#* }
    # Empty output means curl itself could not run.
    response_code=${response_code:-000}
    response_time=${response_time:-timeout}
    if [ "$response_code" = "{{ service.expected_status }}" ]; then
      echo " ✅ HTTP $response_code (${response_time}s): {{ service.health_url }}"
      healthy_endpoints=$((healthy_endpoints + 1))
    else
      echo " ❌ HTTP $response_code (expected {{ service.expected_status }}): {{ service.health_url }}"
      endpoint_issues+=("{{ service.name }}:http_$response_code")
    fi
    {% endif %}
    {% endfor %}
    issue_count="{% raw %}${#endpoint_issues[@]}{% endraw %}"
    echo ""
    echo "📊 ENDPOINT SUMMARY:"
    echo "Total endpoints tested: $total_endpoints"
    echo "Healthy endpoints: $healthy_endpoints"
    echo "Issues found: $issue_count"
    if [ "$issue_count" -gt 0 ]; then
      echo "🚨 ENDPOINT ISSUES:"
      for issue in "${endpoint_issues[@]}"; do
        echo " - $issue"
      done
    fi
  args:
    executable: /bin/bash
  register: endpoint_health
  changed_when: false
# Collect a snapshot of CPU, memory, disk, load, internet reachability and
# Tailscale state. Skipped entirely when include_performance is false, in
# which case system_performance has no stdout.
#
# The memory percentage is computed from byte counts: dividing the
# human-readable `free -h` columns gives a wrong figure whenever "used" and
# "total" are printed in different units (e.g. 512Mi of 15Gi).
- name: Check system resources and performance
  shell: |
    echo "=== SYSTEM PERFORMANCE ==="
    # CPU usage
    cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
    echo "🖥️ CPU Usage: ${cpu_usage}%"
    # Memory usage: human-readable sizes for display, bytes for the percentage
    mem_sizes=$(free -h | awk 'NR==2{printf "%s/%s", $3, $2}')
    mem_percent=$(free -b | awk 'NR==2 && $2 > 0 {printf "%.1f", $3*100/$2}')
    memory_info="Used: ${mem_sizes} (${mem_percent}%)"
    echo "💾 Memory: $memory_info"
    # Disk usage for critical paths
    echo "💿 Disk Usage:"
    df -h / | tail -1 | awk '{printf " Root: %s used (%s)\n", $5, $4}'
    {% if inventory_hostname in ['atlantis', 'calypso'] %}
    # Synology specific checks
    if [ -d "/volume1" ]; then
      df -h /volume1 | tail -1 | awk '{printf " Volume1: %s used (%s)\n", $5, $4}'
    fi
    {% endif %}
    # Load average
    load_avg=$(uptime | awk -F'load average:' '{print $2}')
    echo "⚖️ Load Average:$load_avg"
    # Network connectivity (bounded wait so an unreachable network cannot
    # stall the play)
    echo "🌐 Network:"
    if ping -c 1 -W 5 8.8.8.8 >/dev/null 2>&1; then
      echo " ✅ Internet connectivity"
    else
      echo " ❌ Internet connectivity failed"
    fi
    # Tailscale status (only when the CLI is installed)
    if command -v tailscale >/dev/null 2>&1; then
      tailscale_status=$(tailscale status --json 2>/dev/null | jq -r '.Self.Online' 2>/dev/null || echo "unknown")
      if [ "$tailscale_status" = "true" ]; then
        echo " ✅ Tailscale connected"
      else
        echo " ❌ Tailscale status: $tailscale_status"
      fi
    fi
  register: system_performance
  when: include_performance | bool
  changed_when: false
# Check the backing database / Redis container of each service that has one:
# the container must be running and must answer pg_isready / redis-cli ping.
# Ends with a summary whose "Issues found: N" line is parsed by later tasks.
#
# The service -> dependency mapping lives in the task vars below instead of a
# shell `case` evaluated on an already-templated constant.
# NOTE(review): every database is probed with pg_isready, i.e. assumed to be
# PostgreSQL - confirm for each listed service.
#
# Go templates and the bash array-length expansion are inside raw blocks so
# Jinja2 does not choke on them; the script needs bash for its arrays.
- name: Check critical service dependencies
  vars:
    dependency_db_containers:
      immich-server: "immich-db"
      vaultwarden: "vaultwarden-db"
      authentik-server: "authentik-db"
      paperless-webserver: "paperless-db"
    dependency_redis_containers:
      immich-server: "immich-redis"
  shell: |
    echo "=== SERVICE DEPENDENCIES ==="
    dependency_issues=()
    # Check database connections for services that need them
    {% for service in current_health_checks if service.name in dependency_db_containers %}
    {% set db_container = dependency_db_containers[service.name] %}
    echo "🔍 Checking {{ service.name }} database dependency..."
    if docker ps --filter "name={{ db_container }}" --format '{% raw %}{{.Names}}{% endraw %}' | grep -qxF "{{ db_container }}"; then
      echo " ✅ Database container running: {{ db_container }}"
      # Test database connection
      if docker exec "{{ db_container }}" pg_isready >/dev/null 2>&1; then
        echo " ✅ Database accepting connections"
      else
        echo " ❌ Database not accepting connections"
        dependency_issues+=("{{ service.name }}:database_connection")
      fi
    else
      echo " ❌ Database container not running: {{ db_container }}"
      dependency_issues+=("{{ service.name }}:database_down")
    fi
    {% endfor %}
    # Check Redis dependencies
    {% for service in current_health_checks if service.name in dependency_redis_containers %}
    {% set redis_container = dependency_redis_containers[service.name] %}
    echo "🔍 Checking {{ service.name }} Redis dependency..."
    if docker ps --filter "name={{ redis_container }}" --format '{% raw %}{{.Names}}{% endraw %}' | grep -qxF "{{ redis_container }}"; then
      echo " ✅ Redis container running: {{ redis_container }}"
      # Test Redis connection
      if docker exec "{{ redis_container }}" redis-cli ping 2>/dev/null | grep -q "PONG"; then
        echo " ✅ Redis responding to ping"
      else
        echo " ❌ Redis not responding"
        dependency_issues+=("{{ service.name }}:redis_connection")
      fi
    else
      echo " ❌ Redis container not running: {{ redis_container }}"
      dependency_issues+=("{{ service.name }}:redis_down")
    fi
    {% endfor %}
    issue_count="{% raw %}${#dependency_issues[@]}{% endraw %}"
    echo ""
    echo "📊 DEPENDENCY SUMMARY:"
    echo "Issues found: $issue_count"
    if [ "$issue_count" -gt 0 ]; then
      echo "🚨 DEPENDENCY ISSUES:"
      for issue in "${dependency_issues[@]}"; do
        echo " - $issue"
      done
    fi
  args:
    executable: /bin/bash
  register: dependency_health
  changed_when: false
# Scan the last hour of each running container's logs for error- and
# warning-like lines, flag services with more than 10 error lines, and show
# the three most recent error lines.
#
# The logs are fetched once per service and reused for all three scans, so
# the counts and the excerpt always describe the same snapshot.
# Go templates and the bash array-length expansion are inside raw blocks so
# Jinja2 can template the script; the script needs bash for its arrays.
- name: Analyze service logs for errors
  shell: |
    echo "=== SERVICE LOG ANALYSIS ==="
    log_issues=()
    {% for service in current_health_checks %}
    echo "📝 Analyzing {{ service.name }} logs..."
    if docker ps --filter "name={{ service.container }}" --format '{% raw %}{{.Names}}{% endraw %}' | grep -qxF "{{ service.container }}"; then
      # Get recent logs once and check them for errors / warnings.
      # grep -c prints 0 (and exits non-zero) when nothing matches.
      recent_logs=$(docker logs --since=1h "{{ service.container }}" 2>&1)
      error_count=$(printf '%s\n' "$recent_logs" | grep -icE "(error|exception|failed|fatal|panic)")
      warn_count=$(printf '%s\n' "$recent_logs" | grep -icE "(warn|warning)")
      echo " Errors (1h): $error_count"
      echo " Warnings (1h): $warn_count"
      if [ "$error_count" -gt 10 ]; then
        echo " ⚠️ High error count detected"
        log_issues+=("{{ service.name }}:high_error_count:$error_count")
      elif [ "$error_count" -gt 0 ]; then
        echo " Some errors detected"
      else
        echo " ✅ No errors in recent logs"
      fi
      # Show recent critical errors
      if [ "$error_count" -gt 0 ]; then
        echo " Recent errors:"
        printf '%s\n' "$recent_logs" | grep -iE "(error|exception|failed|fatal|panic)" | tail -3 | sed 's/^/ /'
      fi
    else
      echo " ❌ Container not running"
    fi
    echo ""
    {% endfor %}
    issue_count="{% raw %}${#log_issues[@]}{% endraw %}"
    echo "📊 LOG ANALYSIS SUMMARY:"
    echo "Issues found: $issue_count"
    if [ "$issue_count" -gt 0 ]; then
      echo "🚨 LOG ISSUES:"
      for issue in "${log_issues[@]}"; do
        echo " - $issue"
      done
    fi
  args:
    executable: /bin/bash
  register: log_analysis
  changed_when: false
# Write a plain-text report for this host on the controller, assembled from
# the stdout of the preceding check tasks.
#
# Fixes relative to the earlier version:
#   * A critical service counts as running only if the container check printed
#     a "Container running: <name>" line for it. Testing whether the bare
#     container name occurs in the output is always true, because the name is
#     printed for stopped containers as well.
#   * The per-service status uses an inline expression rather than a trailing
#     {% endif %}: Ansible enables trim_blocks, which swallows the newline
#     after a block tag and glued all service lines together.
#   * include_performance is cast with | bool (a "-e include_performance=false"
#     string is truthy), and the skipped task's missing stdout is defaulted.
- name: Generate comprehensive health report
  copy:
    content: |
      {% set running_lines = container_health.stdout_lines | map('trim') | list %}
      🏥 DEEP SERVICE HEALTH REPORT - {{ inventory_hostname }}
      =====================================================
      📅 Health Check Date: {{ ansible_date_time.iso8601 }}
      🖥️ Host: {{ inventory_hostname }}
      📊 Services Checked: {{ current_health_checks | length }}
      ⏱️ Check Timeout: {{ health_check_timeout }}s
      🐳 DOCKER DAEMON HEALTH:
      {{ docker_health.stdout }}
      📦 CONTAINER HEALTH:
      {{ container_health.stdout }}
      🌐 ENDPOINT HEALTH:
      {{ endpoint_health.stdout }}
      {% if include_performance | bool %}
      📊 SYSTEM PERFORMANCE:
      {{ system_performance.stdout | default('') }}
      {% endif %}
      🔗 SERVICE DEPENDENCIES:
      {{ dependency_health.stdout }}
      📝 LOG ANALYSIS:
      {{ log_analysis.stdout }}
      🎯 CRITICAL SERVICES STATUS:
      {% for service in current_health_checks %}
      {% if service.critical %}
      - {{ service.name }}: {{ '✅ Running' if ('✅ Container running: ' ~ service.container) in running_lines else '❌ Issues' }}
      {% endif %}
      {% endfor %}
      💡 RECOMMENDATIONS:
      {% if 'Issues found: 0' not in container_health.stdout %}
      - 🚨 Address container issues immediately
      {% endif %}
      {% if 'Issues found: 0' not in endpoint_health.stdout %}
      - 🌐 Check service endpoint connectivity
      {% endif %}
      {% if 'Issues found: 0' not in dependency_health.stdout %}
      - 🔗 Resolve service dependency issues
      {% endif %}
      - 📊 Monitor resource usage trends
      - 🔄 Schedule regular health checks
      - 📝 Set up log monitoring alerts
      ✅ HEALTH CHECK COMPLETE
    dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt"
  delegate_to: localhost
# Write a machine-readable status file for this host on the controller.
# overall_status is HEALTHY only when both the container and the endpoint
# checks reported "Issues found: 0"; the alert task reads this file and looks
# for the ISSUES_DETECTED marker.
#
# Fixes relative to the earlier version:
#   * Per-service "status" is "running" only if the container check printed a
#     "Container running: <name>" line. The bare container name is present in
#     that output for stopped containers too, so the old test never said
#     "down".
#   * String fields are emitted through to_json so they are always quoted and
#     escaped as valid JSON.
- name: Create health status JSON for automation
  copy:
    content: |
      {% set running_lines = container_health.stdout_lines | map('trim') | list %}
      {
        "timestamp": {{ ansible_date_time.iso8601 | to_json }},
        "hostname": {{ inventory_hostname | to_json }},
        "health_check_summary": {
          "total_services": {{ current_health_checks | length }},
          "critical_services": {{ current_health_checks | selectattr('critical', 'equalto', true) | list | length }},
          "docker_healthy": {{ 'true' if 'Docker daemon: Running' in docker_health.stdout else 'false' }},
          "overall_status": "{{ 'HEALTHY' if ('Issues found: 0' in container_health.stdout and 'Issues found: 0' in endpoint_health.stdout) else 'ISSUES_DETECTED' }}"
        },
        "services": [
      {% for service in current_health_checks %}
          {
            "name": {{ service.name | to_json }},
            "container": {{ service.container | to_json }},
            "critical": {{ service.critical | bool | to_json }},
            "status": "{{ 'running' if ('✅ Container running: ' ~ service.container) in running_lines else 'down' }}"
          }{{ '' if loop.last else ',' }}
      {% endfor %}
        ]
      }
    dest: "{{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json"
  delegate_to: localhost
# Print a short per-host summary and the paths of the two generated reports.
#
# Fixes relative to the earlier version:
#   * A critical service is "OK" only if the container check printed a
#     "Container running: <name>" line; the bare container name also appears
#     in the output for stopped containers, so the old test was always true.
#   * The status is an inline expression rather than a trailing {% endif %},
#     whose following newline Ansible's trim_blocks setting removes, which
#     ran all service lines together.
- name: Display health check summary
  debug:
    msg: |
      {% set running_lines = container_health.stdout_lines | map('trim') | list %}
      🏥 DEEP HEALTH CHECK COMPLETE - {{ inventory_hostname }}
      ===============================================
      📅 Date: {{ ansible_date_time.date }}
      📊 Services: {{ current_health_checks | length }}
      🎯 CRITICAL SERVICES:
      {% for service in current_health_checks %}
      {% if service.critical %}
      - {{ service.name }}: {{ '✅ OK' if ('✅ Container running: ' ~ service.container) in running_lines else '❌ ISSUES' }}
      {% endif %}
      {% endfor %}
      📊 SUMMARY:
      - Docker: {{ '✅ Healthy' if 'Docker daemon: Running' in docker_health.stdout else '❌ Issues' }}
      - Containers: {{ '✅ All OK' if 'Issues found: 0' in container_health.stdout else '⚠️ Issues Found' }}
      - Endpoints: {{ '✅ All OK' if 'Issues found: 0' in endpoint_health.stdout else '⚠️ Issues Found' }}
      - Dependencies: {{ '✅ All OK' if 'Issues found: 0' in dependency_health.stdout else '⚠️ Issues Found' }}
      📄 Reports:
      - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_report.txt
      - {{ report_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_health_status.json
      🔍 Next Steps:
      - Review detailed report for specific issues
      - Address any critical service problems
      - Schedule regular health monitoring
      ===============================================
# Emit an alert message when alerting is enabled and the status JSON written
# for this host (read back from the controller's filesystem) contains the
# ISSUES_DETECTED marker. The conditions are evaluated in order, so the file
# is only read when alert_on_issues is true.
- name: Send health alerts (if issues detected)
  vars:
    health_status_file: "{{ report_dir + '/' + ansible_date_time.date + '/' + inventory_hostname + '_health_status.json' }}"
  when:
    - alert_on_issues | bool
    - "'ISSUES_DETECTED' in lookup('file', health_status_file)"
  debug:
    msg: |
      🚨 HEALTH ALERT - {{ inventory_hostname }}
      Critical issues detected in service health check!
      Check the detailed report immediately.