Files
homelab-optimized/ansible/automation/playbooks/disaster_recovery_test.yml
Gitea Mirror Bot 1a5ba0c6c9
Some checks failed
Documentation / Build Docusaurus (push) Failing after 5m0s
Documentation / Deploy to GitHub Pages (push) Has been skipped
Sanitized mirror from private repository - 2026-04-06 02:32:18 UTC
2026-04-06 02:32:18 +00:00

522 lines
21 KiB
YAML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
---
# Disaster Recovery Test Playbook
# Test disaster recovery procedures and validate backup integrity
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml -e "test_type=full"
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml -e "dry_run=true"
- name: Disaster Recovery Test and Validation
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true
  vars:
    # Run-time tunables. Override any of them with -e/--extra-vars, which
    # always outranks play vars. They are plain defaults on purpose: a var
    # defined as "{{ itself | default(...) }}" references itself and makes
    # Ansible abort with a recursive-template error when no extra-var is set.
    test_type: basic  # basic, full, restore
    dry_run: true
    backup_base_dir: "/volume1/backups"
    test_restore_dir: "/tmp/dr_test"
    validate_backups: true
    test_failover: false
    # Critical services for DR testing, keyed by inventory hostname.
    # recovery_priority 1 is the most urgent tier.
    critical_services:
      atlantis:
        - name: "immich"
          containers: ["immich-server", "immich-db", "immich-redis"]
          data_paths: ["/volume1/docker/immich"]
          backup_files: ["immich-db_*.sql.gz"]
          recovery_priority: 1
        - name: "vaultwarden"
          containers: ["vaultwarden", "vaultwarden-db"]
          data_paths: ["/volume1/docker/vaultwarden"]
          backup_files: ["vaultwarden-db_*.sql.gz"]
          recovery_priority: 1
        - name: "plex"
          containers: ["plex"]
          data_paths: ["/volume1/docker/plex"]
          backup_files: ["docker_configs_*.tar.gz"]
          recovery_priority: 2
      calypso:
        - name: "authentik"
          containers: ["authentik-server", "authentik-worker", "authentik-db"]
          data_paths: ["/volume1/docker/authentik"]
          backup_files: ["authentik-db_*.sql.gz"]
          recovery_priority: 1
      homelab_vm:
        - name: "monitoring"
          containers: ["grafana", "prometheus"]
          data_paths: ["/opt/docker/grafana", "/opt/docker/prometheus"]
          backup_files: ["docker_configs_*.tar.gz"]
          recovery_priority: 2
  tasks:
# Per-run working directory, named after the date fact gathered for the host.
- name: Create DR test directory
  file:
    state: directory
    mode: "0755"
    path: "{{ test_restore_dir }}/{{ ansible_date_time.date }}"
# Look up this host's entry in critical_services; hosts without one get [].
- name: Get current critical services for this host
  set_fact:
    current_critical_services: >-
      {{ critical_services[inventory_hostname] | default([]) }}
# Print the effective settings and the services selected for this host
# before any test step runs.
- name: Display DR test plan
  debug:
    msg: |
      🚨 DISASTER RECOVERY TEST PLAN
      ===============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔍 Test Type: {{ test_type }}
      🧪 Dry Run: {{ dry_run }}
      💾 Validate Backups: {{ validate_backups }}
      🔄 Test Failover: {{ test_failover }}
      🎯 Critical Services: {{ current_critical_services | length }}
      {% for svc in current_critical_services %}
      - {{ svc.name }} (Priority {{ svc.recovery_priority }})
      {% endfor %}
# Writes a text snapshot (uptime, disk usage, running containers, per-service
# container state) into the per-run directory and echoes it so the report
# task can embed pre_test_snapshot.stdout.
- name: Pre-DR test system snapshot
  shell: |
    snapshot_file="{{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_pre_test_snapshot.txt"
    {
      echo "🚨 DISASTER RECOVERY PRE-TEST SNAPSHOT"
      echo "======================================="
      echo "Host: {{ inventory_hostname }}"
      echo "Date: {{ ansible_date_time.iso8601 }}"
      echo "Test Type: {{ test_type }}"
      echo ""
      echo "=== SYSTEM STATUS ==="
      echo "Uptime: $(uptime)"
      echo "Disk Usage:"
      df -h
      echo ""
      echo "=== RUNNING CONTAINERS ==="
      # The Docker format string is a Go template; it must be wrapped in
      # raw/endraw or Jinja tries to parse the braces and templating fails.
      docker ps --format "table {% raw %}{{.Names}}\t{{.Status}}\t{{.Image}}{% endraw %}" 2>/dev/null || echo "Docker not available"
      echo ""
      echo "=== CRITICAL SERVICES STATUS ==="
    {% for service in current_critical_services %}
      echo "--- {{ service.name }} ---"
    {% for container in service.containers %}
      if docker ps --filter "name={{ container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ container }}"; then
        echo "✅ {{ container }}: Running"
      else
        echo "❌ {{ container }}: Not running"
      fi
    {% endfor %}
      echo ""
    {% endfor %}
    } > "$snapshot_file"
    cat "$snapshot_file"
  register: pre_test_snapshot
  # Only gathers information; never report a change.
  changed_when: false
# For every backup pattern of every critical service: look for files modified
# within the last 7 days (at most five per pattern), test archive integrity
# and report the age. Later tasks search the output for the marker
# "BACKUP ISSUES DETECTED", so that string must stay stable.
- name: Validate backup availability and integrity
  shell: |
    echo "🔍 BACKUP VALIDATION"
    echo "===================="
    validation_results=()
    total_backups=0
    valid_backups=0
    # Patterns without any recent backup; counted as issues in the summary.
    missing_patterns=0
    {% for service in current_critical_services %}
    echo "📦 Validating {{ service.name }} backups..."
    {% for backup_pattern in service.backup_files %}
    echo " Checking pattern: {{ backup_pattern }}"
    # Find backup files matching pattern
    backup_files=$(find "{{ backup_base_dir }}/{{ inventory_hostname }}" -name "{{ backup_pattern }}" -mtime -7 2>/dev/null | head -5)
    if [ -n "$backup_files" ]; then
      # Read line by line so paths containing spaces stay intact.
      while IFS= read -r backup_file; do
        backup_name=$(basename "$backup_file")
        total_backups=$((total_backups + 1))
        echo " Found: $backup_name"
        # Validate backup integrity
        if [[ "$backup_file" == *.gz ]]; then
          if gzip -t "$backup_file" 2>/dev/null; then
            echo " ✅ Integrity: Valid"
            valid_backups=$((valid_backups + 1))
            validation_results+=("{{ service.name }}:$backup_name:valid")
          else
            echo " ❌ Integrity: Corrupted"
            validation_results+=("{{ service.name }}:$backup_name:corrupted")
          fi
        elif [[ "$backup_file" == *.tar* ]]; then
          if tar -tf "$backup_file" >/dev/null 2>&1; then
            echo " ✅ Integrity: Valid"
            valid_backups=$((valid_backups + 1))
            validation_results+=("{{ service.name }}:$backup_name:valid")
          else
            echo " ❌ Integrity: Corrupted"
            validation_results+=("{{ service.name }}:$backup_name:corrupted")
          fi
        else
          echo " Integrity: Cannot validate format"
          valid_backups=$((valid_backups + 1)) # Assume valid
          validation_results+=("{{ service.name }}:$backup_name:assumed_valid")
        fi
        # Check backup age
        backup_age=$(find "$backup_file" -mtime +1 | wc -l)
        if [ "$backup_age" -eq 0 ]; then
          echo " ✅ Age: Recent (< 1 day)"
        else
          backup_days=$(( ($(date +%s) - $(stat -c %Y "$backup_file")) / 86400 ))
          echo " ⚠️ Age: $backup_days days old"
        fi
      done <<< "$backup_files"
    else
      echo " ❌ No backups found for pattern: {{ backup_pattern }}"
      missing_patterns=$((missing_patterns + 1))
      validation_results+=("{{ service.name }}:{{ backup_pattern }}:not_found")
    fi
    {% endfor %}
    echo ""
    {% endfor %}
    # A missing backup is as much a DR problem as a corrupted one, so both
    # feed the issue count that triggers the alert marker.
    total_issues=$((total_backups - valid_backups + missing_patterns))
    echo "📊 BACKUP VALIDATION SUMMARY:"
    echo "Total backups checked: $total_backups"
    echo "Valid backups: $valid_backups"
    echo "Validation issues: $total_issues"
    if [ "$total_issues" -gt 0 ]; then
      echo "🚨 BACKUP ISSUES DETECTED!"
      for result in "${validation_results[@]}"; do
        if [[ "$result" == *":corrupted" ]] || [[ "$result" == *":not_found" ]]; then
          echo " - $result"
        fi
      done
    fi
  args:
    # Arrays, [[ ]] and here-strings are bash features; /bin/sh may be dash.
    executable: /bin/bash
  register: backup_validation
  # Read-only checks; never report a change.
  changed_when: false
  when: validate_backups | bool
# For each service that has an SQL backup pattern, locate the newest dump
# from the last 7 days. In dry-run mode only the plan is printed; otherwise
# the dump is restored into a throw-away database inside the service's db
# container and the database is dropped again afterwards.
- name: Test database backup restore (dry run)
  shell: |
    echo "🔄 DATABASE RESTORE TEST"
    echo "========================"
    restore_results=()
    {% for service in current_critical_services %}
    {% if service.backup_files | select('match', '.*sql.*') | list | length > 0 %}
    echo "🗄️ Testing {{ service.name }} database restore..."
    # Find latest database backup
    latest_backup=$(find "{{ backup_base_dir }}/{{ inventory_hostname }}" -name "*{{ service.name }}*db*.sql*" -mtime -7 2>/dev/null | sort -t_ -k2 -nr | head -1)
    if [ -n "$latest_backup" ]; then
      echo " Using backup: $(basename "$latest_backup")"
    {# "| bool" matters: -e "dry_run=false" arrives as a non-empty string #}
    {% if dry_run | bool %}
      echo " DRY RUN: Would restore database from $latest_backup"
      echo " DRY RUN: Would create test database for validation"
      restore_results+=("{{ service.name }}:dry_run_success")
    {% else %}
      # Create test database and restore
      test_db_name="dr_test_{{ service.name }}_{{ ansible_date_time.epoch }}"
      # Database container: the first container whose name contains "db".
      # It is chosen while templating, because a shell "break" here would sit
      # outside any shell loop and not stop at the first match.
      db_container="{{ service.containers | select('search', 'db') | first | default('') }}"
      # The Docker format string is a Go template and needs raw/endraw.
      if [ -n "$db_container" ] && docker ps --filter "name=$db_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "$db_container"; then
        echo " Creating test database: $test_db_name"
        # Create test database
        if docker exec "$db_container" createdb -U postgres "$test_db_name" 2>/dev/null; then
          echo " ✅ Test database created"
          # Restore backup to test database
          if [[ "$latest_backup" == *.gz ]]; then
            if gunzip -c "$latest_backup" | docker exec -i "$db_container" psql -U postgres -d "$test_db_name" >/dev/null 2>&1; then
              echo " ✅ Backup restored successfully"
              restore_results+=("{{ service.name }}:restore_success")
            else
              echo " ❌ Backup restore failed"
              restore_results+=("{{ service.name }}:restore_failed")
            fi
          else
            if docker exec -i "$db_container" psql -U postgres -d "$test_db_name" < "$latest_backup" >/dev/null 2>&1; then
              echo " ✅ Backup restored successfully"
              restore_results+=("{{ service.name }}:restore_success")
            else
              echo " ❌ Backup restore failed"
              restore_results+=("{{ service.name }}:restore_failed")
            fi
          fi
          # Cleanup test database
          docker exec "$db_container" dropdb -U postgres "$test_db_name" 2>/dev/null
          echo " 🧹 Test database cleaned up"
        else
          echo " ❌ Failed to create test database"
          restore_results+=("{{ service.name }}:test_db_failed")
        fi
      else
        echo " ❌ Database container not found or not running"
        restore_results+=("{{ service.name }}:db_container_unavailable")
      fi
    {% endif %}
    else
      echo " ❌ No database backup found"
      restore_results+=("{{ service.name }}:no_backup_found")
    fi
    echo ""
    {% endif %}
    {% endfor %}
    echo "📊 RESTORE TEST SUMMARY:"
    for result in "${restore_results[@]}"; do
      echo " - $result"
    done
  args:
    # Arrays and [[ ]] are bash features; /bin/sh may be dash.
    executable: /bin/bash
  register: restore_test
  when: test_type in ['full', 'restore']
# Dry run: print the failover plan for every critical service.
# Live run: pick the first service with recovery_priority > 1 as the test
# subject. The live procedure itself is still a placeholder.
- name: Test service failover procedures
  shell: |
    echo "🔄 SERVICE FAILOVER TEST"
    echo "========================"
    failover_results=()
    {# "| bool" matters: -e "dry_run=false" arrives as a non-empty string #}
    {% if dry_run | bool %}
    echo "DRY RUN: Failover test simulation"
    {% for service in current_critical_services %}
    echo "📋 {{ service.name }} failover plan:"
    echo " 1. Stop containers: {{ service.containers | join(', ') }}"
    echo " 2. Backup current data"
    echo " 3. Restore from backup"
    echo " 4. Start containers"
    echo " 5. Verify service functionality"
    failover_results+=("{{ service.name }}:dry_run_planned")
    echo ""
    {% endfor %}
    {% else %}
    echo "⚠️ LIVE FAILOVER TEST - This will temporarily stop services!"
    # Only test one non-critical service to avoid disruption. The choice is
    # made while templating: a shell "break" here would sit outside any
    # shell loop, so the last candidate would win instead of the first.
    test_service="{{ current_critical_services | selectattr('recovery_priority', 'gt', 1) | map(attribute='name') | first | default('') }}"
    if [ -n "$test_service" ]; then
      echo "Testing failover for: $test_service"
      # Implementation would go here for actual failover test
      failover_results+=("$test_service:live_test_completed")
    else
      echo "No suitable service found for live failover test"
      failover_results+=("no_service:live_test_skipped")
    fi
    {% endif %}
    echo "📊 FAILOVER TEST SUMMARY:"
    for result in "${failover_results[@]}"; do
      echo " - $result"
    done
  args:
    # Arrays are a bash feature; /bin/sh may be dash.
    executable: /bin/bash
  register: failover_test
  when: test_failover | bool
# Rough per-service RTO estimate: 30s per container, plus the database dump
# size at an assumed 10MB/s, plus data directories over 1000MB at an assumed
# 50MB/s. The estimate is compared with a target derived from
# recovery_priority. The report searches the output for "rto_exceeded".
- name: Test recovery time objectives (RTO)
  shell: |
    echo "⏱️ RECOVERY TIME OBJECTIVES TEST"
    echo "================================="
    rto_results=()
    {% for service in current_critical_services %}
    echo "📊 {{ service.name }} RTO Analysis:"
    # Estimate recovery times based on service complexity
    estimated_rto=0
    # Base time for container startup
    container_count={{ service.containers | length }}
    estimated_rto=$((estimated_rto + container_count * 30)) # 30s per container
    # Add time for database restore if applicable
    {% if service.backup_files | select('match', '.*sql.*') | list | length > 0 %}
    # Find backup size to estimate restore time
    latest_backup=$(find "{{ backup_base_dir }}/{{ inventory_hostname }}" -name "*{{ service.name }}*db*.sql*" -mtime -7 2>/dev/null | sort -t_ -k2 -nr | head -1)
    if [ -n "$latest_backup" ]; then
      backup_size_mb=$(du -m "$latest_backup" 2>/dev/null | cut -f1)
      backup_size_mb=${backup_size_mb:-0}
      restore_time=$((backup_size_mb / 10)) # Assume 10MB/s restore speed
      estimated_rto=$((estimated_rto + restore_time))
      echo " Database backup size: ${backup_size_mb}MB"
      echo " Estimated restore time: ${restore_time}s"
    fi
    {% endif %}
    # Add time for data volume restore
    {% for data_path in service.data_paths %}
    if [ -d "{{ data_path }}" ]; then
      data_size_mb=$(du -sm "{{ data_path }}" 2>/dev/null | cut -f1)
      # "du | cut || echo 0" never falls back because cut succeeds on empty
      # input; default explicitly so the numeric test below cannot break.
      data_size_mb=${data_size_mb:-0}
      if [ "$data_size_mb" -gt 1000 ]; then # Only count large data directories
        data_restore_time=$((data_size_mb / 50)) # Assume 50MB/s for file copy
        estimated_rto=$((estimated_rto + data_restore_time))
        echo " Data directory {{ data_path }}: ${data_size_mb}MB"
      fi
    fi
    {% endfor %}
    echo " Estimated RTO: ${estimated_rto}s ($(echo "scale=1; $estimated_rto/60" | bc 2>/dev/null || echo "N/A")m)"
    # Define RTO targets
    target_rto=0
    case {{ service.recovery_priority }} in
      1) target_rto=900 ;; # 15 minutes for critical services
      2) target_rto=1800 ;; # 30 minutes for important services
      *) target_rto=3600 ;; # 1 hour for other services
    esac
    echo " Target RTO: ${target_rto}s ($(echo "scale=1; $target_rto/60" | bc 2>/dev/null || echo "N/A")m)"
    if [ "$estimated_rto" -le "$target_rto" ]; then
      echo " ✅ RTO within target"
      rto_results+=("{{ service.name }}:rto_ok:${estimated_rto}s")
    else
      echo " ⚠️ RTO exceeds target"
      rto_results+=("{{ service.name }}:rto_exceeded:${estimated_rto}s")
    fi
    echo ""
    {% endfor %}
    echo "📊 RTO ANALYSIS SUMMARY:"
    for result in "${rto_results[@]}"; do
      echo " - $result"
    done
  args:
    # Arrays are a bash feature; /bin/sh may be dash.
    executable: /bin/bash
  register: rto_analysis
  # Pure measurement; never report a change.
  changed_when: false
# Assemble the outputs of the previous tasks into one text report in the
# per-run directory. Tasks skipped by their "when" register a result without
# stdout, so each output is read through default('') to avoid an
# undefined-attribute error (for example on the default test_type=basic run).
# A skipped check therefore counts as passed in the readiness score.
- name: Generate DR test report
  copy:
    dest: "{{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_dr_test_report.txt"
    mode: "0644"
    content: |
      {% set backup_out = backup_validation.stdout | default('') %}
      {% set restore_out = restore_test.stdout | default('') %}
      {% set failover_out = failover_test.stdout | default('') %}
      🚨 DISASTER RECOVERY TEST REPORT - {{ inventory_hostname }}
      ========================================================
      📅 Test Date: {{ ansible_date_time.iso8601 }}
      🖥️ Host: {{ inventory_hostname }}
      🔍 Test Type: {{ test_type }}
      🧪 Dry Run: {{ dry_run }}
      🎯 CRITICAL SERVICES TESTED: {{ current_critical_services | length }}
      {% for service in current_critical_services %}
      - {{ service.name }} (Priority {{ service.recovery_priority }})
      Containers: {{ service.containers | join(', ') }}
      Data Paths: {{ service.data_paths | join(', ') }}
      {% endfor %}
      📊 PRE-TEST SYSTEM STATUS:
      {{ pre_test_snapshot.stdout }}
      {% if validate_backups | bool %}
      💾 BACKUP VALIDATION:
      {{ backup_out }}
      {% endif %}
      {% if test_type in ['full', 'restore'] %}
      🔄 RESTORE TESTING:
      {{ restore_out }}
      {% endif %}
      {% if test_failover | bool %}
      🔄 FAILOVER TESTING:
      {{ failover_out }}
      {% endif %}
      ⏱️ RTO ANALYSIS:
      {{ rto_analysis.stdout }}
      💡 RECOMMENDATIONS:
      {% if 'BACKUP ISSUES DETECTED' in backup_out %}
      - 🚨 CRITICAL: Fix backup integrity issues immediately
      {% endif %}
      {% if 'restore_failed' in restore_out %}
      - 🚨 CRITICAL: Database restore failures need investigation
      {% endif %}
      {% if 'rto_exceeded' in rto_analysis.stdout %}
      - ⚠️ Optimize recovery procedures to meet RTO targets
      {% endif %}
      - 📅 Schedule regular DR tests (monthly recommended)
      - 📋 Update DR procedures based on test results
      - 🎓 Train team on DR procedures
      - 📊 Monitor backup success rates
      - 🔄 Test failover procedures in staging environment
      🎯 DR READINESS SCORE:
      {% set total_checks = 4 %}
      {% set passed_checks = 0 %}
      {% if 'BACKUP ISSUES DETECTED' not in backup_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
      {% if 'restore_failed' not in restore_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
      {% if 'rto_exceeded' not in rto_analysis.stdout %}{% set passed_checks = passed_checks + 1 %}{% endif %}
      {% set passed_checks = passed_checks + 1 %} {# Always pass system status #}
      Score: {{ passed_checks }}/{{ total_checks }} ({{ (passed_checks * 100 / total_checks) | round }}%)
      {% if passed_checks == total_checks %}
      ✅ EXCELLENT: DR procedures are ready
      {% elif passed_checks >= 3 %}
      🟡 GOOD: Minor improvements needed
      {% else %}
      🔴 NEEDS WORK: Significant DR issues detected
      {% endif %}
      ✅ DR TEST COMPLETE
# Console summary of the run. Flags are cast with "| bool" because values
# passed via -e key=value arrive as strings ("false" would otherwise be
# truthy), and task outputs are read through default('') because skipped
# tasks register no stdout.
- name: Display DR test summary
  debug:
    msg: |
      🚨 DISASTER RECOVERY TEST COMPLETE - {{ inventory_hostname }}
      ======================================================
      📅 Date: {{ ansible_date_time.date }}
      🔍 Test Type: {{ test_type }}
      🧪 Mode: {{ 'Dry Run' if (dry_run | bool) else 'Live Test' }}
      🎯 CRITICAL SERVICES: {{ current_critical_services | length }}
      📊 TEST RESULTS:
      {% if validate_backups | bool %}
      - Backup Validation: {{ '✅ Passed' if 'BACKUP ISSUES DETECTED' not in (backup_validation.stdout | default('')) else '❌ Issues Found' }}
      {% endif %}
      {% if test_type in ['full', 'restore'] %}
      - Restore Testing: {{ '✅ Passed' if 'restore_failed' not in (restore_test.stdout | default('')) else '❌ Issues Found' }}
      {% endif %}
      - RTO Analysis: {{ '✅ Within Targets' if 'rto_exceeded' not in rto_analysis.stdout else '⚠️ Exceeds Targets' }}
      📄 Full report: {{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_dr_test_report.txt
      🔍 Next Steps:
      {% if dry_run | bool %}
      - Run live test: -e "dry_run=false"
      {% endif %}
      - Address any identified issues
      - Update DR procedures
      - Schedule regular DR tests
      ======================================================
# Emits an alert message when send_alerts is enabled and either the backup
# validation or the restore test reported a problem. Both tasks may have been
# skipped, in which case their results carry no stdout; default('') keeps the
# condition from failing on an undefined attribute.
- name: Send DR test alerts (if issues found)
  debug:
    msg: |
      🚨 DR TEST ALERT - {{ inventory_hostname }}
      Critical issues found in disaster recovery test!
      Immediate attention required.
  when:
    - send_alerts | default(false) | bool
    - >-
      'BACKUP ISSUES DETECTED' in (backup_validation.stdout | default(''))
      or 'restore_failed' in (restore_test.stdout | default(''))