---
# Disaster Recovery Test Playbook
# Test disaster recovery procedures and validate backup integrity
#
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml -e "test_type=full"
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml -e "dry_run=true"
#
# Optional caller-supplied variables:
#   host_target       inventory pattern to run against (default: all)
#   test_type         basic | full | restore (default: basic)
#   dry_run           when true, restore/failover steps are only simulated
#                     (default: true)
#   validate_backups  run the backup integrity task (default: true)
#   test_failover     run the failover task (default: false)
#   send_alerts       emit the alert message when issues are found
#                     (default: false)

- name: Disaster Recovery Test and Validation
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true

  vars:
    # Resolved copies of the caller-supplied options. They deliberately use
    # different names from the inputs: a play var that templates itself
    # (test_type: "{{ test_type | default(...) }}") fails with a recursive
    # template error whenever the caller does not pass the variable.
    # Booleans are coerced because "-e dry_run=false" arrives as a string.
    dr_test_type: "{{ test_type | default('basic') }}"
    dr_dry_run: "{{ dry_run | default(true) | bool }}"
    dr_validate_backups: "{{ validate_backups | default(true) | bool }}"
    dr_test_failover: "{{ test_failover | default(false) | bool }}"

    backup_base_dir: "/volume1/backups"
    test_restore_dir: "/tmp/dr_test"

    # Critical services for DR testing, keyed by inventory hostname.
    # Hosts without an entry are treated as having no critical services.
    critical_services:
      atlantis:
        - name: "immich"
          containers: ["immich-server", "immich-db", "immich-redis"]
          data_paths: ["/volume1/docker/immich"]
          backup_files: ["immich-db_*.sql.gz"]
          recovery_priority: 1
        - name: "vaultwarden"
          containers: ["vaultwarden", "vaultwarden-db"]
          data_paths: ["/volume1/docker/vaultwarden"]
          backup_files: ["vaultwarden-db_*.sql.gz"]
          recovery_priority: 1
        - name: "plex"
          containers: ["plex"]
          data_paths: ["/volume1/docker/plex"]
          backup_files: ["docker_configs_*.tar.gz"]
          recovery_priority: 2
      calypso:
        - name: "authentik"
          containers: ["authentik-server", "authentik-worker", "authentik-db"]
          data_paths: ["/volume1/docker/authentik"]
          backup_files: ["authentik-db_*.sql.gz"]
          recovery_priority: 1
      homelab_vm:
        - name: "monitoring"
          containers: ["grafana", "prometheus"]
          data_paths: ["/opt/docker/grafana", "/opt/docker/prometheus"]
          backup_files: ["docker_configs_*.tar.gz"]
          recovery_priority: 2

  tasks:
    - name: Create DR test directory
      file:
        path: "{{ test_restore_dir }}/{{ ansible_date_time.date }}"
        state: directory
        mode: '0755'

    - name: Get current critical services for this host
      set_fact:
        current_critical_services: "{{ critical_services.get(inventory_hostname, []) }}"

    - name: Display DR test plan
      debug:
        msg: |
          🚨 DISASTER RECOVERY TEST PLAN
          ===============================

          🖥️ Host: {{ inventory_hostname }}
          📅 Date: {{ ansible_date_time.date }}
          🔍 Test Type: {{ dr_test_type }}
          🧪 Dry Run: {{ dr_dry_run }}
          💾 Validate Backups: {{ dr_validate_backups }}
          🔄 Test Failover: {{ dr_test_failover }}

          🎯 Critical Services: {{ current_critical_services | length }}
          {% for service in current_critical_services %}
          - {{ service.name }} (Priority {{ service.recovery_priority }})
          {% endfor %}

    # Writes a host status snapshot next to the report and echoes it back so
    # it can be embedded in the report via the registered stdout.
    - name: Pre-DR test system snapshot
      shell: |
        snapshot_file="{{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_pre_test_snapshot.txt"

        echo "🚨 DISASTER RECOVERY PRE-TEST SNAPSHOT" > "$snapshot_file"
        echo "=======================================" >> "$snapshot_file"
        echo "Host: {{ inventory_hostname }}" >> "$snapshot_file"
        echo "Date: {{ ansible_date_time.iso8601 }}" >> "$snapshot_file"
        echo "Test Type: {{ dr_test_type }}" >> "$snapshot_file"
        echo "" >> "$snapshot_file"

        echo "=== SYSTEM STATUS ===" >> "$snapshot_file"
        echo "Uptime: $(uptime)" >> "$snapshot_file"
        echo "Disk Usage:" >> "$snapshot_file"
        df -h >> "$snapshot_file"
        echo "" >> "$snapshot_file"

        echo "=== RUNNING CONTAINERS ===" >> "$snapshot_file"
        # The Go-template braces are wrapped in raw/endraw so Jinja passes
        # them through to docker instead of trying to evaluate them.
        docker ps --format "table {% raw %}{{.Names}}\t{{.Status}}\t{{.Image}}{% endraw %}" >> "$snapshot_file" 2>/dev/null || echo "Docker not available" >> "$snapshot_file"
        echo "" >> "$snapshot_file"

        echo "=== CRITICAL SERVICES STATUS ===" >> "$snapshot_file"
        {% for service in current_critical_services %}
        echo "--- {{ service.name }} ---" >> "$snapshot_file"
        {% for container in service.containers %}
        # Exact-line match: the docker name filter is a substring match, so
        # "vaultwarden" would otherwise be satisfied by "vaultwarden-db".
        if docker ps --filter "name={{ container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -Fxq "{{ container }}"; then
          echo "✅ {{ container }}: Running" >> "$snapshot_file"
        else
          echo "❌ {{ container }}: Not running" >> "$snapshot_file"
        fi
        {% endfor %}
        echo "" >> "$snapshot_file"
        {% endfor %}

        cat "$snapshot_file"
      register: pre_test_snapshot
      changed_when: false

    # Checks that every backup pattern of every critical service has at least
    # one archive from the last 7 days and that each archive passes an
    # integrity test. Prints "BACKUP ISSUES DETECTED" when an archive is
    # corrupted or a pattern has no recent backup; later tasks key off that
    # marker.
    - name: Validate backup availability and integrity
      shell: |
        echo "🔍 BACKUP VALIDATION"
        echo "===================="

        validation_results=()
        total_backups=0
        valid_backups=0
        missing_patterns=0

        {% for service in current_critical_services %}
        echo "📦 Validating {{ service.name }} backups..."

        {% for backup_pattern in service.backup_files %}
        echo " Checking pattern: {{ backup_pattern }}"

        # Find backup files matching pattern (at most 5, last 7 days)
        backup_files=$(find {{ backup_base_dir }}/{{ inventory_hostname }} -name "{{ backup_pattern }}" -mtime -7 2>/dev/null | head -5)

        if [ -n "$backup_files" ]; then
          for backup_file in $backup_files; do
            total_backups=$((total_backups + 1))
            echo " Found: $(basename "$backup_file")"

            # Validate backup integrity
            if [[ "$backup_file" == *.gz ]]; then
              if gzip -t "$backup_file" 2>/dev/null; then
                echo " ✅ Integrity: Valid"
                valid_backups=$((valid_backups + 1))
                validation_results+=("{{ service.name }}:$(basename "$backup_file"):valid")
              else
                echo " ❌ Integrity: Corrupted"
                validation_results+=("{{ service.name }}:$(basename "$backup_file"):corrupted")
              fi
            elif [[ "$backup_file" == *.tar* ]]; then
              if tar -tf "$backup_file" >/dev/null 2>&1; then
                echo " ✅ Integrity: Valid"
                valid_backups=$((valid_backups + 1))
                validation_results+=("{{ service.name }}:$(basename "$backup_file"):valid")
              else
                echo " ❌ Integrity: Corrupted"
                validation_results+=("{{ service.name }}:$(basename "$backup_file"):corrupted")
              fi
            else
              echo " ℹ️ Integrity: Cannot validate format"
              valid_backups=$((valid_backups + 1))  # Assume valid
              validation_results+=("{{ service.name }}:$(basename "$backup_file"):assumed_valid")
            fi

            # Check backup age. "-mtime +0" matches files older than 24h;
            # "+1" would only match files older than 48h.
            backup_age=$(find "$backup_file" -mtime +0 | wc -l)
            if [ "$backup_age" -eq 0 ]; then
              echo " ✅ Age: Recent (< 1 day)"
            else
              backup_days=$(( ($(date +%s) - $(stat -c %Y "$backup_file")) / 86400 ))
              echo " ⚠️ Age: $backup_days days old"
            fi
          done
        else
          echo " ❌ No backups found for pattern: {{ backup_pattern }}"
          missing_patterns=$((missing_patterns + 1))
          validation_results+=("{{ service.name }}:{{ backup_pattern }}:not_found")
        fi
        {% endfor %}
        echo ""
        {% endfor %}

        echo "📊 BACKUP VALIDATION SUMMARY:"
        echo "Total backups checked: $total_backups"
        echo "Valid backups: $valid_backups"
        echo "Validation issues: $((total_backups - valid_backups))"
        echo "Backup patterns with no recent backup: $missing_patterns"

        # A missing backup is as much a DR failure as a corrupted one, so
        # both conditions raise the issue marker.
        if [ "$valid_backups" -lt "$total_backups" ] || [ "$missing_patterns" -gt 0 ]; then
          echo "🚨 BACKUP ISSUES DETECTED!"
          for result in "${validation_results[@]}"; do
            if [[ "$result" == *":corrupted" ]] || [[ "$result" == *":not_found" ]]; then
              echo " - $result"
            fi
          done
        fi
      args:
        # The script uses arrays and [[ ]], which /bin/sh does not guarantee.
        executable: /bin/bash
      register: backup_validation
      changed_when: false
      when: dr_validate_backups | bool

    # For each service that has an SQL backup pattern: locate the newest dump
    # and either describe the restore (dry run) or restore it into a
    # throw-away database inside the service's db container, then drop it.
    # The live path uses createdb/psql/dropdb as user "postgres".
    - name: Test database backup restore (dry run)
      shell: |
        echo "🔄 DATABASE RESTORE TEST"
        echo "========================"

        restore_results=()

        {% for service in current_critical_services %}
        {% if service.backup_files | select('match', '.*sql.*') | list | length > 0 %}
        echo "🗄️ Testing {{ service.name }} database restore..."

        # Find latest database backup
        latest_backup=$(find {{ backup_base_dir }}/{{ inventory_hostname }} -name "*{{ service.name }}*db*.sql*" -mtime -7 2>/dev/null | sort -t_ -k2 -nr | head -1)

        if [ -n "$latest_backup" ]; then
          echo " Using backup: $(basename "$latest_backup")"

          {% if dr_dry_run | bool %}
          echo " DRY RUN: Would restore database from $latest_backup"
          echo " DRY RUN: Would create test database for validation"
          restore_results+=("{{ service.name }}:dry_run_success")
          {% else %}
          # Create test database and restore
          test_db_name="dr_test_{{ service.name }}_{{ ansible_date_time.epoch }}"

          # Database container: first configured container whose name
          # contains "db" (empty when the service has none).
          db_container="{{ service.containers | select('search', 'db') | list | first | default('') }}"

          if [ -n "$db_container" ] && docker ps --filter "name=$db_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -Fxq "$db_container"; then
            echo " Creating test database: $test_db_name"

            # Create test database
            if docker exec "$db_container" createdb -U postgres "$test_db_name" 2>/dev/null; then
              echo " ✅ Test database created"

              # Restore backup to test database
              if [[ "$latest_backup" == *.gz ]]; then
                if gunzip -c "$latest_backup" | docker exec -i "$db_container" psql -U postgres -d "$test_db_name" >/dev/null 2>&1; then
                  echo " ✅ Backup restored successfully"
                  restore_results+=("{{ service.name }}:restore_success")
                else
                  echo " ❌ Backup restore failed"
                  restore_results+=("{{ service.name }}:restore_failed")
                fi
              else
                if docker exec -i "$db_container" psql -U postgres -d "$test_db_name" < "$latest_backup" >/dev/null 2>&1; then
                  echo " ✅ Backup restored successfully"
                  restore_results+=("{{ service.name }}:restore_success")
                else
                  echo " ❌ Backup restore failed"
                  restore_results+=("{{ service.name }}:restore_failed")
                fi
              fi

              # Cleanup test database
              docker exec "$db_container" dropdb -U postgres "$test_db_name" 2>/dev/null
              echo " 🧹 Test database cleaned up"
            else
              echo " ❌ Failed to create test database"
              restore_results+=("{{ service.name }}:test_db_failed")
            fi
          else
            echo " ❌ Database container not found or not running"
            restore_results+=("{{ service.name }}:db_container_unavailable")
          fi
          {% endif %}
        else
          echo " ❌ No database backup found"
          restore_results+=("{{ service.name }}:no_backup_found")
        fi

        echo ""
        {% endif %}
        {% endfor %}

        echo "📊 RESTORE TEST SUMMARY:"
        for result in "${restore_results[@]}"; do
          echo " - $result"
        done
      args:
        executable: /bin/bash
      register: restore_test
      when: dr_test_type in ['full', 'restore']

    # Dry run prints the failover plan per service. The live branch only
    # selects a candidate service and records the outcome; the actual
    # stop/restore/start steps are not implemented here.
    - name: Test service failover procedures
      shell: |
        echo "🔄 SERVICE FAILOVER TEST"
        echo "========================"

        failover_results=()

        {% if dr_dry_run | bool %}
        echo "DRY RUN: Failover test simulation"

        {% for service in current_critical_services %}
        echo "📋 {{ service.name }} failover plan:"
        echo " 1. Stop containers: {{ service.containers | join(', ') }}"
        echo " 2. Backup current data"
        echo " 3. Restore from backup"
        echo " 4. Start containers"
        echo " 5. Verify service functionality"
        failover_results+=("{{ service.name }}:dry_run_planned")
        echo ""
        {% endfor %}
        {% else %}
        echo "⚠️ LIVE FAILOVER TEST - This will temporarily stop services!"

        # Only test one non-critical service to avoid disruption: the first
        # service with recovery_priority > 1 (empty when there is none).
        test_service="{{ current_critical_services | selectattr('recovery_priority', 'gt', 1) | map(attribute='name') | list | first | default('') }}"

        if [ -n "$test_service" ]; then
          echo "Testing failover for: $test_service"
          # Implementation would go here for actual failover test
          failover_results+=("$test_service:live_test_completed")
        else
          echo "No suitable service found for live failover test"
          failover_results+=("no_service:live_test_skipped")
        fi
        {% endif %}

        echo "📊 FAILOVER TEST SUMMARY:"
        for result in "${failover_results[@]}"; do
          echo " - $result"
        done
      args:
        executable: /bin/bash
      register: failover_test
      changed_when: false
      when: dr_test_failover | bool

    # Estimates a recovery time per service from container count (30s each),
    # database dump size (10MB/s) and data directories over 1000MB (50MB/s),
    # then compares it with a target chosen by recovery_priority.
    - name: Test recovery time objectives (RTO)
      shell: |
        echo "⏱️ RECOVERY TIME OBJECTIVES TEST"
        echo "================================="

        rto_results=()

        {% for service in current_critical_services %}
        echo "📊 {{ service.name }} RTO Analysis:"

        # Estimate recovery times based on service complexity
        estimated_rto=0

        # Base time for container startup
        container_count={{ service.containers | length }}
        estimated_rto=$((estimated_rto + container_count * 30))  # 30s per container

        # Add time for database restore if applicable
        {% if service.backup_files | select('match', '.*sql.*') | list | length > 0 %}
        # Find backup size to estimate restore time
        latest_backup=$(find {{ backup_base_dir }}/{{ inventory_hostname }} -name "*{{ service.name }}*db*.sql*" -mtime -7 2>/dev/null | sort -t_ -k2 -nr | head -1)
        if [ -n "$latest_backup" ]; then
          backup_size_mb=$(du -m "$latest_backup" | cut -f1)
          restore_time=$((backup_size_mb / 10))  # Assume 10MB/s restore speed
          estimated_rto=$((estimated_rto + restore_time))
          echo " Database backup size: ${backup_size_mb}MB"
          echo " Estimated restore time: ${restore_time}s"
        fi
        {% endif %}

        # Add time for data volume restore
        {% for data_path in service.data_paths %}
        if [ -d "{{ data_path }}" ]; then
          data_size_mb=$(du -sm "{{ data_path }}" 2>/dev/null | cut -f1)
          # du may print nothing on failure; fall back to 0 so the numeric
          # comparison below stays valid.
          data_size_mb=${data_size_mb:-0}
          if [ "$data_size_mb" -gt 1000 ]; then  # Only count large data directories
            data_restore_time=$((data_size_mb / 50))  # Assume 50MB/s for file copy
            estimated_rto=$((estimated_rto + data_restore_time))
            echo " Data directory {{ data_path }}: ${data_size_mb}MB"
          fi
        fi
        {% endfor %}

        echo " Estimated RTO: ${estimated_rto}s ($(echo "scale=1; $estimated_rto/60" | bc 2>/dev/null || echo "N/A")m)"

        # Define RTO targets
        target_rto=0
        case {{ service.recovery_priority }} in
          1) target_rto=900 ;;   # 15 minutes for critical services
          2) target_rto=1800 ;;  # 30 minutes for important services
          *) target_rto=3600 ;;  # 1 hour for other services
        esac

        echo " Target RTO: ${target_rto}s ($(echo "scale=1; $target_rto/60" | bc 2>/dev/null || echo "N/A")m)"

        if [ "$estimated_rto" -le "$target_rto" ]; then
          echo " ✅ RTO within target"
          rto_results+=("{{ service.name }}:rto_ok:${estimated_rto}s")
        else
          echo " ⚠️ RTO exceeds target"
          rto_results+=("{{ service.name }}:rto_exceeded:${estimated_rto}s")
        fi

        echo ""
        {% endfor %}

        echo "📊 RTO ANALYSIS SUMMARY:"
        for result in "${rto_results[@]}"; do
          echo " - $result"
        done
      args:
        executable: /bin/bash
      register: rto_analysis
      changed_when: false

    # The *_out variables fall back to an empty string because a skipped task
    # still registers its variable, but without a "stdout" key.
    - name: Generate DR test report
      copy:
        content: |
          {% set backup_out = (backup_validation | default({})).stdout | default('') %}
          {% set restore_out = (restore_test | default({})).stdout | default('') %}
          {% set failover_out = (failover_test | default({})).stdout | default('') %}
          {% set rto_out = (rto_analysis | default({})).stdout | default('') %}
          🚨 DISASTER RECOVERY TEST REPORT - {{ inventory_hostname }}
          ========================================================

          📅 Test Date: {{ ansible_date_time.iso8601 }}
          🖥️ Host: {{ inventory_hostname }}
          🔍 Test Type: {{ dr_test_type }}
          🧪 Dry Run: {{ dr_dry_run }}

          🎯 CRITICAL SERVICES TESTED: {{ current_critical_services | length }}
          {% for service in current_critical_services %}
          - {{ service.name }} (Priority {{ service.recovery_priority }})
            Containers: {{ service.containers | join(', ') }}
            Data Paths: {{ service.data_paths | join(', ') }}
          {% endfor %}

          📊 PRE-TEST SYSTEM STATUS:
          {{ pre_test_snapshot.stdout }}

          {% if dr_validate_backups | bool %}
          💾 BACKUP VALIDATION:
          {{ backup_out }}
          {% endif %}

          {% if dr_test_type in ['full', 'restore'] %}
          🔄 RESTORE TESTING:
          {{ restore_out }}
          {% endif %}

          {% if dr_test_failover | bool %}
          🔄 FAILOVER TESTING:
          {{ failover_out }}
          {% endif %}

          ⏱️ RTO ANALYSIS:
          {{ rto_out }}

          💡 RECOMMENDATIONS:
          {% if 'BACKUP ISSUES DETECTED' in backup_out %}
          - 🚨 CRITICAL: Fix backup integrity issues immediately
          {% endif %}
          {% if 'restore_failed' in restore_out %}
          - 🚨 CRITICAL: Database restore failures need investigation
          {% endif %}
          {% if 'rto_exceeded' in rto_out %}
          - ⚠️ Optimize recovery procedures to meet RTO targets
          {% endif %}
          - 📅 Schedule regular DR tests (monthly recommended)
          - 📋 Update DR procedures based on test results
          - 🎓 Train team on DR procedures
          - 📊 Monitor backup success rates
          - 🔄 Test failover procedures in staging environment

          🎯 DR READINESS SCORE:
          {% set total_checks = 4 %}
          {% set passed_checks = 0 %}
          {% if 'BACKUP ISSUES DETECTED' not in backup_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
          {% if 'restore_failed' not in restore_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
          {% if 'rto_exceeded' not in rto_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
          {% set passed_checks = passed_checks + 1 %}{# Always pass system status #}
          Score: {{ passed_checks }}/{{ total_checks }} ({{ (passed_checks * 100 / total_checks) | round | int }}%)

          {% if passed_checks == total_checks %}
          ✅ EXCELLENT: DR procedures are ready
          {% elif passed_checks >= 3 %}
          🟡 GOOD: Minor improvements needed
          {% else %}
          🔴 NEEDS WORK: Significant DR issues detected
          {% endif %}

          ✅ DR TEST COMPLETE
        dest: "{{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_dr_test_report.txt"
        mode: '0644'

    - name: Display DR test summary
      debug:
        msg: |
          {% set backup_out = (backup_validation | default({})).stdout | default('') %}
          {% set restore_out = (restore_test | default({})).stdout | default('') %}
          {% set rto_out = (rto_analysis | default({})).stdout | default('') %}
          🚨 DISASTER RECOVERY TEST COMPLETE - {{ inventory_hostname }}
          ======================================================

          📅 Date: {{ ansible_date_time.date }}
          🔍 Test Type: {{ dr_test_type }}
          🧪 Mode: {{ 'Dry Run' if (dr_dry_run | bool) else 'Live Test' }}

          🎯 CRITICAL SERVICES: {{ current_critical_services | length }}

          📊 TEST RESULTS:
          {% if dr_validate_backups | bool %}
          - Backup Validation: {{ '✅ Passed' if 'BACKUP ISSUES DETECTED' not in backup_out else '❌ Issues Found' }}
          {% endif %}
          {% if dr_test_type in ['full', 'restore'] %}
          - Restore Testing: {{ '✅ Passed' if 'restore_failed' not in restore_out else '❌ Issues Found' }}
          {% endif %}
          - RTO Analysis: {{ '✅ Within Targets' if 'rto_exceeded' not in rto_out else '⚠️ Exceeds Targets' }}

          📄 Full report: {{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_dr_test_report.txt

          🔍 Next Steps:
          {% if dr_dry_run | bool %}
          - Run live test: -e "dry_run=false"
          {% endif %}
          - Address any identified issues
          - Update DR procedures
          - Schedule regular DR tests

          ======================================================

    # The stdout lookups default to '' so this condition also evaluates
    # cleanly when the validation or restore task was skipped.
    - name: Send DR test alerts (if issues found)
      debug:
        msg: |
          🚨 DR TEST ALERT - {{ inventory_hostname }}
          Critical issues found in disaster recovery test!
          Immediate attention required.
      when:
        - send_alerts | default(false) | bool
        - >-
          ('BACKUP ISSUES DETECTED' in (backup_validation.stdout | default('')))
          or ('restore_failed' in (restore_test.stdout | default('')))