Sanitized mirror from private repository - 2026-03-11 06:48:12 UTC
Some checks failed
Documentation / Build Docusaurus (push) Failing after 8s
Documentation / Deploy to GitHub Pages (push) Has been skipped

This commit is contained in:
Gitea Mirror Bot
2026-03-11 06:48:12 +00:00
commit 7f27e8d941
1169 changed files with 299869 additions and 0 deletions

View File

@@ -0,0 +1,521 @@
---
# Disaster Recovery Test Playbook
# Test disaster recovery procedures and validate backup integrity
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml -e "test_type=full"
# Usage: ansible-playbook playbooks/disaster_recovery_test.yml -e "dry_run=true"
# Play: run the disaster-recovery checks against the targeted hosts.
# The tunables under `vars` are plain defaults. Override any of them with
# -e on the command line (extra vars outrank play vars), for example
# -e "test_type=full" or -e "dry_run=false".
- name: Disaster Recovery Test and Validation
  hosts: "{{ host_target | default('all') }}"
  gather_facts: true
  vars:
    # Keep these as literals. A play var that templates its own name
    # (test_type: "{{ test_type | default('basic') }}") fails with
    # "recursive loop detected" as soon as no -e override is supplied.
    test_type: "basic"  # basic, full, restore
    dry_run: true
    backup_base_dir: "/volume1/backups"
    test_restore_dir: "/tmp/dr_test"
    validate_backups: true
    test_failover: false
    # Critical services for DR testing, keyed by inventory hostname.
    # recovery_priority 1 is the most urgent tier.
    critical_services:
      atlantis:
        - name: "immich"
          containers:
            - "immich-server"
            - "immich-db"
            - "immich-redis"
          data_paths:
            - "/volume1/docker/immich"
          backup_files:
            - "immich-db_*.sql.gz"
          recovery_priority: 1
        - name: "vaultwarden"
          containers:
            - "vaultwarden"
            - "vaultwarden-db"
          data_paths:
            - "/volume1/docker/vaultwarden"
          backup_files:
            - "vaultwarden-db_*.sql.gz"
          recovery_priority: 1
        - name: "plex"
          containers:
            - "plex"
          data_paths:
            - "/volume1/docker/plex"
          backup_files:
            - "docker_configs_*.tar.gz"
          recovery_priority: 2
      calypso:
        - name: "authentik"
          containers:
            - "authentik-server"
            - "authentik-worker"
            - "authentik-db"
          data_paths:
            - "/volume1/docker/authentik"
          backup_files:
            - "authentik-db_*.sql.gz"
          recovery_priority: 1
      homelab_vm:
        - name: "monitoring"
          containers:
            - "grafana"
            - "prometheus"
          data_paths:
            - "/opt/docker/grafana"
            - "/opt/docker/prometheus"
          backup_files:
            - "docker_configs_*.tar.gz"
          recovery_priority: 2
  tasks:
# Per-run working directory, <test_restore_dir>/<YYYY-MM-DD>. Later tasks
# write the pre-test snapshot and the final report into it.
- name: Create DR test directory
  file:
    state: directory
    mode: "0755"
    path: "{{ test_restore_dir }}/{{ ansible_date_time.date }}"
# Pick this host's entry out of critical_services. Hosts without an entry
# get an empty list, so the per-service loops in later tasks render nothing.
- name: Get current critical services for this host
  set_fact:
    current_critical_services: >-
      {{ critical_services[inventory_hostname] | default([]) }}
# Print the effective settings and the services that will be exercised on
# this host before any check runs.
- name: Display DR test plan
  debug:
    msg: |
      🚨 DISASTER RECOVERY TEST PLAN
      ===============================
      🖥️ Host: {{ inventory_hostname }}
      📅 Date: {{ ansible_date_time.date }}
      🔍 Test Type: {{ test_type }}
      🧪 Dry Run: {{ dry_run }}
      💾 Validate Backups: {{ validate_backups }}
      🔄 Test Failover: {{ test_failover }}
      🎯 Critical Services: {{ current_critical_services | length }}
      {% for service in current_critical_services %}
      - {{ service.name }} (Priority {{ service.recovery_priority }})
      {% endfor %}
# Capture uptime, disk usage, running containers and the state of every
# critical container into <host>_pre_test_snapshot.txt, then echo the file so
# the text is available as pre_test_snapshot.stdout for the report.
#
# Docker's Go-template placeholders use the same double-brace syntax as
# Jinja, so they are wrapped in raw blocks; otherwise Ansible tries to render
# them and the task fails with a template error.
- name: Pre-DR test system snapshot
  shell: |
    snapshot_file="{{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_pre_test_snapshot.txt"
    {
      echo "🚨 DISASTER RECOVERY PRE-TEST SNAPSHOT"
      echo "======================================="
      echo "Host: {{ inventory_hostname }}"
      echo "Date: {{ ansible_date_time.iso8601 }}"
      echo "Test Type: {{ test_type }}"
      echo ""
      echo "=== SYSTEM STATUS ==="
      echo "Uptime: $(uptime)"
      echo "Disk Usage:"
      df -h
      echo ""
      echo "=== RUNNING CONTAINERS ==="
      docker ps --format "{% raw %}table {{.Names}}\t{{.Status}}\t{{.Image}}{% endraw %}" 2>/dev/null || echo "Docker not available"
      echo ""
      echo "=== CRITICAL SERVICES STATUS ==="
    {% for service in current_critical_services %}
      echo "--- {{ service.name }} ---"
    {% for container in service.containers %}
      # Substring match on the container name, as reported by docker ps.
      if docker ps --filter "name={{ container }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ container }}"; then
        echo "✅ {{ container }}: Running"
      else
        echo "❌ {{ container }}: Not running"
      fi
    {% endfor %}
      echo ""
    {% endfor %}
    } > "$snapshot_file"
    cat "$snapshot_file"
  register: pre_test_snapshot
  changed_when: false
# For every backup pattern of every critical service, look under
# <backup_base_dir>/<host> for files changed in the last 7 days (at most five
# per pattern), test each archive and print a summary. The report and alert
# tasks search this output for the marker "BACKUP ISSUES DETECTED".
#
# Changes from the earlier revision:
#   * runs under bash explicitly, because the script uses arrays and [[ ]],
#     which the default /bin/sh (dash on Debian/Ubuntu) rejects;
#   * a pattern with no matching file now counts as an issue; previously a
#     service with no backups at all passed validation silently;
#   * file names are read line by line and quoted, so paths containing spaces
#     are handled;
#   * the "< 1 day" label is derived from the file's real age.
- name: Validate backup availability and integrity
  shell: |
    echo "🔍 BACKUP VALIDATION"
    echo "===================="
    validation_results=()
    total_backups=0
    valid_backups=0
    missing_patterns=0
    backup_dir="{{ backup_base_dir }}/{{ inventory_hostname }}"

    # Print valid, corrupted or assumed_valid for the archive given as $1.
    check_integrity() {
      case "$1" in
        *.gz)
          if gzip -t "$1" 2>/dev/null; then echo "valid"; else echo "corrupted"; fi
          ;;
        *.tar*)
          if tar -tf "$1" >/dev/null 2>&1; then echo "valid"; else echo "corrupted"; fi
          ;;
        *)
          echo "assumed_valid"
          ;;
      esac
    }

    {% for service in current_critical_services %}
    echo "📦 Validating {{ service.name }} backups..."
    {% for backup_pattern in service.backup_files %}
    echo "  Checking pattern: {{ backup_pattern }}"
    # Find backup files matching pattern
    backup_files=$(find "$backup_dir" -name "{{ backup_pattern }}" -mtime -7 2>/dev/null | head -5)
    if [ -n "$backup_files" ]; then
      while IFS= read -r backup_file; do
        backup_name=$(basename "$backup_file")
        total_backups=$((total_backups + 1))
        echo "  Found: $backup_name"
        # Validate backup integrity
        status=$(check_integrity "$backup_file")
        case "$status" in
          valid)
            echo "    ✅ Integrity: Valid"
            valid_backups=$((valid_backups + 1))
            ;;
          corrupted)
            echo "    ❌ Integrity: Corrupted"
            ;;
          *)
            echo "    Integrity: Cannot validate format"
            valid_backups=$((valid_backups + 1))  # Assume valid
            ;;
        esac
        validation_results+=("{{ service.name }}:$backup_name:$status")
        # Check backup age (whole days since last modification)
        backup_days=$(( ($(date +%s) - $(stat -c %Y "$backup_file")) / 86400 ))
        if [ "$backup_days" -lt 1 ]; then
          echo "    ✅ Age: Recent (< 1 day)"
        else
          echo "    ⚠️ Age: $backup_days days old"
        fi
      done <<< "$backup_files"
    else
      echo "  ❌ No backups found for pattern: {{ backup_pattern }}"
      missing_patterns=$((missing_patterns + 1))
      validation_results+=("{{ service.name }}:{{ backup_pattern }}:not_found")
    fi
    {% endfor %}
    echo ""
    {% endfor %}
    # An issue is either a corrupted archive or a pattern with no backup.
    issue_count=$((total_backups - valid_backups + missing_patterns))
    echo "📊 BACKUP VALIDATION SUMMARY:"
    echo "Total backups checked: $total_backups"
    echo "Valid backups: $valid_backups"
    echo "Missing backup patterns: $missing_patterns"
    echo "Validation issues: $issue_count"
    if [ "$issue_count" -gt 0 ]; then
      echo "🚨 BACKUP ISSUES DETECTED!"
      for result in "${validation_results[@]}"; do
        if [[ "$result" == *":corrupted" ]] || [[ "$result" == *":not_found" ]]; then
          echo "  - $result"
        fi
      done
    fi
  args:
    executable: /bin/bash
  register: backup_validation
  changed_when: false
  when: validate_backups | bool
# For each service that has an SQL dump pattern, locate the newest dump from
# the last 7 days and either describe the restore (dry run) or load it into a
# throw-away PostgreSQL database inside the service's db container and drop
# it again. Runs only for test_type full or restore. The report and alert
# tasks search this output for the token "restore_failed".
#
# Changes from the earlier revision:
#   * runs under bash explicitly (arrays and [[ ]] are not POSIX sh);
#   * dry_run goes through the bool filter, so -e "dry_run=false" (a string)
#     really selects the live branch;
#   * the db container is chosen in Jinja; the previous shell "break" sat
#     outside any shell loop and only produced an error message;
#   * docker's Go-template placeholder is wrapped in a raw block so Jinja
#     does not try to render it;
#   * the gunzip pipeline runs with pipefail, so a corrupt archive is
#     reported as a failed restore.
# NOTE(review): psql is run without ON_ERROR_STOP, so SQL errors inside the
# dump do not fail the restore check. Confirm whether that is intended.
- name: Test database backup restore (dry run)
  shell: |
    echo "🔄 DATABASE RESTORE TEST"
    echo "========================"
    restore_results=()
    {% for service in current_critical_services %}
    {% if service.backup_files | select('match', '.*sql.*') | list | length > 0 %}
    echo "🗄️ Testing {{ service.name }} database restore..."
    # Find latest database backup. The sort keys on the text after the first
    # "_" of the path, so it relies on <name>_<timestamp> style file names.
    latest_backup=$(find "{{ backup_base_dir }}/{{ inventory_hostname }}" -name "*{{ service.name }}*db*.sql*" -mtime -7 2>/dev/null | sort -t_ -k2 -nr | head -1)
    if [ -n "$latest_backup" ]; then
      echo "  Using backup: $(basename "$latest_backup")"
    {% if dry_run | bool %}
      echo "  DRY RUN: Would restore database from $latest_backup"
      echo "  DRY RUN: Would create test database for validation"
      restore_results+=("{{ service.name }}:dry_run_success")
    {% else %}
    {% set db_container = service.containers | select('search', 'db') | first | default('') %}
      # Create test database and restore
      test_db_name="dr_test_{{ service.name }}_{{ ansible_date_time.epoch }}"
      # First container of the service whose name contains "db"
      db_container="{{ db_container }}"
      if [ -n "$db_container" ] && docker ps --filter "name=$db_container" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "$db_container"; then
        echo "  Creating test database: $test_db_name"
        # Create test database
        if docker exec "$db_container" createdb -U postgres "$test_db_name" 2>/dev/null; then
          echo "  ✅ Test database created"
          # Restore backup to test database
          if [[ "$latest_backup" == *.gz ]]; then
            # pipefail is scoped to a subshell so it only affects this pipeline
            ( set -o pipefail; gunzip -c "$latest_backup" | docker exec -i "$db_container" psql -U postgres -d "$test_db_name" >/dev/null 2>&1 )
            restore_rc=$?
          else
            docker exec -i "$db_container" psql -U postgres -d "$test_db_name" < "$latest_backup" >/dev/null 2>&1
            restore_rc=$?
          fi
          if [ "$restore_rc" -eq 0 ]; then
            echo "  ✅ Backup restored successfully"
            restore_results+=("{{ service.name }}:restore_success")
          else
            echo "  ❌ Backup restore failed"
            restore_results+=("{{ service.name }}:restore_failed")
          fi
          # Cleanup test database
          if docker exec "$db_container" dropdb -U postgres "$test_db_name" 2>/dev/null; then
            echo "  🧹 Test database cleaned up"
          else
            echo "  ⚠️ Could not drop test database $test_db_name"
          fi
        else
          echo "  ❌ Failed to create test database"
          restore_results+=("{{ service.name }}:test_db_failed")
        fi
      else
        echo "  ❌ Database container not found or not running"
        restore_results+=("{{ service.name }}:db_container_unavailable")
      fi
    {% endif %}
    else
      echo "  ❌ No database backup found"
      restore_results+=("{{ service.name }}:no_backup_found")
    fi
    echo ""
    {% endif %}
    {% endfor %}
    echo "📊 RESTORE TEST SUMMARY:"
    for result in "${restore_results[@]}"; do
      echo "  - $result"
    done
  args:
    executable: /bin/bash
  register: restore_test
  when: test_type in ['full', 'restore']
# Failover rehearsal, run only when test_failover is true.
# Dry run: print the five-step failover plan for every critical service.
# Live: select the first service with recovery_priority above 1. The live
# procedure itself is not implemented, so the result says so instead of
# claiming the test completed.
#
# Changes from the earlier revision:
#   * runs under bash explicitly (the script uses arrays);
#   * dry_run goes through the bool filter, so the string "false" passed
#     with -e is honoured;
#   * the service is selected in Jinja; the previous shell "break" sat
#     outside any shell loop, so the last matching service won.
- name: Test service failover procedures
  shell: |
    echo "🔄 SERVICE FAILOVER TEST"
    echo "========================"
    failover_results=()
    {% if dry_run | bool %}
    echo "DRY RUN: Failover test simulation"
    {% for service in current_critical_services %}
    echo "📋 {{ service.name }} failover plan:"
    echo "  1. Stop containers: {{ service.containers | join(', ') }}"
    echo "  2. Backup current data"
    echo "  3. Restore from backup"
    echo "  4. Start containers"
    echo "  5. Verify service functionality"
    failover_results+=("{{ service.name }}:dry_run_planned")
    echo ""
    {% endfor %}
    {% else %}
    {% set failover_candidate = current_critical_services | selectattr('recovery_priority', 'gt', 1) | map(attribute='name') | first | default('') %}
    echo "⚠️ LIVE FAILOVER TEST - This will temporarily stop services!"
    # Only test one non-critical service to avoid disruption
    test_service="{{ failover_candidate }}"
    if [ -n "$test_service" ]; then
      echo "Testing failover for: $test_service"
      # Implementation would go here for actual failover test
      echo "Live failover steps are not implemented; nothing was stopped or restored"
      failover_results+=("$test_service:live_test_not_implemented")
    else
      echo "No suitable service found for live failover test"
      failover_results+=("no_service:live_test_skipped")
    fi
    {% endif %}
    echo "📊 FAILOVER TEST SUMMARY:"
    for result in "${failover_results[@]}"; do
      echo "  - $result"
    done
  args:
    executable: /bin/bash
  register: failover_test
  changed_when: false
  when: test_failover | bool
# Estimate a recovery time per service and compare it with a target:
#   30 s per container
#   + newest SQL dump size / 10 MB/s (services with an SQL backup pattern)
#   + data directory size / 50 MB/s (directories larger than 1000 MB only)
# Targets are 900 s for priority 1, 1800 s for priority 2 and 3600 s for
# anything else. The report and summary tasks search this output for the
# token "rto_exceeded".
#
# Changes from the earlier revision:
#   * runs under bash explicitly (the script uses arrays);
#   * an unreadable data directory now really counts as 0 MB; the previous
#     "du | cut || echo 0" never fell back, because the pipeline status is
#     cut's, and left an empty operand for the numeric test.
- name: Test recovery time objectives (RTO)
  shell: |
    echo "⏱️ RECOVERY TIME OBJECTIVES TEST"
    echo "================================="
    rto_results=()
    {% for service in current_critical_services %}
    echo "📊 {{ service.name }} RTO Analysis:"
    # Base time for container startup: 30s per container
    container_count={{ service.containers | length }}
    estimated_rto=$((container_count * 30))
    {% if service.backup_files | select('match', '.*sql.*') | list | length > 0 %}
    # Add time for database restore, estimated from the newest dump's size
    latest_backup=$(find "{{ backup_base_dir }}/{{ inventory_hostname }}" -name "*{{ service.name }}*db*.sql*" -mtime -7 2>/dev/null | sort -t_ -k2 -nr | head -1)
    if [ -n "$latest_backup" ]; then
      backup_size_mb=$(du -m "$latest_backup" | cut -f1)
      restore_time=$((backup_size_mb / 10))  # Assume 10MB/s restore speed
      estimated_rto=$((estimated_rto + restore_time))
      echo "  Database backup size: ${backup_size_mb}MB"
      echo "  Estimated restore time: ${restore_time}s"
    fi
    {% endif %}
    # Add time for data volume restore
    {% for data_path in service.data_paths %}
    if [ -d "{{ data_path }}" ]; then
      data_size_mb=$(du -sm "{{ data_path }}" 2>/dev/null | cut -f1)
      data_size_mb=${data_size_mb:-0}
      if [ "$data_size_mb" -gt 1000 ]; then  # Only count large data directories
        data_restore_time=$((data_size_mb / 50))  # Assume 50MB/s for file copy
        estimated_rto=$((estimated_rto + data_restore_time))
        echo "  Data directory {{ data_path }}: ${data_size_mb}MB"
      fi
    fi
    {% endfor %}
    echo "  Estimated RTO: ${estimated_rto}s ($(echo "scale=1; $estimated_rto/60" | bc 2>/dev/null || echo "N/A")m)"
    # Define RTO targets
    case {{ service.recovery_priority }} in
      1) target_rto=900 ;;   # 15 minutes for critical services
      2) target_rto=1800 ;;  # 30 minutes for important services
      *) target_rto=3600 ;;  # 1 hour for other services
    esac
    echo "  Target RTO: ${target_rto}s ($(echo "scale=1; $target_rto/60" | bc 2>/dev/null || echo "N/A")m)"
    if [ "$estimated_rto" -le "$target_rto" ]; then
      echo "  ✅ RTO within target"
      rto_results+=("{{ service.name }}:rto_ok:${estimated_rto}s")
    else
      echo "  ⚠️ RTO exceeds target"
      rto_results+=("{{ service.name }}:rto_exceeded:${estimated_rto}s")
    fi
    echo ""
    {% endfor %}
    echo "📊 RTO ANALYSIS SUMMARY:"
    for result in "${rto_results[@]}"; do
      echo "  - $result"
    done
  args:
    executable: /bin/bash
  register: rto_analysis
  changed_when: false
# Write <host>_dr_test_report.txt into the per-run directory. The report
# combines the output of the earlier tasks, derives recommendations from
# marker strings in that output, and prints a 4-point readiness score.
#
# A task skipped by its `when` still registers its variable, but without a
# stdout key. Every stdout is therefore read through default(''), so the
# default run (test_type=basic, restore task skipped) no longer fails on an
# undefined attribute. Flags that may arrive as strings via -e go through
# the bool filter before being tested.
# A check that did not run still counts as passed in the score, as before.
- name: Generate DR test report
  copy:
    dest: "{{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_dr_test_report.txt"
    mode: "0644"
    content: |
      {% set backup_out = backup_validation.stdout | default('') %}
      {% set restore_out = restore_test.stdout | default('') %}
      {% set failover_out = failover_test.stdout | default('') %}
      {% set rto_out = rto_analysis.stdout | default('') %}
      🚨 DISASTER RECOVERY TEST REPORT - {{ inventory_hostname }}
      ========================================================
      📅 Test Date: {{ ansible_date_time.iso8601 }}
      🖥️ Host: {{ inventory_hostname }}
      🔍 Test Type: {{ test_type }}
      🧪 Dry Run: {{ dry_run }}
      🎯 CRITICAL SERVICES TESTED: {{ current_critical_services | length }}
      {% for service in current_critical_services %}
      - {{ service.name }} (Priority {{ service.recovery_priority }})
        Containers: {{ service.containers | join(', ') }}
        Data Paths: {{ service.data_paths | join(', ') }}
      {% endfor %}
      📊 PRE-TEST SYSTEM STATUS:
      {{ pre_test_snapshot.stdout | default('') }}
      {% if validate_backups | bool %}
      💾 BACKUP VALIDATION:
      {{ backup_out }}
      {% endif %}
      {% if test_type in ['full', 'restore'] %}
      🔄 RESTORE TESTING:
      {{ restore_out }}
      {% endif %}
      {% if test_failover | bool %}
      🔄 FAILOVER TESTING:
      {{ failover_out }}
      {% endif %}
      ⏱️ RTO ANALYSIS:
      {{ rto_out }}
      💡 RECOMMENDATIONS:
      {% if 'BACKUP ISSUES DETECTED' in backup_out %}
      - 🚨 CRITICAL: Fix backup integrity issues immediately
      {% endif %}
      {% if 'restore_failed' in restore_out %}
      - 🚨 CRITICAL: Database restore failures need investigation
      {% endif %}
      {% if 'rto_exceeded' in rto_out %}
      - ⚠️ Optimize recovery procedures to meet RTO targets
      {% endif %}
      - 📅 Schedule regular DR tests (monthly recommended)
      - 📋 Update DR procedures based on test results
      - 🎓 Train team on DR procedures
      - 📊 Monitor backup success rates
      - 🔄 Test failover procedures in staging environment
      🎯 DR READINESS SCORE:
      {% set total_checks = 4 %}
      {% set passed_checks = 1 %}{# system status always passes #}
      {% if 'BACKUP ISSUES DETECTED' not in backup_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
      {% if 'restore_failed' not in restore_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
      {% if 'rto_exceeded' not in rto_out %}{% set passed_checks = passed_checks + 1 %}{% endif %}
      Score: {{ passed_checks }}/{{ total_checks }} ({{ (passed_checks * 100 / total_checks) | round | int }}%)
      {% if passed_checks == total_checks %}
      ✅ EXCELLENT: DR procedures are ready
      {% elif passed_checks >= 3 %}
      🟡 GOOD: Minor improvements needed
      {% else %}
      🔴 NEEDS WORK: Significant DR issues detected
      {% endif %}
      ✅ DR TEST COMPLETE
# Console summary of the run: one pass/fail line per check that was enabled,
# the report location and suggested next steps.
#
# dry_run and validate_backups go through the bool filter because a value
# passed with -e arrives as a string, and the string "false" is truthy in
# Jinja. Registered outputs are read through default('') so a skipped task
# cannot raise an undefined-attribute error.
- name: Display DR test summary
  debug:
    msg: |
      {% set backup_out = backup_validation.stdout | default('') %}
      {% set restore_out = restore_test.stdout | default('') %}
      {% set rto_out = rto_analysis.stdout | default('') %}
      🚨 DISASTER RECOVERY TEST COMPLETE - {{ inventory_hostname }}
      ======================================================
      📅 Date: {{ ansible_date_time.date }}
      🔍 Test Type: {{ test_type }}
      🧪 Mode: {{ 'Dry Run' if (dry_run | bool) else 'Live Test' }}
      🎯 CRITICAL SERVICES: {{ current_critical_services | length }}
      📊 TEST RESULTS:
      {% if validate_backups | bool %}
      - Backup Validation: {{ '✅ Passed' if 'BACKUP ISSUES DETECTED' not in backup_out else '❌ Issues Found' }}
      {% endif %}
      {% if test_type in ['full', 'restore'] %}
      - Restore Testing: {{ '✅ Passed' if 'restore_failed' not in restore_out else '❌ Issues Found' }}
      {% endif %}
      - RTO Analysis: {{ '✅ Within Targets' if 'rto_exceeded' not in rto_out else '⚠️ Exceeds Targets' }}
      📄 Full report: {{ test_restore_dir }}/{{ ansible_date_time.date }}/{{ inventory_hostname }}_dr_test_report.txt
      🔍 Next Steps:
      {% if dry_run | bool %}
      - Run live test: -e "dry_run=false"
      {% endif %}
      - Address any identified issues
      - Update DR procedures
      - Schedule regular DR tests
      ======================================================
# Print an alert when alerts are enabled (send_alerts, off by default) and
# either the backup validation or the restore test reported a problem.
#
# Both registered outputs are read through default(''), because a task
# skipped by its own `when` registers a result without a stdout key. Without
# the default, enabling send_alerts on a basic run failed here on an
# undefined attribute instead of evaluating to false.
- name: Send DR test alerts (if issues found)
  debug:
    msg: |
      🚨 DR TEST ALERT - {{ inventory_hostname }}
      Critical issues found in disaster recovery test!
      Immediate attention required.
  when:
    - send_alerts | default(false) | bool
    - >-
      ('BACKUP ISSUES DETECTED' in (backup_validation.stdout | default('')))
      or ('restore_failed' in (restore_test.stdout | default('')))