---
# Disaster Recovery Orchestrator
# Full infrastructure backup and recovery procedures
# Run with: ansible-playbook -i hosts.ini playbooks/disaster_recovery_orchestrator.yml
#
# Play 1 (all hosts): inventories every host; on hosts in the 'synology'
#   inventory group it also backs up configuration files and containerised
#   databases, writes a per-host recovery plan, and runs a read-only
#   readiness check. A per-host report is written to /tmp on the controller.
# Play 2 (localhost): writes a master recovery plan to /tmp on the controller.
#
# NOTE: docker's Go-template placeholders such as {{.Names}} are wrapped in
# {% raw %} ... {% endraw %} throughout, because Ansible renders every task
# argument with Jinja2 and would otherwise abort with a template syntax error.

- name: Disaster Recovery Orchestrator
  hosts: all
  gather_facts: true
  vars:
    dr_backup_root: "/volume1/disaster-recovery"

    # True when this host is in the 'synology' inventory group. default([])
    # keeps the playbook usable with inventories that lack the group.
    dr_on_synology: "{{ inventory_hostname in (groups['synology'] | default([])) }}"

    # Container names grouped by the order in which they should be restored.
    recovery_priority_tiers:
      tier_1_critical:
        - "postgres"
        - "mariadb"
        - "authentik-server"
        - "nginx-proxy-manager"
        - "portainer"
      tier_2_infrastructure:
        - "prometheus"
        - "grafana"
        - "gitea"
        - "adguard"
        - "tailscale"
      tier_3_services:
        - "plex"
        - "immich-server"
        - "paperlessngx"
        - "vaultwarden"
      tier_4_optional:
        - "sonarr"
        - "radarr"
        - "jellyseerr"
        - "homarr"

    # NOTE(review): not referenced by any task in this playbook, so old
    # backups are never pruned here - confirm retention is enforced elsewhere.
    backup_retention:
      daily: 7
      weekly: 4
      monthly: 12

  tasks:
    - name: Create disaster recovery directory structure
      file:
        path: "{{ dr_backup_root }}/{{ item }}"
        state: directory
        mode: "0755"
      loop:
        - "configs"
        - "databases"
        - "volumes"
        - "system"
        - "recovery-plans"
        - "verification"
      when: dr_on_synology | bool
      become: true

    # Read-only: collects a human-readable snapshot that is embedded in the
    # per-host report further down.
    - name: Generate system inventory
      shell: |
        echo "=== System Inventory for {{ inventory_hostname }} ==="
        echo "Timestamp: $(date)"
        echo "Hostname: $(hostname)"
        echo "IP Address: {{ ansible_default_ipv4.address | default('unknown') }}"
        echo "OS: {{ ansible_facts['os_family'] }} {{ ansible_facts['distribution_version'] }}"
        echo ""

        echo "=== Hardware Information ==="
        echo "CPU: $(nproc) cores"
        echo "Memory: $(free -h | grep '^Mem:' | awk '{print $2}')"
        echo "Disk Usage:"
        df -h | grep -E '^/dev|^tmpfs' | head -10
        echo ""

        echo "=== Network Configuration ==="
        ip addr show | grep -E '^[0-9]+:|inet ' | head -20
        echo ""

        echo "=== Running Services ==="
        if command -v systemctl >/dev/null 2>&1; then
          systemctl list-units --type=service --state=running | head -20
        fi
        echo ""

        echo "=== Docker Containers ==="
        if command -v docker >/dev/null 2>&1; then
          docker ps --format {% raw %}"table {{.Names}}\t{{.Status}}\t{{.Image}}"{% endraw %} | head -20
        fi
      register: system_inventory
      changed_when: false

    # Best-effort: failures are reported in stdout but never fail the play.
    - name: Backup critical configurations
      shell: |
        # The archive contains SSH keys and .env files; keep it private.
        umask 077

        backup_date=$(date +%Y%m%d_%H%M%S)
        config_backup="{{ dr_backup_root }}/configs/{{ inventory_hostname }}_configs_${backup_date}.tar.gz"

        echo "Creating configuration backup: $config_backup"

        # Build a newline-separated list of paths that actually exist.
        # One path per line (fed to tar -T below) keeps paths containing
        # spaces intact.
        path_list=$(
          # System, service and root SSH configs
          for path in /etc/hosts /etc/hostname /etc/fstab /etc/crontab \
                      /etc/systemd/system /etc/nginx /etc/docker /root/.ssh; do
            if [ -e "$path" ]; then
              printf '%s\n' "$path"
            fi
          done

          # Docker compose files and their env files
          if [ -d /volume1/docker ]; then
            find /volume1/docker \( -name "docker-compose.yml" -o -name "*.env" \) 2>/dev/null
          fi

          # Per-user SSH configs. Test each match separately: a single
          # '[ -d /home/*/.ssh ]' errors out when the glob matches more than
          # one directory.
          for ssh_dir in /home/*/.ssh; do
            if [ -d "$ssh_dir" ]; then
              printf '%s\n' "$ssh_dir"
            fi
          done
        )

        # Create backup
        if [ -n "$path_list" ]; then
          printf '%s\n' "$path_list" | tar -czf "$config_backup" -T - 2>/dev/null || true

          if [ -f "$config_backup" ]; then
            size=$(du -h "$config_backup" | cut -f1)
            echo "✓ Configuration backup created: $size"
          else
            echo "✗ Configuration backup failed"
          fi
        else
          echo "No configuration paths found"
        fi
      register: config_backup
      when: dr_on_synology | bool
      become: true

    # Dumps every running container whose image is postgres, mariadb or mongo.
    # Best-effort: an empty or failed dump is reported in stdout only.
    - name: Backup databases with consistency checks
      shell: |
        # Dumps may contain credentials and personal data; keep them private.
        umask 077

        backup_date=$(date +%Y%m%d_%H%M%S)
        db_backup_dir="{{ dr_backup_root }}/databases/{{ inventory_hostname }}_${backup_date}"
        mkdir -p "$db_backup_dir"

        echo "=== Database Backup for {{ inventory_hostname }} ==="

        # PostgreSQL databases
        for container in $(docker ps --filter "ancestor=postgres" --format {% raw %}"{{.Names}}"{% endraw %} 2>/dev/null); do
          echo "Backing up PostgreSQL container: $container"

          # Create backup
          docker exec "$container" pg_dumpall -U postgres > "${db_backup_dir}/${container}_postgres.sql" 2>/dev/null

          # Verify backup is non-empty
          if [ -s "${db_backup_dir}/${container}_postgres.sql" ]; then
            lines=$(wc -l < "${db_backup_dir}/${container}_postgres.sql")
            size=$(du -h "${db_backup_dir}/${container}_postgres.sql" | cut -f1)
            echo "✓ $container: $lines lines, $size"

            # Connectivity check only - this does not restore the dump.
            if docker exec "$container" psql -U postgres -c "SELECT version();" >/dev/null 2>&1; then
              echo "✓ $container: Database connection verified"
            else
              echo "✗ $container: Database connection failed"
            fi
          else
            echo "✗ $container: Backup failed or empty"
          fi
        done

        # MariaDB/MySQL databases
        # NOTE(review): no password is passed to mysqldump; this presumably
        # only works when root can log in without one - confirm.
        for container in $(docker ps --filter "ancestor=mariadb" --format {% raw %}"{{.Names}}"{% endraw %} 2>/dev/null); do
          echo "Backing up MariaDB container: $container"

          docker exec "$container" mysqldump --all-databases -u root > "${db_backup_dir}/${container}_mariadb.sql" 2>/dev/null

          if [ -s "${db_backup_dir}/${container}_mariadb.sql" ]; then
            lines=$(wc -l < "${db_backup_dir}/${container}_mariadb.sql")
            size=$(du -h "${db_backup_dir}/${container}_mariadb.sql" | cut -f1)
            echo "✓ $container: $lines lines, $size"
          else
            echo "✗ $container: Backup failed or empty"
          fi
        done

        # MongoDB databases
        for container in $(docker ps --filter "ancestor=mongo" --format {% raw %}"{{.Names}}"{% endraw %} 2>/dev/null); do
          echo "Backing up MongoDB container: $container"

          docker exec "$container" mongodump --archive > "${db_backup_dir}/${container}_mongodb.archive" 2>/dev/null

          if [ -s "${db_backup_dir}/${container}_mongodb.archive" ]; then
            size=$(du -h "${db_backup_dir}/${container}_mongodb.archive" | cut -f1)
            echo "✓ $container: $size"
          else
            echo "✗ $container: Backup failed or empty"
          fi
        done

        echo "Database backup completed: $db_backup_dir"
      register: database_backup
      when: dr_on_synology | bool
      become: true

    - name: Create recovery plan document
      copy:
        content: |
          # Disaster Recovery Plan - {{ inventory_hostname }}
          Generated: {{ ansible_date_time.iso8601 }}

          ## System Information
          - Hostname: {{ inventory_hostname }}
          - IP Address: {{ ansible_default_ipv4.address | default('unknown') }}
          - OS: {{ ansible_facts['os_family'] }} {{ ansible_facts['distribution_version'] }}
          - Groups: {{ group_names | join(', ') }}

          ## Recovery Priority Order

          ### Tier 1 - Critical Infrastructure (Start First)
          {% for service in recovery_priority_tiers.tier_1_critical %}
          - {{ service }}
          {% endfor %}

          ### Tier 2 - Core Infrastructure
          {% for service in recovery_priority_tiers.tier_2_infrastructure %}
          - {{ service }}
          {% endfor %}

          ### Tier 3 - Applications
          {% for service in recovery_priority_tiers.tier_3_services %}
          - {{ service }}
          {% endfor %}

          ### Tier 4 - Optional Services
          {% for service in recovery_priority_tiers.tier_4_optional %}
          - {{ service }}
          {% endfor %}

          ## Recovery Procedures

          ### 1. System Recovery
          ```bash
          # Restore system configurations from the most recent archive
          latest_backup=$(ls -t {{ dr_backup_root }}/configs/{{ inventory_hostname }}_configs_*.tar.gz | head -1)
          tar -xzf "$latest_backup" -C /

          # Restart essential services
          systemctl restart docker
          systemctl restart tailscaled
          ```

          ### 2. Database Recovery
          ```bash
          # PostgreSQL restore example
          docker exec -i <postgres_container> psql -U postgres < backup.sql

          # MariaDB restore example
          docker exec -i <mariadb_container> mysql -u root < backup.sql

          # MongoDB restore example
          docker exec -i <mongo_container> mongorestore --archive < backup.archive
          ```

          ### 3. Container Recovery
          ```bash
          # Pull latest images
          docker-compose pull

          # Start containers in priority order
          docker-compose up -d
          # Wait for health checks, then continue with tier 2, etc.
          ```

          ## Verification Steps

          ### Health Checks
          - [ ] All critical containers running
          - [ ] Database connections working
          - [ ] Web interfaces accessible
          - [ ] Monitoring systems operational
          - [ ] Backup systems functional

          ### Network Connectivity
          - [ ] Tailscale mesh connected
          - [ ] DNS resolution working
          - [ ] External services accessible
          - [ ] Inter-container communication working

          ## Emergency Contacts & Resources

          ### Key Services URLs
          {% if inventory_hostname == 'atlantis' %}
          - Portainer: https://192.168.0.200:9443
          - Plex: http://{{ ansible_default_ipv4.address | default('unknown') }}:32400
          - Immich: http://{{ ansible_default_ipv4.address | default('unknown') }}:2283
          {% elif inventory_hostname == 'calypso' %}
          - Gitea: https://git.vish.gg
          - Authentik: https://auth.vish.gg
          - Paperless: http://{{ ansible_default_ipv4.address | default('unknown') }}:8000
          {% endif %}

          ### Documentation
          - Repository: https://git.vish.gg/Vish/homelab
          - Ansible Playbooks: /home/homelab/organized/repos/homelab/ansible/automation/
          - Monitoring: https://gf.vish.gg

          ## Backup Locations
          - Configurations: {{ dr_backup_root }}/configs/
          - Databases: {{ dr_backup_root }}/databases/
          - Docker Volumes: {{ dr_backup_root }}/volumes/
          - System State: {{ dr_backup_root }}/system/
        dest: "{{ dr_backup_root }}/recovery-plans/{{ inventory_hostname }}_recovery_plan.md"
        mode: "0644"
      when: dr_on_synology | bool
      become: true

    # Read-only: counts existing backups and checks which tiered containers
    # are currently running. Nothing is restored.
    - name: Test disaster recovery procedures (dry run)
      shell: |
        echo "=== Disaster Recovery Test - {{ inventory_hostname }} ==="
        echo "Timestamp: $(date)"
        echo ""

        echo "=== Backup Verification ==="

        # Check configuration backups
        config_backups=$(find "{{ dr_backup_root }}/configs" -name "{{ inventory_hostname }}_configs_*.tar.gz" 2>/dev/null | wc -l)
        echo "Configuration backups: $config_backups"

        # Check database backups
        db_backups=$(find "{{ dr_backup_root }}/databases" -name "{{ inventory_hostname }}_*" -type d 2>/dev/null | wc -l)
        echo "Database backup sets: $db_backups"

        echo ""
        echo "=== Recovery Readiness ==="

        # Check if Docker is available
        if command -v docker >/dev/null 2>&1; then
          echo "✓ Docker available"

          # Check if compose files exist
          compose_files=$(find /volume1/docker -name "docker-compose.yml" 2>/dev/null | wc -l)
          echo "✓ Docker Compose files: $compose_files"
        else
          echo "✗ Docker not available"
        fi

        # Check Tailscale
        if command -v tailscale >/dev/null 2>&1; then
          echo "✓ Tailscale available"
        else
          echo "✗ Tailscale not available"
        fi

        # Check network connectivity
        if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
          echo "✓ Internet connectivity"
        else
          echo "✗ No internet connectivity"
        fi

        echo ""
        echo "=== Critical Service Status ==="

        {% for tier_name, services in recovery_priority_tiers.items() %}
        echo "{{ tier_name | replace('_', ' ') | title }}:"
        {% for service in services %}
        if docker ps --filter "name={{ service }}" --format {% raw %}"{{.Names}}"{% endraw %} | grep -q "{{ service }}"; then
          echo "  ✓ {{ service }}"
        else
          echo "  ✗ {{ service }}"
        fi
        {% endfor %}
        echo ""
        {% endfor %}
      register: dr_test
      changed_when: false
      when: dr_on_synology | bool
      become: true

    # Written on the controller, one file per host.
    # A skipped task still registers a result (without 'stdout'), so
    # 'is defined' cannot tell "ran" from "skipped"; default() on the missing
    # stdout supplies the placeholder text instead.
    - name: Generate disaster recovery report
      copy:
        content: |
          # Disaster Recovery Report - {{ inventory_hostname }}
          Generated: {{ ansible_date_time.iso8601 }}

          ## System Inventory
          ```
          {{ system_inventory.stdout }}
          ```

          ## Configuration Backup
          ```
          {{ config_backup.stdout | default('Not performed on this host') }}
          ```

          ## Database Backup
          ```
          {{ database_backup.stdout | default('Not performed on this host') }}
          ```

          ## Recovery Readiness Test
          ```
          {{ dr_test.stdout | default('Not performed on this host') }}
          ```

          ## Recommendations

          {% if dr_on_synology | bool %}
          ### For {{ inventory_hostname }}:
          - ✅ Primary backup location configured
          - ✅ Recovery plan generated
          - 🔧 Schedule regular DR tests
          - 🔧 Verify off-site backup replication
          {% else %}
          ### For {{ inventory_hostname }}:
          - 🔧 Configure local backup procedures
          - 🔧 Ensure critical data is replicated to Synology hosts
          - 🔧 Document service-specific recovery steps
          {% endif %}

          ## Next Steps
          1. Review recovery plan: {{ dr_backup_root }}/recovery-plans/{{ inventory_hostname }}_recovery_plan.md
          2. Test recovery procedures in non-production environment
          3. Schedule regular backup verification
          4. Update recovery documentation as services change
        dest: "/tmp/disaster_recovery_{{ inventory_hostname }}_{{ ansible_date_time.epoch }}.md"
        mode: "0644"
      delegate_to: localhost

    # 'is not skipped' (rather than 'is defined') so hosts outside the
    # synology group are reported as skipped instead of complete.
    - name: Display disaster recovery summary
      debug:
        msg: |
          Disaster Recovery Summary for {{ inventory_hostname }}:
          - System Inventory: ✅ Complete
          - Configuration Backup: {{ '✅ Complete' if config_backup is not skipped else '⏭️ Skipped (not Synology)' }}
          - Database Backup: {{ '✅ Complete' if database_backup is not skipped else '⏭️ Skipped (not Synology)' }}
          - Recovery Plan: {{ '✅ Generated' if dr_on_synology | bool else '⏭️ Host-specific plan needed' }}
          - Report: /tmp/disaster_recovery_{{ inventory_hostname }}_{{ ansible_date_time.epoch }}.md

# Final consolidation task
- name: Generate Master Disaster Recovery Plan
  hosts: localhost
  # Facts are required here: the tasks below use ansible_date_time.epoch.
  gather_facts: true
  vars:
    # Play-level vars do not carry over from the previous play; keep this in
    # sync with the value defined there.
    dr_backup_root: "/volume1/disaster-recovery"

  tasks:
    # Renders the plan as Markdown on stdout; default([]) keeps the play
    # working when an inventory group does not exist.
    - name: Create master recovery plan
      shell: |
        echo "# Master Disaster Recovery Plan - Homelab Infrastructure"
        echo "Generated: $(date)"
        echo ""
        echo "## Infrastructure Overview"
        echo "- Total Hosts: {{ groups['all'] | length }}"
        echo "- Synology NAS: {{ groups['synology'] | default([]) | length }}"
        echo "- Debian Clients: {{ groups['debian_clients'] | default([]) | length }}"
        echo "- Hypervisors: {{ groups['hypervisors'] | default([]) | length }}"
        echo ""
        echo "## Recovery Order by Host"
        echo ""
        echo "### Phase 1: Core Infrastructure"
        {% for host in groups['synology'] | default([]) %}
        echo "1. **{{ host }}** - Primary storage and services"
        {% endfor %}
        echo ""
        echo "### Phase 2: Compute Nodes"
        {% for host in groups['debian_clients'] | default([]) %}
        echo "2. **{{ host }}** - Applications and services"
        {% endfor %}
        echo ""
        echo "### Phase 3: Specialized Systems"
        {% for host in groups['hypervisors'] | default([]) %}
        echo "3. **{{ host }}** - Virtualization and specialized services"
        {% endfor %}
        echo ""
        echo "## Critical Recovery Procedures"
        echo ""
        echo "### 1. Network Recovery"
        echo "- Restore Tailscale mesh connectivity"
        echo "- Verify DNS resolution (AdGuard Home)"
        echo "- Test inter-host communication"
        echo ""
        echo "### 2. Storage Recovery"
        echo "- Mount all required volumes"
        echo "- Verify RAID integrity on Synology systems"
        echo "- Test backup accessibility"
        echo ""
        echo "### 3. Service Recovery"
        echo "- Start Tier 1 services (databases, auth)"
        echo "- Start Tier 2 services (core infrastructure)"
        echo "- Start Tier 3 services (applications)"
        echo "- Start Tier 4 services (optional)"
        echo ""
        echo "## Verification Checklist"
        echo "- [ ] All hosts accessible via Tailscale"
        echo "- [ ] All critical containers running"
        echo "- [ ] Monitoring systems operational"
        echo "- [ ] Backup systems functional"
        echo "- [ ] User services accessible"
        echo ""
        echo "## Emergency Resources"
        echo "- Repository: https://git.vish.gg/Vish/homelab"
        echo "- Ansible Playbooks: /home/homelab/organized/repos/homelab/ansible/automation/"
        echo "- Individual Host Reports: /tmp/disaster_recovery_*.md"
      register: master_plan
      changed_when: false

    - name: Save master disaster recovery plan
      copy:
        content: "{{ master_plan.stdout }}"
        dest: "/tmp/master_disaster_recovery_plan_{{ ansible_date_time.epoch }}.md"
        mode: "0644"

    - name: Display final summary
      debug:
        msg: |
          🚨 Disaster Recovery Orchestration Complete!

          📋 Generated Reports:
          - Master Plan: /tmp/master_disaster_recovery_plan_{{ ansible_date_time.epoch }}.md
          - Individual Reports: /tmp/disaster_recovery_*.md
          - Recovery Plans: {{ dr_backup_root }}/recovery-plans/ (on Synology hosts)

          🔧 Next Steps:
          1. Review the master disaster recovery plan
          2. Test recovery procedures in a safe environment
          3. Schedule regular DR drills
          4. Keep recovery documentation updated