Files
homelab-optimized/ansible/automation/playbooks/disaster_recovery_orchestrator.yml
Gitea Mirror Bot e03072e1ec
Some checks failed
Documentation / Deploy to GitHub Pages (push) Has been cancelled
Documentation / Build Docusaurus (push) Has been cancelled
Sanitized mirror from private repository - 2026-04-19 08:30:52 UTC
2026-04-19 08:30:52 +00:00

511 lines
18 KiB
YAML

---
# Disaster Recovery Orchestrator
# Full infrastructure backup and recovery procedures
# Run with: ansible-playbook -i hosts.ini playbooks/disaster_recovery_orchestrator.yml

# First play: targets every inventory host and gathers facts, because the
# tasks read ansible_default_ipv4, ansible_facts and ansible_date_time.
- name: Disaster Recovery Orchestrator
  hosts: all
  gather_facts: true
  vars:
    # Root directory under which all backup artifacts and recovery plans
    # are written.
    dr_backup_root: /volume1/disaster-recovery

    # Container names grouped by recovery priority. The recovery plan lists
    # them in this order and the readiness test checks each name against
    # the running containers.
    recovery_priority_tiers:
      tier_1_critical:
        - postgres
        - mariadb
        - authentik-server
        - nginx-proxy-manager
        - portainer
      tier_2_infrastructure:
        - prometheus
        - grafana
        - gitea
        - adguard
        - tailscale
      tier_3_services:
        - plex
        - immich-server
        - paperlessngx
        - vaultwarden
      tier_4_optional:
        - sonarr
        - radarr
        - jellyseerr
        - homarr

    # NOTE(review): no task visible in this file reads backup_retention, so
    # old backups are never pruned here -- confirm whether another playbook
    # consumes these values.
    backup_retention:
      daily: 7
      weekly: 4
      monthly: 12
  tasks:
# Creates one sub-directory per backup category under dr_backup_root.
# Runs only on hosts that belong to the "synology" inventory group.
- name: Create disaster recovery directory structure
  ansible.builtin.file:
    path: "{{ dr_backup_root }}/{{ item }}"
    state: directory
    mode: "0755"
  loop:
    - configs
    - databases
    - volumes
    - system
    - recovery-plans
    - verification
  # group_names is always defined for a host, whereas groups['synology']
  # raises an undefined-variable error when the inventory has no such group.
  when: "'synology' in group_names"
  become: true
# Collects a plain-text snapshot of the host (hardware, network, services,
# containers) and registers it as system_inventory for the report task.
#
# Docker's --format string is a Go template that uses the same double-brace
# delimiters as Jinja2. Ansible templates the whole script before running it,
# so the Go template is wrapped in a raw block; without it the task aborts
# with a template syntax error.
- name: Generate system inventory
  ansible.builtin.shell: |
    echo "=== System Inventory for {{ inventory_hostname }} ==="
    echo "Timestamp: $(date)"
    echo "Hostname: $(hostname)"
    echo "IP Address: {{ (ansible_default_ipv4 | default({})).address | default('unknown') }}"
    echo "OS: {{ ansible_facts['os_family'] }} {{ ansible_facts['distribution_version'] }}"
    echo ""
    echo "=== Hardware Information ==="
    echo "CPU: $(nproc) cores"
    echo "Memory: $(free -h | grep '^Mem:' | awk '{print $2}')"
    echo "Disk Usage:"
    df -h | grep -E '^/dev|^tmpfs' | head -10
    echo ""
    echo "=== Network Configuration ==="
    ip addr show | grep -E '^[0-9]+:|inet ' | head -20
    echo ""
    echo "=== Running Services ==="
    if command -v systemctl >/dev/null 2>&1; then
      systemctl list-units --type=service --state=running | head -20
    fi
    echo ""
    echo "=== Docker Containers ==="
    if command -v docker >/dev/null 2>&1; then
      docker ps --format "table {% raw %}{{.Names}}\t{{.Status}}\t{{.Image}}{% endraw %}" | head -20
    fi
  register: system_inventory
  # The script only reads state, so it must not report "changed".
  changed_when: false
# Archives system, Docker Compose and SSH configuration into a timestamped
# tarball under dr_backup_root/configs and registers the script output as
# config_backup. The task is best-effort: problems are reported in the
# output (and therefore in the generated report) rather than failing the play.
- name: Backup critical configurations
  ansible.builtin.shell: |
    # The archive contains SSH private keys and .env secrets, so it must not
    # be readable by other users.
    umask 077
    backup_date=$(date +%Y%m%d_%H%M%S)
    config_backup="{{ dr_backup_root }}/configs/{{ inventory_hostname }}_configs_${backup_date}.tar.gz"
    echo "Creating configuration backup: $config_backup"

    # Build the list of critical config paths, keeping only those that exist
    # so tar is not handed members it cannot find.
    config_paths=""
    for path in /etc/hosts /etc/hostname /etc/fstab /etc/crontab \
                /etc/systemd/system /etc/nginx /etc/docker /root/.ssh; do
      if [ -e "$path" ]; then
        config_paths="$config_paths $path"
      fi
    done

    # Docker compose files (read straight from find; no shared /tmp file)
    if [ -d /volume1/docker ]; then
      docker_configs=$(find /volume1/docker -name "docker-compose.yml" -o -name "*.env" | tr '\n' ' ')
      config_paths="$config_paths $docker_configs"
    fi

    # Per-user SSH configs: test each match separately, because a single
    # "[ -d /home/*/.ssh ]" breaks as soon as the glob matches twice.
    for ssh_dir in /home/*/.ssh; do
      if [ -d "$ssh_dir" ]; then
        config_paths="$config_paths $ssh_dir"
      fi
    done

    # Create backup
    if [ -n "$config_paths" ]; then
      # $config_paths is deliberately unquoted so it splits into separate
      # arguments; paths containing whitespace are not supported.
      if tar -czf "$config_backup" $config_paths 2>/dev/null; then
        tar_rc=0
      else
        tar_rc=$?
      fi
      if [ -f "$config_backup" ]; then
        size=$(du -h "$config_backup" | cut -f1)
        echo "✓ Configuration backup created: $size"
        if [ "$tar_rc" -ne 0 ]; then
          echo "⚠ tar exited with status $tar_rc; archive may be incomplete"
        fi
      else
        echo "✗ Configuration backup failed"
      fi
    else
      echo "No configuration paths found"
    fi
  register: config_backup
  # group_names is always defined; groups['synology'] errors out when the
  # inventory has no such group.
  when: "'synology' in group_names"
  become: true
# Dumps every running PostgreSQL, MariaDB and MongoDB container into a
# timestamped directory under dr_backup_root/databases and registers the
# script output as database_backup. Failures are reported per container in
# the output; the task itself stays best-effort.
#
# Docker's --format strings are Go templates that use the same double-brace
# delimiters as Jinja2, so each one is wrapped in a raw block; otherwise
# Ansible fails to template the script before it ever runs.
- name: Backup databases with consistency checks
  ansible.builtin.shell: |
    # Dumps contain credentials and user data: keep them root-only.
    umask 077
    backup_date=$(date +%Y%m%d_%H%M%S)
    db_backup_dir="{{ dr_backup_root }}/databases/{{ inventory_hostname }}_${backup_date}"
    mkdir -p "$db_backup_dir"
    echo "=== Database Backup for {{ inventory_hostname }} ==="

    # Print the names of running containers that use image $1.
    # "ancestor=<image>" without a tag means <image>:latest, so containers
    # pinned to another tag are missed; the second query adds every
    # container whose image basename (registry/namespace and tag stripped)
    # equals $1.
    containers_for_image() {
      {
        docker ps --filter "ancestor=$1" --format '{% raw %}{{.Names}}{% endraw %}' 2>/dev/null
        docker ps --format '{% raw %}{{.Names}} {{.Image}}{% endraw %}' 2>/dev/null |
          awk -v img="$1" '{ n = $2; sub(/^.*\//, "", n); sub(/[:@].*$/, "", n); if (n == img) print $1 }'
      } | sort -u
    }

    # PostgreSQL databases
    for container in $(containers_for_image postgres); do
      echo "Backing up PostgreSQL container: $container"
      dump_file="${db_backup_dir}/${container}_postgres.sql"
      # Use the superuser configured through POSTGRES_USER when the container
      # sets one; fall back to "postgres" otherwise. The exit status is
      # checked so a partially written dump is not reported as a success.
      if docker exec "$container" sh -c 'pg_dumpall -U "${POSTGRES_USER:-postgres}"' > "$dump_file" 2>/dev/null \
          && [ -s "$dump_file" ]; then
        lines=$(wc -l < "$dump_file")
        size=$(du -h "$dump_file" | cut -f1)
        echo "✓ $container: $lines lines, $size"
        # Connectivity check only; this does not test-restore the dump.
        if docker exec "$container" sh -c 'psql -U "${POSTGRES_USER:-postgres}" -d postgres -c "SELECT version();"' >/dev/null 2>&1; then
          echo "✓ $container: Database connection verified"
        else
          echo "✗ $container: Database connection failed"
        fi
      else
        echo "✗ $container: Backup failed or empty"
      fi
    done

    # MariaDB/MySQL databases
    # NOTE: relies on root being able to connect without a password from
    # inside the container.
    for container in $(containers_for_image mariadb); do
      echo "Backing up MariaDB container: $container"
      dump_file="${db_backup_dir}/${container}_mariadb.sql"
      if docker exec "$container" mysqldump --all-databases -u root > "$dump_file" 2>/dev/null \
          && [ -s "$dump_file" ]; then
        lines=$(wc -l < "$dump_file")
        size=$(du -h "$dump_file" | cut -f1)
        echo "✓ $container: $lines lines, $size"
      else
        echo "✗ $container: Backup failed or empty"
      fi
    done

    # MongoDB databases
    for container in $(containers_for_image mongo); do
      echo "Backing up MongoDB container: $container"
      dump_file="${db_backup_dir}/${container}_mongodb.archive"
      if docker exec "$container" mongodump --archive > "$dump_file" 2>/dev/null \
          && [ -s "$dump_file" ]; then
        size=$(du -h "$dump_file" | cut -f1)
        echo "✓ $container: $size"
      else
        echo "✗ $container: Backup failed or empty"
      fi
    done

    echo "Database backup completed: $db_backup_dir"
  register: database_backup
  # group_names is always defined; groups['synology'] errors out when the
  # inventory has no such group.
  when: "'synology' in group_names"
  become: true
# Renders a per-host Markdown recovery plan (priority tiers, restore
# commands, verification checklist, service URLs) into
# dr_backup_root/recovery-plans on Synology hosts.
#
# The "System Recovery" snippet selects the newest config archive first:
# every run of the backup task adds another timestamped archive, and passing
# a multi-match glob straight to "tar -xzf" would treat the extra matches as
# member names instead of archives.
- name: Create recovery plan document
  ansible.builtin.copy:
    content: |
      # Disaster Recovery Plan - {{ inventory_hostname }}
      Generated: {{ ansible_date_time.iso8601 }}
      ## System Information
      - Hostname: {{ inventory_hostname }}
      - IP Address: {{ ansible_default_ipv4.address }}
      - OS: {{ ansible_facts['os_family'] }} {{ ansible_facts['distribution_version'] }}
      - Groups: {{ group_names | join(', ') }}
      ## Recovery Priority Order
      ### Tier 1 - Critical Infrastructure (Start First)
      {% for service in recovery_priority_tiers.tier_1_critical %}
      - {{ service }}
      {% endfor %}
      ### Tier 2 - Core Infrastructure
      {% for service in recovery_priority_tiers.tier_2_infrastructure %}
      - {{ service }}
      {% endfor %}
      ### Tier 3 - Applications
      {% for service in recovery_priority_tiers.tier_3_services %}
      - {{ service }}
      {% endfor %}
      ### Tier 4 - Optional Services
      {% for service in recovery_priority_tiers.tier_4_optional %}
      - {{ service }}
      {% endfor %}
      ## Recovery Procedures
      ### 1. System Recovery
      ```bash
      # Restore system configurations (newest archive for this host)
      latest_config=$(ls -t {{ dr_backup_root }}/configs/{{ inventory_hostname }}_configs_*.tar.gz | head -n 1)
      tar -xzf "$latest_config" -C /
      # Restart essential services
      systemctl restart docker
      systemctl restart tailscaled
      ```
      ### 2. Database Recovery
      ```bash
      # PostgreSQL restore example
      docker exec -i <postgres_container> psql -U postgres < backup.sql
      # MariaDB restore example
      docker exec -i <mariadb_container> mysql -u root < backup.sql
      # MongoDB restore example
      docker exec -i <mongo_container> mongorestore --archive < backup.archive
      ```
      ### 3. Container Recovery
      ```bash
      # Pull latest images
      docker-compose pull
      # Start containers in priority order
      docker-compose up -d <tier_1_services>
      # Wait for health checks, then continue with tier 2, etc.
      ```
      ## Verification Steps
      ### Health Checks
      - [ ] All critical containers running
      - [ ] Database connections working
      - [ ] Web interfaces accessible
      - [ ] Monitoring systems operational
      - [ ] Backup systems functional
      ### Network Connectivity
      - [ ] Tailscale mesh connected
      - [ ] DNS resolution working
      - [ ] External services accessible
      - [ ] Inter-container communication working
      ## Emergency Contacts & Resources
      ### Key Services URLs
      {% if inventory_hostname == 'atlantis' %}
      - Portainer: https://192.168.0.200:9443
      - Plex: http://{{ ansible_default_ipv4.address }}:32400
      - Immich: http://{{ ansible_default_ipv4.address }}:2283
      {% elif inventory_hostname == 'calypso' %}
      - Gitea: https://git.vish.gg
      - Authentik: https://auth.vish.gg
      - Paperless: http://{{ ansible_default_ipv4.address }}:8000
      {% endif %}
      ### Documentation
      - Repository: https://git.vish.gg/Vish/homelab
      - Ansible Playbooks: /home/homelab/organized/repos/homelab/ansible/automation/
      - Monitoring: https://gf.vish.gg
      ## Backup Locations
      - Configurations: {{ dr_backup_root }}/configs/
      - Databases: {{ dr_backup_root }}/databases/
      - Docker Volumes: {{ dr_backup_root }}/volumes/
      - System State: {{ dr_backup_root }}/system/
    dest: "{{ dr_backup_root }}/recovery-plans/{{ inventory_hostname }}_recovery_plan.md"
    # Explicit mode so the result does not depend on the remote umask.
    mode: "0644"
  # group_names is always defined; groups['synology'] errors out when the
  # inventory has no such group.
  when: "'synology' in group_names"
  become: true
# Read-only readiness check: counts existing backups, probes for Docker,
# Tailscale and internet connectivity, and reports which containers from
# recovery_priority_tiers are currently running. Output is registered as
# dr_test for the report task.
#
# Docker's --format string is a Go template that uses the same double-brace
# delimiters as Jinja2, so it is wrapped in a raw block; otherwise Ansible
# fails to template the script.
- name: Test disaster recovery procedures (dry run)
  ansible.builtin.shell: |
    echo "=== Disaster Recovery Test - {{ inventory_hostname }} ==="
    echo "Timestamp: $(date)"
    echo ""
    echo "=== Backup Verification ==="
    # Check configuration backups
    config_backups=$(find "{{ dr_backup_root }}/configs" -name "{{ inventory_hostname }}_configs_*.tar.gz" 2>/dev/null | wc -l)
    echo "Configuration backups: $config_backups"
    # Check database backups
    db_backups=$(find "{{ dr_backup_root }}/databases" -name "{{ inventory_hostname }}_*" -type d 2>/dev/null | wc -l)
    echo "Database backup sets: $db_backups"
    echo ""
    echo "=== Recovery Readiness ==="
    # Check if Docker is available
    if command -v docker >/dev/null 2>&1; then
      echo "✓ Docker available"
      # Check if compose files exist
      compose_files=$(find /volume1/docker -name "docker-compose.yml" 2>/dev/null | wc -l)
      echo "✓ Docker Compose files: $compose_files"
    else
      echo "✗ Docker not available"
    fi
    # Check Tailscale
    if command -v tailscale >/dev/null 2>&1; then
      echo "✓ Tailscale available"
    else
      echo "✗ Tailscale not available"
    fi
    # Check network connectivity
    if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
      echo "✓ Internet connectivity"
    else
      echo "✗ No internet connectivity"
    fi
    echo ""
    echo "=== Critical Service Status ==="
    {% for tier_name, services in recovery_priority_tiers.items() %}
    echo "{{ tier_name | replace('_', ' ') | title }}:"
    {% for service in services %}
    if docker ps --filter "name={{ service }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ service }}"; then
      echo "  ✓ {{ service }}"
    else
      echo "  ✗ {{ service }}"
    fi
    {% endfor %}
    echo ""
    {% endfor %}
  register: dr_test
  # Nothing is modified, so never report "changed".
  changed_when: false
  # group_names is always defined; groups['synology'] errors out when the
  # inventory has no such group.
  when: "'synology' in group_names"
  become: true
# Writes a per-host Markdown report to /tmp on the control node
# (delegate_to: localhost), combining the registered output of the earlier
# inventory, backup and readiness tasks.
#
# A registered variable is still defined when its task was skipped; it just
# has no "stdout" key. Testing "is defined" therefore never selects the
# fallback and the template fails on hosts where the backup tasks did not
# run. Applying default() to the stdout attribute handles both cases.
- name: Generate disaster recovery report
  ansible.builtin.copy:
    content: |
      # Disaster Recovery Report - {{ inventory_hostname }}
      Generated: {{ ansible_date_time.iso8601 }}
      ## System Inventory
      ```
      {{ system_inventory.stdout }}
      ```
      ## Configuration Backup
      ```
      {{ config_backup.stdout | default('Not performed on this host') }}
      ```
      ## Database Backup
      ```
      {{ database_backup.stdout | default('Not performed on this host') }}
      ```
      ## Recovery Readiness Test
      ```
      {{ dr_test.stdout | default('Not performed on this host') }}
      ```
      ## Recommendations
      {% if 'synology' in group_names %}
      ### For {{ inventory_hostname }}:
      - ✅ Primary backup location configured
      - ✅ Recovery plan generated
      - 🔧 Schedule regular DR tests
      - 🔧 Verify off-site backup replication
      {% else %}
      ### For {{ inventory_hostname }}:
      - 🔧 Configure local backup procedures
      - 🔧 Ensure critical data is replicated to Synology hosts
      - 🔧 Document service-specific recovery steps
      {% endif %}
      ## Next Steps
      1. Review recovery plan: {{ dr_backup_root }}/recovery-plans/{{ inventory_hostname }}_recovery_plan.md
      2. Test recovery procedures in non-production environment
      3. Schedule regular backup verification
      4. Update recovery documentation as services change
    dest: "/tmp/disaster_recovery_{{ inventory_hostname }}_{{ ansible_date_time.epoch }}.md"
    # Explicit mode so the result does not depend on the local umask.
    mode: "0644"
  delegate_to: localhost
# Prints a per-host status summary.
#
# A registered variable is defined even when its task was skipped, so an
# "is defined" test reports "Complete" on every host. The "skipped" test
# reflects whether the backup task actually ran.
- name: Display disaster recovery summary
  ansible.builtin.debug:
    msg: |
      Disaster Recovery Summary for {{ inventory_hostname }}:
      - System Inventory: ✅ Complete
      - Configuration Backup: {{ '✅ Complete' if (config_backup is defined and config_backup is not skipped) else '⏭️ Skipped (not Synology)' }}
      - Database Backup: {{ '✅ Complete' if (database_backup is defined and database_backup is not skipped) else '⏭️ Skipped (not Synology)' }}
      - Recovery Plan: {{ '✅ Generated' if 'synology' in group_names else '⏭️ Host-specific plan needed' }}
      - Report: /tmp/disaster_recovery_{{ inventory_hostname }}_{{ ansible_date_time.epoch }}.md
# Final consolidation play: runs once on the control node and assembles the
# master recovery plan from inventory group membership.
- name: Generate Master Disaster Recovery Plan
  hosts: localhost
  # Facts are required: tasks in this play build file names from
  # ansible_date_time.epoch, which is undefined without fact gathering.
  gather_facts: true
  vars:
    # Play-level vars do not carry over from the previous play, and the final
    # summary in this play references dr_backup_root. Keep this value in sync
    # with the definition in the first play.
    dr_backup_root: /volume1/disaster-recovery
  tasks:
# Emits the master recovery plan as Markdown on stdout and registers it as
# master_plan; a later task writes that text to disk.
#
# Group lookups go through default([]) so an inventory that lacks one of the
# groups produces a count of 0 and an empty phase instead of an
# undefined-variable failure.
- name: Create master recovery plan
  ansible.builtin.shell: |
    echo "# Master Disaster Recovery Plan - Homelab Infrastructure"
    echo "Generated: $(date)"
    echo ""
    echo "## Infrastructure Overview"
    echo "- Total Hosts: {{ groups['all'] | length }}"
    echo "- Synology NAS: {{ groups['synology'] | default([]) | length }}"
    echo "- Debian Clients: {{ groups['debian_clients'] | default([]) | length }}"
    echo "- Hypervisors: {{ groups['hypervisors'] | default([]) | length }}"
    echo ""
    echo "## Recovery Order by Host"
    echo ""
    echo "### Phase 1: Core Infrastructure"
    {% for host in groups['synology'] | default([]) %}
    echo "1. **{{ host }}** - Primary storage and services"
    {% endfor %}
    echo ""
    echo "### Phase 2: Compute Nodes"
    {% for host in groups['debian_clients'] | default([]) %}
    echo "2. **{{ host }}** - Applications and services"
    {% endfor %}
    echo ""
    echo "### Phase 3: Specialized Systems"
    {% for host in groups['hypervisors'] | default([]) %}
    echo "3. **{{ host }}** - Virtualization and specialized services"
    {% endfor %}
    echo ""
    echo "## Critical Recovery Procedures"
    echo ""
    echo "### 1. Network Recovery"
    echo "- Restore Tailscale mesh connectivity"
    echo "- Verify DNS resolution (AdGuard Home)"
    echo "- Test inter-host communication"
    echo ""
    echo "### 2. Storage Recovery"
    echo "- Mount all required volumes"
    echo "- Verify RAID integrity on Synology systems"
    echo "- Test backup accessibility"
    echo ""
    echo "### 3. Service Recovery"
    echo "- Start Tier 1 services (databases, auth)"
    echo "- Start Tier 2 services (core infrastructure)"
    echo "- Start Tier 3 services (applications)"
    echo "- Start Tier 4 services (optional)"
    echo ""
    echo "## Verification Checklist"
    echo "- [ ] All hosts accessible via Tailscale"
    echo "- [ ] All critical containers running"
    echo "- [ ] Monitoring systems operational"
    echo "- [ ] Backup systems functional"
    echo "- [ ] User services accessible"
    echo ""
    echo "## Emergency Resources"
    echo "- Repository: https://git.vish.gg/Vish/homelab"
    echo "- Ansible Playbooks: /home/homelab/organized/repos/homelab/ansible/automation/"
    echo "- Individual Host Reports: /tmp/disaster_recovery_*.md"
  register: master_plan
  # The script only prints text, so it must not report "changed".
  changed_when: false
# Writes the registered stdout of the master-plan task to a file under /tmp
# whose name carries the ansible_date_time.epoch fact.
- name: Save master disaster recovery plan
  ansible.builtin.copy:
    dest: '/tmp/master_disaster_recovery_plan_{{ ansible_date_time.epoch }}.md'
    content: '{{ master_plan.stdout }}'
# Closing message: lists where the generated reports and plans were written
# and the recommended follow-up actions. Reads ansible_date_time.epoch and
# dr_backup_root, so both must be defined when this task runs.
- name: Display final summary
  ansible.builtin.debug:
    msg: |
      🚨 Disaster Recovery Orchestration Complete!
      📋 Generated Reports:
      - Master Plan: /tmp/master_disaster_recovery_plan_{{ ansible_date_time.epoch }}.md
      - Individual Reports: /tmp/disaster_recovery_*.md
      - Recovery Plans: {{ dr_backup_root }}/recovery-plans/ (on Synology hosts)
      🔧 Next Steps:
      1. Review the master disaster recovery plan
      2. Test recovery procedures in a safe environment
      3. Schedule regular DR drills
      4. Keep recovery documentation updated