---
# Disaster Recovery Orchestrator
# Full infrastructure backup and recovery procedures
# Run with: ansible-playbook -i hosts.ini playbooks/disaster_recovery_orchestrator.yml

- name: Disaster Recovery Orchestrator
  hosts: all
  gather_facts: true
  vars:
    # Root directory under which every backup artifact of this play is written
    # (only used on hosts in the 'synology' group).
    dr_backup_root: "/volume1/disaster-recovery"

    # Container names grouped by restart priority. Consumed by the recovery
    # plan document and by the per-service status check of the readiness test.
    recovery_priority_tiers:
      tier_1_critical:
        - "postgres"
        - "mariadb"
        - "authentik-server"
        - "nginx-proxy-manager"
        - "portainer"
      tier_2_infrastructure:
        - "prometheus"
        - "grafana"
        - "gitea"
        - "adguard"
        - "tailscale"
      tier_3_services:
        - "plex"
        - "immich-server"
        - "paperlessngx"
        - "vaultwarden"
      tier_4_optional:
        - "sonarr"
        - "radarr"
        - "jellyseerr"
        - "homarr"

    # NOTE(review): no task in this play references backup_retention, so old
    # backups are not pruned here - confirm whether another job consumes it.
    backup_retention:
      daily: 7
      weekly: 4
      monthly: 12

  tasks:
    - name: Create disaster recovery directory structure
      file:
        path: "{{ dr_backup_root }}/{{ item }}"
        state: directory
        mode: '0755'
      loop:
        - "configs"
        - "databases"
        - "volumes"
        - "system"
        - "recovery-plans"
        - "verification"
      when: inventory_hostname in groups['synology']
      become: true

    # Read-only snapshot of host state; its stdout is embedded in the report.
    - name: Generate system inventory
      shell: |
        echo "=== System Inventory for {{ inventory_hostname }} ==="
        echo "Timestamp: $(date)"
        echo "Hostname: $(hostname)"
        echo "IP Address: {{ ansible_default_ipv4.address }}"
        echo "OS: {{ ansible_facts['os_family'] }} {{ ansible_facts['distribution_version'] }}"
        echo ""

        echo "=== Hardware Information ==="
        echo "CPU: $(nproc) cores"
        echo "Memory: $(free -h | grep '^Mem:' | awk '{print $2}')"
        echo "Disk Usage:"
        df -h | grep -E '^/dev|^tmpfs' | head -10
        echo ""

        echo "=== Network Configuration ==="
        ip addr show | grep -E '^[0-9]+:|inet ' | head -20
        echo ""

        echo "=== Running Services ==="
        if command -v systemctl >/dev/null 2>&1; then
          systemctl list-units --type=service --state=running | head -20
        fi
        echo ""

        echo "=== Docker Containers ==="
        if command -v docker >/dev/null 2>&1; then
          # The Go-template braces must be wrapped in raw, otherwise Ansible
          # tries to render them as Jinja2 and the task fails to template.
          docker ps --format "table {% raw %}{{.Names}}\t{{.Status}}\t{{.Image}}{% endraw %}" | head -20
        fi
      register: system_inventory
      changed_when: false

    - name: Backup critical configurations
      shell: |
        # The archive contains SSH keys and .env files: keep it owner-only.
        umask 077

        backup_date=$(date +%Y%m%d_%H%M%S)
        config_backup="{{ dr_backup_root }}/configs/{{ inventory_hostname }}_configs_${backup_date}.tar.gz"

        echo "Creating configuration backup: $config_backup"

        # Create list of critical config paths
        config_paths=""

        # System configs
        [ -d /etc ] && config_paths="$config_paths /etc/hosts /etc/hostname /etc/fstab /etc/crontab"
        [ -d /etc/systemd ] && config_paths="$config_paths /etc/systemd/system"
        [ -d /etc/nginx ] && config_paths="$config_paths /etc/nginx"
        [ -d /etc/docker ] && config_paths="$config_paths /etc/docker"

        # Docker compose files (collected directly, without a fixed-name
        # scratch file in /tmp)
        if [ -d /volume1/docker ]; then
          config_paths="$config_paths $(find /volume1/docker \( -name "docker-compose.yml" -o -name "*.env" \) | tr '\n' ' ')"
        fi

        # SSH configs
        [ -d /root/.ssh ] && config_paths="$config_paths /root/.ssh"
        # Test each match on its own: '[ -d /home/*/.ssh ]' breaks with
        # "too many arguments" as soon as more than one home has a .ssh dir.
        for ssh_dir in /home/*/.ssh; do
          [ -d "$ssh_dir" ] && config_paths="$config_paths $ssh_dir"
        done

        # Create backup (best effort: missing paths must not abort the run)
        if [ -n "$config_paths" ]; then
          tar -czf "$config_backup" $config_paths 2>/dev/null || true
          if [ -f "$config_backup" ]; then
            size=$(du -h "$config_backup" | cut -f1)
            echo "✓ Configuration backup created: $size"
          else
            echo "✗ Configuration backup failed"
          fi
        else
          echo "No configuration paths found"
        fi
      register: config_backup
      when: inventory_hostname in groups['synology']
      become: true

    - name: Backup databases with consistency checks
      shell: |
        # Dumps contain full database contents: keep them owner-only.
        umask 077

        backup_date=$(date +%Y%m%d_%H%M%S)
        db_backup_dir="{{ dr_backup_root }}/databases/{{ inventory_hostname }}_${backup_date}"
        mkdir -p "$db_backup_dir"

        echo "=== Database Backup for {{ inventory_hostname }} ==="

        # PostgreSQL databases
        for container in $(docker ps --filter "ancestor=postgres" --format "{% raw %}{{.Names}}{% endraw %}" 2>/dev/null); do
          echo "Backing up PostgreSQL container: $container"

          # Create backup
          docker exec "$container" pg_dumpall -U postgres > "${db_backup_dir}/${container}_postgres.sql" 2>/dev/null

          # Verify backup (non-empty dump file)
          if [ -s "${db_backup_dir}/${container}_postgres.sql" ]; then
            lines=$(wc -l < "${db_backup_dir}/${container}_postgres.sql")
            size=$(du -h "${db_backup_dir}/${container}_postgres.sql" | cut -f1)
            echo "✓ $container: $lines lines, $size"

            # Connectivity check only: confirms the server answers queries,
            # it does not restore or replay the dump.
            if docker exec "$container" psql -U postgres -c "SELECT version();" >/dev/null 2>&1; then
              echo "✓ $container: Database connection verified"
            else
              echo "✗ $container: Database connection failed"
            fi
          else
            echo "✗ $container: Backup failed or empty"
          fi
        done

        # MariaDB/MySQL databases
        for container in $(docker ps --filter "ancestor=mariadb" --format "{% raw %}{{.Names}}{% endraw %}" 2>/dev/null); do
          echo "Backing up MariaDB container: $container"

          docker exec "$container" mysqldump --all-databases -u root > "${db_backup_dir}/${container}_mariadb.sql" 2>/dev/null

          if [ -s "${db_backup_dir}/${container}_mariadb.sql" ]; then
            lines=$(wc -l < "${db_backup_dir}/${container}_mariadb.sql")
            size=$(du -h "${db_backup_dir}/${container}_mariadb.sql" | cut -f1)
            echo "✓ $container: $lines lines, $size"
          else
            echo "✗ $container: Backup failed or empty"
          fi
        done

        # MongoDB databases
        for container in $(docker ps --filter "ancestor=mongo" --format "{% raw %}{{.Names}}{% endraw %}" 2>/dev/null); do
          echo "Backing up MongoDB container: $container"

          docker exec "$container" mongodump --archive > "${db_backup_dir}/${container}_mongodb.archive" 2>/dev/null

          if [ -s "${db_backup_dir}/${container}_mongodb.archive" ]; then
            size=$(du -h "${db_backup_dir}/${container}_mongodb.archive" | cut -f1)
            echo "✓ $container: $size"
          else
            echo "✗ $container: Backup failed or empty"
          fi
        done

        echo "Database backup completed: $db_backup_dir"
      register: database_backup
      when: inventory_hostname in groups['synology']
      become: true

    # Renders a per-host Markdown runbook next to the backups.
    - name: Create recovery plan document
      copy:
        content: |
          # Disaster Recovery Plan - {{ inventory_hostname }}
          Generated: {{ ansible_date_time.iso8601 }}

          ## System Information
          - Hostname: {{ inventory_hostname }}
          - IP Address: {{ ansible_default_ipv4.address }}
          - OS: {{ ansible_facts['os_family'] }} {{ ansible_facts['distribution_version'] }}
          - Groups: {{ group_names | join(', ') }}

          ## Recovery Priority Order

          ### Tier 1 - Critical Infrastructure (Start First)
          {% for service in recovery_priority_tiers.tier_1_critical %}
          - {{ service }}
          {% endfor %}

          ### Tier 2 - Core Infrastructure
          {% for service in recovery_priority_tiers.tier_2_infrastructure %}
          - {{ service }}
          {% endfor %}

          ### Tier 3 - Applications
          {% for service in recovery_priority_tiers.tier_3_services %}
          - {{ service }}
          {% endfor %}

          ### Tier 4 - Optional Services
          {% for service in recovery_priority_tiers.tier_4_optional %}
          - {{ service }}
          {% endfor %}

          ## Recovery Procedures

          ### 1. System Recovery
          ```bash
          # Restore system configurations
          tar -xzf {{ dr_backup_root }}/configs/{{ inventory_hostname }}_configs_*.tar.gz -C /

          # Restart essential services
          systemctl restart docker
          systemctl restart tailscaled
          ```

          ### 2. Database Recovery
          ```bash
          # PostgreSQL restore example
          docker exec -i <postgres_container> psql -U postgres < backup.sql

          # MariaDB restore example
          docker exec -i <mariadb_container> mysql -u root < backup.sql

          # MongoDB restore example
          docker exec -i <mongo_container> mongorestore --archive < backup.archive
          ```

          ### 3. Container Recovery
          ```bash
          # Pull latest images
          docker-compose pull

          # Start containers in priority order
          docker-compose up -d <tier_1_services>
          # Wait for health checks, then continue with tier 2, etc.
          ```

          ## Verification Steps

          ### Health Checks
          - [ ] All critical containers running
          - [ ] Database connections working
          - [ ] Web interfaces accessible
          - [ ] Monitoring systems operational
          - [ ] Backup systems functional

          ### Network Connectivity
          - [ ] Tailscale mesh connected
          - [ ] DNS resolution working
          - [ ] External services accessible
          - [ ] Inter-container communication working

          ## Emergency Contacts & Resources

          ### Key Services URLs
          {% if inventory_hostname == 'atlantis' %}
          - Portainer: https://192.168.0.200:9443
          - Plex: http://{{ ansible_default_ipv4.address }}:32400
          - Immich: http://{{ ansible_default_ipv4.address }}:2283
          {% elif inventory_hostname == 'calypso' %}
          - Gitea: https://git.vish.gg
          - Authentik: https://auth.vish.gg
          - Paperless: http://{{ ansible_default_ipv4.address }}:8000
          {% endif %}

          ### Documentation
          - Repository: https://git.vish.gg/Vish/homelab
          - Ansible Playbooks: /home/homelab/organized/repos/homelab/ansible/automation/
          - Monitoring: https://gf.vish.gg

          ## Backup Locations
          - Configurations: {{ dr_backup_root }}/configs/
          - Databases: {{ dr_backup_root }}/databases/
          - Docker Volumes: {{ dr_backup_root }}/volumes/
          - System State: {{ dr_backup_root }}/system/
        dest: "{{ dr_backup_root }}/recovery-plans/{{ inventory_hostname }}_recovery_plan.md"
        mode: '0644'
      when: inventory_hostname in groups['synology']
      become: true

    # Read-only readiness probe; nothing is restored or modified.
    - name: Test disaster recovery procedures (dry run)
      shell: |
        echo "=== Disaster Recovery Test - {{ inventory_hostname }} ==="
        echo "Timestamp: $(date)"
        echo ""

        echo "=== Backup Verification ==="

        # Check configuration backups
        config_backups=$(find {{ dr_backup_root }}/configs -name "{{ inventory_hostname }}_configs_*.tar.gz" 2>/dev/null | wc -l)
        echo "Configuration backups: $config_backups"

        # Check database backups
        db_backups=$(find {{ dr_backup_root }}/databases -name "{{ inventory_hostname }}_*" -type d 2>/dev/null | wc -l)
        echo "Database backup sets: $db_backups"

        echo ""
        echo "=== Recovery Readiness ==="

        # Check if Docker is available
        if command -v docker >/dev/null 2>&1; then
          echo "✓ Docker available"

          # Check if compose files exist
          compose_files=$(find /volume1/docker -name "docker-compose.yml" 2>/dev/null | wc -l)
          echo "✓ Docker Compose files: $compose_files"
        else
          echo "✗ Docker not available"
        fi

        # Check Tailscale
        if command -v tailscale >/dev/null 2>&1; then
          echo "✓ Tailscale available"
        else
          echo "✗ Tailscale not available"
        fi

        # Check network connectivity
        if ping -c 1 8.8.8.8 >/dev/null 2>&1; then
          echo "✓ Internet connectivity"
        else
          echo "✗ No internet connectivity"
        fi

        echo ""
        echo "=== Critical Service Status ==="

        {% for tier_name, services in recovery_priority_tiers.items() %}
        echo "{{ tier_name | replace('_', ' ') | title }}:"
        {% for service in services %}
        if docker ps --filter "name={{ service }}" --format "{% raw %}{{.Names}}{% endraw %}" | grep -q "{{ service }}"; then
          echo "  ✓ {{ service }}"
        else
          echo "  ✗ {{ service }}"
        fi
        {% endfor %}
        echo ""
        {% endfor %}
      register: dr_test
      changed_when: false
      when: inventory_hostname in groups['synology']
      become: true

    # Written on the control node, one file per inventory host.
    # A skipped task still registers its variable (without 'stdout'), so the
    # fallbacks below use default() rather than an 'is defined' test.
    - name: Generate disaster recovery report
      copy:
        content: |
          # Disaster Recovery Report - {{ inventory_hostname }}
          Generated: {{ ansible_date_time.iso8601 }}

          ## System Inventory
          ```
          {{ system_inventory.stdout }}
          ```

          ## Configuration Backup
          ```
          {{ config_backup.stdout | default('Not performed on this host') }}
          ```

          ## Database Backup
          ```
          {{ database_backup.stdout | default('Not performed on this host') }}
          ```

          ## Recovery Readiness Test
          ```
          {{ dr_test.stdout | default('Not performed on this host') }}
          ```

          ## Recommendations

          {% if inventory_hostname in groups['synology'] %}
          ### For {{ inventory_hostname }}:
          - ✅ Primary backup location configured
          - ✅ Recovery plan generated
          - 🔧 Schedule regular DR tests
          - 🔧 Verify off-site backup replication
          {% else %}
          ### For {{ inventory_hostname }}:
          - 🔧 Configure local backup procedures
          - 🔧 Ensure critical data is replicated to Synology hosts
          - 🔧 Document service-specific recovery steps
          {% endif %}

          ## Next Steps
          1. Review recovery plan: {{ dr_backup_root }}/recovery-plans/{{ inventory_hostname }}_recovery_plan.md
          2. Test recovery procedures in non-production environment
          3. Schedule regular backup verification
          4. Update recovery documentation as services change
        dest: "/tmp/disaster_recovery_{{ inventory_hostname }}_{{ ansible_date_time.epoch }}.md"
        mode: '0644'
      delegate_to: localhost

    - name: Display disaster recovery summary
      debug:
        msg: |
          Disaster Recovery Summary for {{ inventory_hostname }}:
          - System Inventory: ✅ Complete
          - Configuration Backup: {{ '✅ Complete' if config_backup is not skipped else '⏭️ Skipped (not Synology)' }}
          - Database Backup: {{ '✅ Complete' if database_backup is not skipped else '⏭️ Skipped (not Synology)' }}
          - Recovery Plan: {{ '✅ Generated' if inventory_hostname in groups['synology'] else '⏭️ Host-specific plan needed' }}
          - Report: /tmp/disaster_recovery_{{ inventory_hostname }}_{{ ansible_date_time.epoch }}.md
# Final consolidation task: builds one Markdown overview on the control node
# from inventory group membership.
- name: Generate Master Disaster Recovery Plan
  hosts: localhost
  # ansible_date_time is used for the output file name, so facts are needed;
  # the minimal subset is enough and keeps the play fast.
  gather_facts: true
  gather_subset:
    - min
  vars:
    # Play-scoped vars are not shared between plays, so the backup root is
    # declared here as well. Keep it identical to the value used by the
    # "Disaster Recovery Orchestrator" play.
    dr_backup_root: "/volume1/disaster-recovery"
  tasks:
    # The script only echoes text; stdout is captured and saved by the next
    # task. Group lookups fall back to an empty list so that an inventory
    # without one of these groups still produces a plan.
    - name: Create master recovery plan
      shell: |
        echo "# Master Disaster Recovery Plan - Homelab Infrastructure"
        echo "Generated: $(date)"
        echo ""
        echo "## Infrastructure Overview"
        echo "- Total Hosts: {{ groups['all'] | length }}"
        echo "- Synology NAS: {{ groups['synology'] | default([]) | length }}"
        echo "- Debian Clients: {{ groups['debian_clients'] | default([]) | length }}"
        echo "- Hypervisors: {{ groups['hypervisors'] | default([]) | length }}"
        echo ""
        echo "## Recovery Order by Host"
        echo ""
        echo "### Phase 1: Core Infrastructure"
        {% for host in groups['synology'] | default([]) %}
        echo "1. **{{ host }}** - Primary storage and services"
        {% endfor %}
        echo ""
        echo "### Phase 2: Compute Nodes"
        {% for host in groups['debian_clients'] | default([]) %}
        echo "2. **{{ host }}** - Applications and services"
        {% endfor %}
        echo ""
        echo "### Phase 3: Specialized Systems"
        {% for host in groups['hypervisors'] | default([]) %}
        echo "3. **{{ host }}** - Virtualization and specialized services"
        {% endfor %}
        echo ""
        echo "## Critical Recovery Procedures"
        echo ""
        echo "### 1. Network Recovery"
        echo "- Restore Tailscale mesh connectivity"
        echo "- Verify DNS resolution (AdGuard Home)"
        echo "- Test inter-host communication"
        echo ""
        echo "### 2. Storage Recovery"
        echo "- Mount all required volumes"
        echo "- Verify RAID integrity on Synology systems"
        echo "- Test backup accessibility"
        echo ""
        echo "### 3. Service Recovery"
        echo "- Start Tier 1 services (databases, auth)"
        echo "- Start Tier 2 services (core infrastructure)"
        echo "- Start Tier 3 services (applications)"
        echo "- Start Tier 4 services (optional)"
        echo ""
        echo "## Verification Checklist"
        echo "- [ ] All hosts accessible via Tailscale"
        echo "- [ ] All critical containers running"
        echo "- [ ] Monitoring systems operational"
        echo "- [ ] Backup systems functional"
        echo "- [ ] User services accessible"
        echo ""
        echo "## Emergency Resources"
        echo "- Repository: https://git.vish.gg/Vish/homelab"
        echo "- Ansible Playbooks: /home/homelab/organized/repos/homelab/ansible/automation/"
        echo "- Individual Host Reports: /tmp/disaster_recovery_*.md"
      register: master_plan
      changed_when: false

    - name: Save master disaster recovery plan
      copy:
        content: "{{ master_plan.stdout }}"
        dest: "/tmp/master_disaster_recovery_plan_{{ ansible_date_time.epoch }}.md"
        mode: '0644'

    - name: Display final summary
      debug:
        msg: |
          🚨 Disaster Recovery Orchestration Complete!

          📋 Generated Reports:
          - Master Plan: /tmp/master_disaster_recovery_plan_{{ ansible_date_time.epoch }}.md
          - Individual Reports: /tmp/disaster_recovery_*.md
          - Recovery Plans: {{ dr_backup_root }}/recovery-plans/ (on Synology hosts)

          🔧 Next Steps:
          1. Review the master disaster recovery plan
          2. Test recovery procedures in a safe environment
          3. Schedule regular DR drills
          4. Keep recovery documentation updated