Sanitized mirror from private repository - 2026-03-28 12:26:38 UTC
This commit is contained in:
667
docs/advanced/ansible.md
Normal file
667
docs/advanced/ansible.md
Normal file
@@ -0,0 +1,667 @@
|
||||
# 🤖 Ansible Automation Guide
|
||||
|
||||
**🔴 Advanced Guide**
|
||||
|
||||
This guide covers the Ansible automation system used to manage all 176 services across 13 hosts in this homelab. Ansible enables Infrastructure as Code, automated deployments, and consistent configuration management.
|
||||
|
||||
## 🎯 Ansible in This Homelab
|
||||
|
||||
### 📊 **Current Automation Scope**
|
||||
- **13 hosts** managed through Ansible inventory
|
||||
- **176 services** deployed via playbooks
|
||||
- **Automated health checks** across all systems
|
||||
- **Configuration management** for consistent settings
|
||||
- **Deployment automation** for new services
|
||||
|
||||
### 🏗️ **Architecture Overview**
|
||||
```
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ Git Repository│───►│ Ansible Control│───►│ Target Hosts │
|
||||
│ (This repo) │ │ Node │ │ (All systems) │
|
||||
│ │ │ │ │ │
|
||||
│ • Playbooks │ │ • Inventory │ │ • Docker │
|
||||
│ • Inventory │ │ • Execution │ │ • Services │
|
||||
│ • Variables │ │ • Logging │ │ • Configuration │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📁 Repository Structure
|
||||
|
||||
### 🗂️ **Ansible Directory Layout**
|
||||
```
|
||||
ansible/
|
||||
├── automation/
|
||||
│ ├── ansible.cfg # Ansible configuration
|
||||
│ ├── hosts # Main inventory file
|
||||
│ ├── hosts.ini # Alternative inventory format
|
||||
│ ├── group_vars/ # Group-specific variables
|
||||
│ │ ├── all.yml
|
||||
│ │ ├── synology.yml
|
||||
│ │ └── debian_clients.yml
|
||||
│ ├── host_vars/ # Host-specific variables
|
||||
│ │ ├── atlantis.yml
|
||||
│ │ ├── calypso.yml
|
||||
│ │ └── homelab.yml
|
||||
│ ├── playbooks/ # Ansible playbooks
|
||||
│ │ ├── deploy-service.yml
|
||||
│ │ ├── health-check.yml
|
||||
│ │ ├── system-update.yml
|
||||
│ │ └── backup.yml
|
||||
│ └── scripts/ # Helper scripts
|
||||
│ ├── deploy.sh
|
||||
│ └── health-check.sh
|
||||
├── deploy_arr_suite_full.yml # Specific deployment playbooks
|
||||
├── deploy_arr_suite_updated.yml
|
||||
└── inventory.ini # Legacy inventory
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🏠 Inventory Management
|
||||
|
||||
### 📋 **Host Groups**
|
||||
The inventory organizes hosts into logical groups:
|
||||
|
||||
```ini
|
||||
# Core Management Node
|
||||
[homelab]
|
||||
homelab ansible_host=100.67.40.126 ansible_user=homelab
|
||||
|
||||
# Synology NAS Cluster
|
||||
[synology]
|
||||
atlantis ansible_host=100.83.230.112 ansible_port=60000 ansible_user=vish
|
||||
calypso ansible_host=100.103.48.78 ansible_port=62000 ansible_user=Vish
|
||||
setillo ansible_host=100.125.0.20 ansible_user=vish
|
||||
|
||||
# Raspberry Pi Nodes
|
||||
[rpi]
|
||||
pi-5 ansible_host=100.77.151.40 ansible_user=vish
|
||||
pi-5-kevin ansible_host=100.123.246.75 ansible_user=vish
|
||||
|
||||
# Hypervisors / Storage
|
||||
[hypervisors]
|
||||
pve ansible_host=100.87.12.28 ansible_user=root
|
||||
truenas-scale ansible_host=100.75.252.64 ansible_user=vish
|
||||
|
||||
# Remote Systems
|
||||
[remote]
|
||||
vish-concord-nuc ansible_host=100.72.55.21 ansible_user=vish
|
||||
vmi2076105 ansible_host=100.99.156.20 ansible_user=root
|
||||
|
||||
# Active Group (used by most playbooks)
|
||||
[active:children]
|
||||
homelab
|
||||
synology
|
||||
rpi
|
||||
hypervisors
|
||||
remote
|
||||
```
|
||||
|
||||
### 🔧 **Host Variables**
|
||||
Each host has specific configuration:
|
||||
|
||||
```yaml
|
||||
# host_vars/atlantis.yml
|
||||
---
|
||||
# Synology-specific settings
|
||||
synology_user_id: 1026
|
||||
synology_group_id: 100
|
||||
docker_compose_path: /volume1/docker
|
||||
media_path: /volume1/media
|
||||
|
||||
# Service-specific settings
|
||||
plex_enabled: true
|
||||
grafana_enabled: true
|
||||
prometheus_enabled: true
|
||||
|
||||
# Network settings
|
||||
tailscale_ip: 100.83.230.112
|
||||
local_ip: 10.0.0.250
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📖 Playbook Examples
|
||||
|
||||
### 🚀 **Service Deployment Playbook**
|
||||
```yaml
|
||||
---
|
||||
- name: Deploy Docker Service
|
||||
hosts: "{{ target_host | default('all') }}"
|
||||
become: true
|
||||
vars:
|
||||
# service_name is supplied via --extra-vars; redefining it as
# service_name: "{{ service_name }}" here causes a recursive-loop error
|
||||
service_path: "{{ service_path | default('/opt/docker/' + service_name) }}"
|
||||
|
||||
tasks:
|
||||
- name: Create service directory
|
||||
file:
|
||||
path: "{{ service_path }}"
|
||||
state: directory
|
||||
owner: "{{ ansible_user }}"
|
||||
group: "{{ ansible_user }}"
|
||||
mode: '0755'
|
||||
|
||||
- name: Copy docker-compose file
|
||||
template:
|
||||
src: "{{ service_name }}/docker-compose.yml.j2"
|
||||
dest: "{{ service_path }}/docker-compose.yml"
|
||||
owner: "{{ ansible_user }}"
|
||||
group: "{{ ansible_user }}"
|
||||
mode: '0644'
|
||||
notify: restart service
|
||||
|
||||
- name: Copy environment file
|
||||
template:
|
||||
src: "{{ service_name }}/.env.j2"
|
||||
dest: "{{ service_path }}/.env"
|
||||
owner: "{{ ansible_user }}"
|
||||
group: "{{ ansible_user }}"
|
||||
mode: '0600'
|
||||
notify: restart service
|
||||
|
||||
- name: Start service
|
||||
# NOTE: the legacy docker_compose module was removed in community.docker 4.x;
# on current installs use community.docker.docker_compose_v2 instead
docker_compose:
|
||||
project_src: "{{ service_path }}"
|
||||
state: present
|
||||
pull: true
|
||||
|
||||
- name: Wait for service to be healthy
|
||||
uri:
|
||||
url: "http://{{ ansible_host }}:{{ service_port }}/health"
|
||||
method: GET
|
||||
status_code: 200
|
||||
register: health_result
until: health_result.status == 200  # retries only take effect when paired with until
retries: 30
|
||||
delay: 10
|
||||
when: service_health_check is defined
|
||||
|
||||
handlers:
|
||||
- name: restart service
|
||||
docker_compose:
|
||||
project_src: "{{ service_path }}"
|
||||
state: present
|
||||
pull: true
|
||||
recreate: always
|
||||
```
|
||||
|
||||
### 🔍 **Health Check Playbook**
|
||||
```yaml
|
||||
---
|
||||
- name: Health Check All Services
|
||||
hosts: active
|
||||
gather_facts: true  # ansible_date_time (used in the report filename below) requires facts
|
||||
|
||||
tasks:
|
||||
- name: Check Docker daemon
|
||||
# NOTE: state: started will start Docker if it is stopped — this is an
# enforcing task, not a read-only check
systemd:
|
||||
name: docker
|
||||
state: started
|
||||
register: docker_status
|
||||
|
||||
- name: Get running containers
|
||||
docker_host_info:
|
||||
containers: true
|
||||
register: docker_info
|
||||
|
||||
- name: Check container health
|
||||
docker_container_info:
|
||||
name: "{{ item }}"
|
||||
register: container_health
|
||||
loop: "{{ expected_containers | default([]) }}"
|
||||
when: expected_containers is defined
|
||||
|
||||
- name: Test service endpoints
|
||||
uri:
|
||||
url: "http://{{ ansible_host }}:{{ item.port }}{{ item.path | default('/') }}"
|
||||
method: GET
|
||||
timeout: 10
|
||||
register: endpoint_check
|
||||
loop: "{{ service_endpoints | default([]) }}"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Generate health report
|
||||
template:
|
||||
src: health-report.j2
|
||||
dest: "/tmp/health-{{ inventory_hostname }}-{{ ansible_date_time.epoch }}.json"
|
||||
delegate_to: localhost
|
||||
```
|
||||
|
||||
### 🔄 **System Update Playbook**
|
||||
```yaml
|
||||
---
|
||||
- name: Update Systems and Services
|
||||
# NOTE: debian_clients is not defined in the inventory example above —
# add a [debian_clients] group (or use an existing group such as 'active')
hosts: debian_clients
|
||||
become: true
|
||||
serial: 1 # Update one host at a time
|
||||
|
||||
pre_tasks:
|
||||
- name: Check if reboot required
|
||||
stat:
|
||||
path: /var/run/reboot-required
|
||||
register: reboot_required
|
||||
|
||||
tasks:
|
||||
- name: Update package cache
|
||||
apt:
|
||||
update_cache: true
|
||||
cache_valid_time: 3600
|
||||
|
||||
- name: Upgrade packages
|
||||
apt:
|
||||
upgrade: dist
|
||||
autoremove: true
|
||||
autoclean: true
|
||||
|
||||
- name: Update Docker containers
|
||||
shell: |
|
||||
cd {{ item }}
|
||||
docker-compose pull
|
||||
docker-compose up -d
|
||||
loop: "{{ docker_compose_paths | default([]) }}"
|
||||
when: docker_compose_paths is defined
|
||||
|
||||
- name: Clean up Docker
|
||||
docker_prune:
|
||||
containers: true
|
||||
images: true
|
||||
networks: true
|
||||
volumes: false # Don't remove volumes
|
||||
builder_cache: true
|
||||
|
||||
post_tasks:
|
||||
- name: Reboot if required
|
||||
reboot:
|
||||
reboot_timeout: 300
|
||||
when: reboot_required.stat.exists
|
||||
|
||||
- name: Wait for services to start
|
||||
wait_for:
|
||||
port: "{{ item }}"
|
||||
timeout: 300
|
||||
loop: "{{ critical_ports | default([22, 80, 443]) }}"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Configuration Management
|
||||
|
||||
### ⚙️ **Ansible Configuration**
|
||||
```ini
|
||||
# ansible.cfg
|
||||
[defaults]
|
||||
inventory = hosts
|
||||
host_key_checking = False
|
||||
timeout = 30
|
||||
gathering = smart
|
||||
fact_caching = jsonfile
|
||||
fact_caching_connection = /tmp/ansible_facts_cache
|
||||
fact_caching_timeout = 86400
|
||||
|
||||
[ssh_connection]
|
||||
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o UserKnownHostsFile=/dev/null
|
||||
pipelining = True
|
||||
```
|
||||
|
||||
### 📊 **Group Variables**
|
||||
```yaml
|
||||
# group_vars/all.yml
|
||||
---
|
||||
# Global settings
|
||||
timezone: America/Los_Angeles
|
||||
docker_compose_version: "2.0"
|
||||
default_restart_policy: "on-failure:5"
|
||||
|
||||
# Security settings
|
||||
security_hardening: true
|
||||
no_new_privileges: true
|
||||
default_user_mapping: "1000:1000"
|
||||
|
||||
# Monitoring settings
|
||||
prometheus_enabled: true
|
||||
grafana_enabled: true
|
||||
uptime_kuma_enabled: true
|
||||
|
||||
# Backup settings
|
||||
backup_enabled: true
|
||||
backup_retention_days: 30
|
||||
```
|
||||
|
||||
```yaml
|
||||
# group_vars/synology.yml
|
||||
---
|
||||
# Synology-specific overrides
|
||||
default_user_mapping: "1026:100"
|
||||
docker_compose_path: "/volume1/docker"
|
||||
media_path: "/volume1/media"
|
||||
backup_path: "/volume1/backups"
|
||||
|
||||
# Synology Docker settings
|
||||
docker_socket: "/var/run/docker.sock"
|
||||
docker_data_root: "/volume1/@docker"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Deployment Workflows
|
||||
|
||||
### 📦 **Single Service Deployment**
|
||||
```bash
|
||||
# Deploy a specific service to a specific host
|
||||
ansible-playbook -i hosts playbooks/deploy-service.yml \
|
||||
--extra-vars "target_host=atlantis service_name=uptime-kuma"
|
||||
|
||||
# Deploy to multiple hosts
|
||||
ansible-playbook -i hosts playbooks/deploy-service.yml \
|
||||
--extra-vars "target_host=synology service_name=watchtower"
|
||||
|
||||
# Deploy with custom variables
|
||||
ansible-playbook -i hosts playbooks/deploy-service.yml \
|
||||
--extra-vars "target_host=homelab service_name=grafana grafana_port=3001"
|
||||
```
|
||||
|
||||
### 🏗️ **Full Stack Deployment**
|
||||
```bash
|
||||
# Deploy entire Arr suite to Atlantis
|
||||
ansible-playbook -i hosts deploy_arr_suite_full.yml \
|
||||
--limit atlantis
|
||||
|
||||
# Deploy monitoring stack to all hosts
|
||||
ansible-playbook -i hosts playbooks/deploy-monitoring.yml
|
||||
|
||||
# Deploy with dry-run first
|
||||
ansible-playbook -i hosts playbooks/deploy-service.yml \
|
||||
--check --diff --extra-vars "service_name=new-service"
|
||||
```
|
||||
|
||||
### 🔍 **Health Checks and Monitoring**
|
||||
```bash
|
||||
# Run health checks on all active hosts
|
||||
ansible-playbook -i hosts playbooks/health-check.yml
|
||||
|
||||
# Check specific service group
|
||||
ansible-playbook -i hosts playbooks/health-check.yml \
|
||||
--limit synology
|
||||
|
||||
# Generate detailed health report
|
||||
ansible-playbook -i hosts playbooks/health-check.yml \
|
||||
--extra-vars "detailed_report=true"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Advanced Automation
|
||||
|
||||
### 🔄 **Automated Updates**
|
||||
```yaml
|
||||
# Cron job for automated updates
|
||||
---
|
||||
- name: Setup Automated Updates
|
||||
hosts: all
|
||||
become: true
|
||||
|
||||
tasks:
|
||||
- name: Create update script
|
||||
template:
|
||||
src: update-script.sh.j2
|
||||
dest: /usr/local/bin/homelab-update
|
||||
mode: '0755'
|
||||
|
||||
- name: Schedule weekly updates
|
||||
cron:
|
||||
name: "Homelab automated update"
|
||||
minute: "0"
|
||||
hour: "2"
|
||||
weekday: "0" # Sunday
|
||||
job: "/usr/local/bin/homelab-update >> /var/log/homelab-update.log 2>&1"
|
||||
```
|
||||
|
||||
### 📈 **Monitoring Integration**
|
||||
```yaml
|
||||
# Deploy monitoring agents
|
||||
---
|
||||
- name: Deploy Monitoring Stack
|
||||
hosts: all
|
||||
|
||||
tasks:
|
||||
- name: Deploy Node Exporter
|
||||
docker_container:
|
||||
name: node-exporter
|
||||
image: prom/node-exporter:latest
|
||||
ports:
|
||||
- "9100:9100"
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
command:
|
||||
- '--path.procfs=/host/proc'
|
||||
- '--path.rootfs=/rootfs'
|
||||
- '--path.sysfs=/host/sys'
|
||||
restart_policy: on-failure
|
||||
|
||||
# NOTE: Prometheus' /api/v1/targets endpoint is read-only — targets cannot
# be registered via POST. Use file-based service discovery instead:
- name: Register with Prometheus (file_sd)
|
||||
copy:
|
||||
content: '[{"targets": ["{{ ansible_host }}:9100"]}]'
|
||||
dest: "/etc/prometheus/file_sd/{{ inventory_hostname }}.json"
|
||||
mode: "0644"
|
||||
delegate_to: "{{ prometheus_server }}"
|
||||
```
|
||||
|
||||
### 🔐 **Security Automation**
|
||||
```yaml
|
||||
# Security hardening playbook
|
||||
---
|
||||
- name: Security Hardening
|
||||
hosts: all
|
||||
become: true
|
||||
|
||||
tasks:
|
||||
- name: Update all packages
|
||||
package:
|
||||
name: "*"
|
||||
state: latest
|
||||
|
||||
- name: Configure firewall
|
||||
ufw:
|
||||
rule: allow
|
||||
port: "{{ item }}"
|
||||
loop: "{{ allowed_ports | default([22, 80, 443]) }}"
|
||||
|
||||
- name: Disable root SSH
|
||||
lineinfile:
|
||||
path: /etc/ssh/sshd_config
|
||||
regexp: '^PermitRootLogin'
|
||||
line: 'PermitRootLogin no'
|
||||
notify: restart ssh
|
||||
|
||||
- name: Configure fail2ban
|
||||
package:
|
||||
name: fail2ban
|
||||
state: present
|
||||
|
||||
- name: Harden Docker daemon
|
||||
template:
|
||||
src: docker-daemon.json.j2
|
||||
dest: /etc/docker/daemon.json
|
||||
notify: restart docker
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔍 Troubleshooting Ansible
|
||||
|
||||
### ❌ **Common Issues**
|
||||
|
||||
#### **SSH Connection Failures**
|
||||
```bash
|
||||
# Test SSH connectivity
|
||||
ansible all -i hosts -m ping
|
||||
|
||||
# Debug SSH issues
|
||||
ansible all -i hosts -m ping -vvv
|
||||
|
||||
# Test with specific user
|
||||
ansible all -i hosts -m ping -u username
|
||||
|
||||
# Check SSH key permissions
|
||||
chmod 600 ~/.ssh/id_rsa
|
||||
```
|
||||
|
||||
#### **Permission Issues**
|
||||
```bash
|
||||
# Test sudo access
|
||||
ansible all -i hosts -m shell -a "sudo whoami" -b
|
||||
|
||||
# Fix sudo configuration
|
||||
ansible all -i hosts -m lineinfile -a "path=/etc/sudoers.d/ansible line='ansible ALL=(ALL) NOPASSWD:ALL'" -b
|
||||
|
||||
# Check user groups
|
||||
ansible all -i hosts -m shell -a "groups"
|
||||
```
|
||||
|
||||
#### **Docker Issues**
|
||||
```bash
|
||||
# Check Docker status
|
||||
ansible all -i hosts -m systemd -a "name=docker state=started" -b
|
||||
|
||||
# Test Docker access
|
||||
ansible all -i hosts -m shell -a "docker ps"
|
||||
|
||||
# Add user to docker group
|
||||
ansible all -i hosts -m user -a "name={{ ansible_user }} groups=docker append=yes" -b
|
||||
```
|
||||
|
||||
### 🔧 **Debugging Techniques**
|
||||
|
||||
#### **Verbose Output**
|
||||
```bash
|
||||
# Increase verbosity
|
||||
ansible-playbook -vvv playbook.yml
|
||||
|
||||
# Debug specific tasks
|
||||
ansible-playbook playbook.yml --start-at-task="Task Name"
|
||||
|
||||
# Check mode (dry run)
|
||||
ansible-playbook playbook.yml --check --diff
|
||||
```
|
||||
|
||||
#### **Fact Gathering**
|
||||
```bash
|
||||
# Gather all facts
|
||||
ansible hostname -i hosts -m setup
|
||||
|
||||
# Gather specific facts
|
||||
ansible hostname -i hosts -m setup -a "filter=ansible_distribution*"
|
||||
|
||||
# Custom fact gathering
|
||||
ansible hostname -i hosts -m shell -a "docker --version"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Monitoring Ansible
|
||||
|
||||
### 📈 **Execution Tracking**
|
||||
```yaml
|
||||
# Callback plugins for monitoring
|
||||
# ansible.cfg
|
||||
[defaults]
|
||||
callback_plugins = /usr/share/ansible/plugins/callback
|
||||
stdout_callback = json
|
||||
callbacks_enabled = timer, profile_tasks, log_plays  # callback_whitelist on Ansible < 2.11
|
||||
|
||||
# Log all playbook runs
|
||||
log_path = /var/log/ansible.log
|
||||
```
|
||||
|
||||
### 📊 **Performance Metrics**
|
||||
```bash
|
||||
# Time playbook execution
|
||||
time ansible-playbook playbook.yml
|
||||
|
||||
# Profile task execution (profile_tasks is a callback plugin, not an extra-var;
# it must be enabled via callbacks_enabled or this environment variable)
|
||||
ANSIBLE_CALLBACKS_ENABLED=profile_tasks ansible-playbook playbook.yml
|
||||
|
||||
# Monitor resource usage
|
||||
htop # During playbook execution
|
||||
```
|
||||
|
||||
### 🚨 **Error Handling**
|
||||
```yaml
|
||||
# Robust error handling
|
||||
---
|
||||
- name: Deploy with error handling
|
||||
hosts: all
|
||||
ignore_errors: false
|
||||
any_errors_fatal: false
|
||||
|
||||
tasks:
|
||||
- name: Risky task
|
||||
shell: potentially_failing_command
|
||||
register: result
|
||||
failed_when: result.rc != 0 and result.rc != 2 # Allow specific error codes
|
||||
|
||||
- name: Cleanup on failure
|
||||
file:
|
||||
path: /tmp/cleanup
|
||||
state: absent
|
||||
when: result is failed
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🚀 Best Practices
|
||||
|
||||
### ✅ **Playbook Design**
|
||||
- **Idempotency**: Playbooks should be safe to run multiple times
|
||||
- **Error handling**: Always handle potential failures gracefully
|
||||
- **Documentation**: Comment complex tasks and variables
|
||||
- **Testing**: Test playbooks in development before production
|
||||
|
||||
### 🔐 **Security**
|
||||
- **Vault encryption**: Encrypt sensitive variables with ansible-vault
|
||||
- **SSH keys**: Use SSH keys instead of passwords
|
||||
- **Least privilege**: Run tasks with minimum required permissions
|
||||
- **Audit logs**: Keep logs of all Ansible executions
|
||||
|
||||
### 📊 **Performance**
|
||||
- **Parallelism**: Use appropriate fork settings
|
||||
- **Fact caching**: Cache facts to speed up subsequent runs
|
||||
- **Task optimization**: Combine tasks where possible
|
||||
- **Selective execution**: Use tags and limits to run specific parts
|
||||
|
||||
### 🔄 **Maintenance**
|
||||
- **Regular updates**: Keep Ansible and modules updated
|
||||
- **Inventory cleanup**: Remove obsolete hosts and variables
|
||||
- **Playbook refactoring**: Regularly review and improve playbooks
|
||||
- **Documentation**: Keep documentation current with changes
|
||||
|
||||
---
|
||||
|
||||
## 📋 Next Steps
|
||||
|
||||
### 🎯 **Learning Path**
|
||||
1. **Start simple**: Begin with basic playbooks
|
||||
2. **Understand inventory**: Master host and group management
|
||||
3. **Learn templating**: Use Jinja2 for dynamic configurations
|
||||
4. **Explore modules**: Discover Ansible's extensive module library
|
||||
5. **Advanced features**: Roles, collections, and custom modules
|
||||
|
||||
### 📚 **Resources**
|
||||
- **Official docs**: docs.ansible.com
|
||||
- **Ansible Galaxy**: galaxy.ansible.com for roles and collections
|
||||
- **Community**: ansible.com/community
|
||||
- **Training**: Red Hat Ansible training courses
|
||||
|
||||
### 🔗 **Related Documentation**
|
||||
- **[Deployment Guide](../admin/deployment.md)**: Manual deployment processes
|
||||
- **[Infrastructure Overview](../infrastructure/hosts.md)**: Host details and specifications
|
||||
- **[Troubleshooting](../troubleshooting/common-issues.md)**: Common problems and solutions
|
||||
|
||||
---
|
||||
|
||||
*Ansible automation is what makes managing 176 services across 13 hosts feasible. Start with simple playbooks and gradually build more sophisticated automation as your confidence grows.*
|
||||
Reference in New Issue
Block a user