---
# System monitoring playbook.
#
# For every host in the play this collects system, Docker, network and
# service information with shell commands, derives a few headline numbers
# from gathered facts, prints a report, and (via delegate_to: localhost)
# writes a per-host JSON report plus a CSV trend row under
# monitoring_data_dir.
- name: System Monitoring and Metrics Collection
  hosts: all
  gather_facts: true
  vars:
    # Timestamp taken from the gathered facts of the host being processed.
    monitoring_timestamp: "{{ ansible_date_time.iso8601 }}"
    # JSON reports whose mtime is older than this many days are deleted.
    metrics_retention_days: 30
    # Directory that receives the JSON reports and trends.csv.
    monitoring_data_dir: "/tmp/monitoring_data"

  tasks:
    - name: Create monitoring data directory
      file:
        path: "{{ monitoring_data_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true

    # Read-only collection, hence changed_when: false. The "CPU Usage:" line
    # printed here is parsed later by "Calculate performance metrics".
    - name: Collect system metrics
      shell: |
        echo "=== SYSTEM METRICS ==="
        echo "Timestamp: $(date -Iseconds)"
        echo "Hostname: $(hostname)"
        echo "Uptime: $(uptime -p)"
        echo "Load: $(uptime | awk -F'load average:' '{print $2}')"
        echo ""
        echo "=== CPU INFORMATION ==="
        echo "CPU Model: $(lscpu | grep 'Model name' | cut -d':' -f2 | xargs)"
        echo "CPU Cores: $(nproc)"
        echo "CPU Usage: $(top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1)%"
        echo ""
        echo "=== MEMORY INFORMATION ==="
        free -h
        echo ""
        echo "=== DISK USAGE ==="
        df -h
        echo ""
        echo "=== NETWORK INTERFACES ==="
        ip -brief addr show
        echo ""
        echo "=== PROCESS SUMMARY ==="
        ps aux --sort=-%cpu | head -10
        echo ""
        echo "=== SYSTEM TEMPERATURES (if available) ==="
        if command -v sensors >/dev/null 2>&1; then
          sensors 2>/dev/null || echo "Temperature sensors not available"
        else
          echo "lm-sensors not installed"
        fi
      register: system_metrics
      changed_when: false

    # The docker --format argument uses Go-template braces. Ansible templates
    # the whole script with Jinja2 first, so the braces are wrapped in a raw
    # block to reach docker unchanged.
    - name: Collect Docker metrics (if available)
      shell: |
        if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
          echo "=== DOCKER METRICS ==="
          echo "Docker Version: $(docker --version)"
          echo "Containers Running: $(docker ps -q | wc -l)"
          echo "Containers Total: $(docker ps -aq | wc -l)"
          echo "Images: $(docker images -q | wc -l)"
          echo "Volumes: $(docker volume ls -q | wc -l)"
          echo ""
          echo "=== CONTAINER RESOURCE USAGE ==="
          docker stats --no-stream --format "table {% raw %}{{.Container}}\t{{.CPUPerc}}\t{{.MemUsage}}\t{{.NetIO}}\t{{.BlockIO}}{% endraw %}" 2>/dev/null || echo "No running containers"
          echo ""
          echo "=== DOCKER SYSTEM INFO ==="
          docker system df 2>/dev/null || echo "Docker system info not available"
        else
          echo "Docker not available or not accessible"
        fi
      register: docker_metrics
      changed_when: false
      ignore_errors: true

    # A pipeline's exit status is that of its last command (head/tail), so
    # "cmd | head || fallback" never reaches the fallback. The tool choice
    # and the ping result are therefore tested explicitly.
    - name: Collect network metrics
      shell: |
        echo "=== NETWORK METRICS ==="
        echo "Active Connections:"
        if command -v netstat >/dev/null 2>&1; then
          netstat -tuln 2>/dev/null | head -20
        else
          ss -tuln | head -20
        fi
        echo ""
        echo "=== TAILSCALE STATUS ==="
        if command -v tailscale >/dev/null 2>&1; then
          tailscale status 2>/dev/null || echo "Tailscale not accessible"
        else
          echo "Tailscale not installed"
        fi
        echo ""
        echo "=== INTERNET CONNECTIVITY ==="
        if ping_output=$(ping -c 3 8.8.8.8 2>/dev/null); then
          echo "$ping_output" | tail -2
        else
          echo "Internet connectivity test failed"
        fi
      register: network_metrics
      changed_when: false
      ignore_errors: true

    # "No failed services" is printed when the failed-unit listing is empty
    # (this also covers systemctl itself failing, as its output is then
    # empty too).
    - name: Collect service metrics
      shell: |
        echo "=== SERVICE METRICS ==="
        if command -v systemctl >/dev/null 2>&1; then
          echo "Failed Services:"
          failed_units=$(systemctl --failed --no-legend 2>/dev/null)
          if [ -n "$failed_units" ]; then
            echo "$failed_units"
          else
            echo "No failed services"
          fi
          echo ""
          echo "Active Services (sample):"
          systemctl list-units --type=service --state=active --no-legend | head -10
        else
          echo "Systemd not available"
        fi
        echo ""
        echo "=== LOG SUMMARY ==="
        if [ -f /var/log/syslog ]; then
          echo "Recent system log entries:"
          tail -5 /var/log/syslog 2>/dev/null || echo "Cannot access syslog"
        elif command -v journalctl >/dev/null 2>&1; then
          echo "Recent journal entries:"
          journalctl --no-pager -n 5 2>/dev/null || echo "Cannot access journal"
        else
          echo "No accessible system logs"
        fi
      register: service_metrics
      changed_when: false
      ignore_errors: true

    # cpu_usage is parsed from the "CPU Usage: N%" line of system_metrics;
    # regex_findall yields an empty list when that line carries no number,
    # in which case the value falls back to 0.
    # The memory figures come from the ansible_memtotal_mb and
    # ansible_memfree_mb facts.
    # NOTE: despite its name, disk_usage holds size_total of the "/" mount
    # (0 when no such mount is reported); it is not used by later tasks.
    - name: Calculate performance metrics
      set_fact:
        performance_metrics:
          cpu_usage: "{{ system_metrics.stdout | regex_findall('CPU Usage: ([0-9.]+)%') | first | default('0') | float }}"
          memory_total: "{{ ansible_memtotal_mb }}"
          memory_used: "{{ ansible_memtotal_mb - ansible_memfree_mb }}"
          memory_percent: "{{ ((ansible_memtotal_mb - ansible_memfree_mb) / ansible_memtotal_mb * 100) | round(1) }}"
          disk_usage: "{{ ansible_mounts | selectattr('mount', 'equalto', '/') | map(attribute='size_total') | first | default(0) }}"
          uptime_seconds: "{{ ansible_uptime_seconds }}"

    # Results registered with ignore_errors may lack stdout if the task
    # failed before running, so those lookups carry a default.
    - name: Display monitoring summary
      debug:
        msg: |
          ==========================================
          📊 MONITORING REPORT - {{ inventory_hostname }}
          ==========================================

          🖥️ PERFORMANCE SUMMARY:
          - CPU Usage: {{ performance_metrics.cpu_usage }}%
          - Memory: {{ performance_metrics.memory_percent }}% ({{ performance_metrics.memory_used }}MB/{{ performance_metrics.memory_total }}MB)
          - Uptime: {{ performance_metrics.uptime_seconds | int // 86400 }} days, {{ (performance_metrics.uptime_seconds | int % 86400) // 3600 }} hours

          📈 DETAILED METRICS:
          {{ system_metrics.stdout }}

          🐳 DOCKER METRICS:
          {{ docker_metrics.stdout | default('Docker metrics unavailable') }}

          🌐 NETWORK METRICS:
          {{ network_metrics.stdout | default('Network metrics unavailable') }}

          🔧 SERVICE METRICS:
          {{ service_metrics.stdout | default('Service metrics unavailable') }}

          ==========================================

    # Written on the control node. Every string value is passed through
    # to_json (which emits the surrounding quotes and escapes the content)
    # so unusual characters cannot break the JSON document.
    - name: Generate comprehensive monitoring report
      copy:
        content: |
          {
            "timestamp": {{ monitoring_timestamp | to_json }},
            "hostname": {{ inventory_hostname | to_json }},
            "system_info": {
              "os": {{ (ansible_distribution ~ ' ' ~ ansible_distribution_version) | to_json }},
              "kernel": {{ ansible_kernel | to_json }},
              "architecture": {{ ansible_architecture | to_json }},
              "cpu_cores": {{ ansible_processor_vcpus }},
              "memory_mb": {{ ansible_memtotal_mb }}
            },
            "performance": {
              "cpu_usage_percent": {{ performance_metrics.cpu_usage }},
              "memory_usage_percent": {{ performance_metrics.memory_percent }},
              "memory_used_mb": {{ performance_metrics.memory_used }},
              "memory_total_mb": {{ performance_metrics.memory_total }},
              "uptime_seconds": {{ performance_metrics.uptime_seconds }},
              "uptime_days": {{ performance_metrics.uptime_seconds | int // 86400 }}
            },
            "raw_metrics": {
              "system": {{ system_metrics.stdout | to_json }},
              "docker": {{ docker_metrics.stdout | default('') | to_json }},
              "network": {{ network_metrics.stdout | default('') | to_json }},
              "services": {{ service_metrics.stdout | default('') | to_json }}
            }
          }
        dest: "{{ monitoring_data_dir }}/{{ inventory_hostname }}_metrics_{{ ansible_date_time.epoch }}.json"
      delegate_to: localhost

    # Appends one CSV row per host:
    # timestamp,hostname,cpu_usage,memory_percent,uptime_seconds.
    # The row and the target path are shell-quoted before being handed to sh.
    - name: Create monitoring trend data
      shell: >-
        printf '%s\n' {{ trend_row | quote }}
        >> {{ (monitoring_data_dir ~ '/trends.csv') | quote }}
      vars:
        trend_row: "{{ [monitoring_timestamp, inventory_hostname, performance_metrics.cpu_usage, performance_metrics.memory_percent, performance_metrics.uptime_seconds] | join(',') }}"
      delegate_to: localhost
      ignore_errors: true

    # -print lists each file that is deleted so the task reports "changed"
    # only when something was actually removed. Errors from find are
    # deliberately swallowed (best-effort cleanup).
    - name: Clean old monitoring data
      shell: |
        find {{ monitoring_data_dir | quote }} -name "*.json" -mtime +{{ metrics_retention_days | int }} -print -delete 2>/dev/null || true
      register: cleanup_result
      changed_when: cleanup_result.stdout | default('') | length > 0
      delegate_to: localhost
      run_once: true
      ignore_errors: true

    - name: Summary message
      debug:
        msg: |
          📊 Monitoring complete for {{ inventory_hostname }}
          📄 Report saved to: {{ monitoring_data_dir }}/{{ inventory_hostname }}_metrics_{{ ansible_date_time.epoch }}.json
          📈 Trend data updated in: {{ monitoring_data_dir }}/trends.csv

          Performance Summary:
          - CPU: {{ performance_metrics.cpu_usage }}%
          - Memory: {{ performance_metrics.memory_percent }}%
          - Uptime: {{ performance_metrics.uptime_seconds | int // 86400 }} days