---
# TrueNAS SCALE health-check playbook.
# Collects system, ZFS, SMART, and app status from each host (all probes are
# best-effort: changed_when/failed_when false), prints a human-readable
# summary, and writes a per-host JSON report to the Ansible controller.
- name: TrueNAS SCALE Health Check
  hosts: truenas-scale
  gather_facts: true          # needed for ansible_date_time in the JSON report
  become: true                # zpool/smartctl/midclt require root
  vars:
    report_dir: "/tmp/health_reports"

  tasks:
    # ---------- Report directory ----------
    # Created on the controller (delegate_to: localhost) since the JSON
    # report is written there; run_once avoids redundant per-host creation.
    - name: Ensure health report directory exists
      ansible.builtin.file:
        path: "{{ report_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true

    # ---------- System overview ----------
    # Prefer /etc/version; fall back to the middleware client; never fail.
    - name: TrueNAS version
      ansible.builtin.shell: |
        if [ -f /etc/version ]; then
          cat /etc/version
        elif midclt call system.version 2>/dev/null; then
          true
        else
          echo "version unavailable"
        fi
      register: truenas_version
      changed_when: false
      failed_when: false

    - name: System uptime
      ansible.builtin.command: uptime -p
      register: uptime_pretty
      changed_when: false
      failed_when: false

    # ---------- ZFS pool health ----------
    - name: ZFS pool status (verbose)
      ansible.builtin.command: zpool status -v
      register: zpool_status
      changed_when: false
      failed_when: false

    - name: ZFS pool list with usage
      ansible.builtin.command: zpool list -H
      register: zpool_list
      changed_when: false
      failed_when: false

    - name: Count degraded or faulted pools
      ansible.builtin.shell: >
        zpool status 2>/dev/null
        | grep -E "state:\s*(DEGRADED|FAULTED|OFFLINE|REMOVED)"
        | wc -l
      register: pool_errors
      changed_when: false
      failed_when: false

    # Soft gate: report the failure in play output but keep collecting data.
    - name: Assert all ZFS pools are ONLINE
      ansible.builtin.assert:
        that:
          - pool_errors.stdout | trim | int == 0
        success_msg: "All ZFS pools ONLINE"
        fail_msg: "DEGRADED or FAULTED pool detected"
      ignore_errors: true

    # ---------- ZFS scrub status ----------
    - name: ZFS scrub/scan status per pool
      ansible.builtin.shell: |
        for pool in $(zpool list -H -o name 2>/dev/null); do
          echo "Pool: $pool"
          zpool status "$pool" 2>/dev/null | grep -E "scrub|scan" | head -3
          echo "---"
        done
      register: zpool_scrub
      changed_when: false
      failed_when: false

    # ---------- Dataset usage ----------
    - name: ZFS dataset usage (top-level, up to 20)
      ansible.builtin.shell: >
        zfs list -H -o name,used,avail,refer,mountpoint -d 1 2>/dev/null
        | head -20
      register: zfs_datasets
      changed_when: false
      failed_when: false

    # ---------- SMART disk status ----------
    # Note: empty output here means lsblk returned no physical disks or is
    # unavailable, not that no disks exist. The SMART loop below re-runs
    # lsblk independently.
    - name: List physical disks
      ansible.builtin.shell: >
        lsblk -d -o NAME,SIZE,MODEL,SERIAL 2>/dev/null | grep -v "loop\|sr"
      register: disk_list
      changed_when: false
      failed_when: false

    - name: Check SMART health for each disk
      ansible.builtin.shell: |
        failed=0
        results=""
        for disk in $(lsblk -d -n -o NAME 2>/dev/null | grep -v "loop\|sr"); do
          out=$(smartctl -H /dev/$disk 2>/dev/null | grep -E "SMART overall-health|result:")
          if echo "$out" | grep -qi "FAILED"; then
            failed=$((failed + 1))
            results="$results\n$disk: FAILED ($out)"
          else
            results="$results\n$disk: ${out:-SMART unavailable}"
          fi
        done
        echo -e "SMART failures: $failed$results"
      register: smart_status
      changed_when: false
      failed_when: false

    # ---------- TrueNAS apps (k3s / midclt) ----------
    # Try k3s pod states first; fall back to querying chart releases via the
    # middleware client; otherwise report that no app runtime was detected.
    # NOTE(review): the embedded python3 -c script must stay flush with the
    # block-scalar indent so Python sees it unindented.
    - name: TrueNAS app status
      ansible.builtin.shell: |
        out=$(k3s kubectl get pods -A --no-headers 2>/dev/null \
          | awk '{print $4}' | sort | uniq -c | sort -rn 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        out=$(midclt call chart.release.query 2>/dev/null \
          | python3 -c "
        import json,sys
        try:
            data = json.load(sys.stdin)
            [print(f'{a.get(\"id\",\"?\"):30} {a.get(\"status\",\"?\")}') for a in data]
        except Exception:
            pass
        " 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        echo "App runtime not detected"
      register: app_status
      changed_when: false
      failed_when: false

    # ---------- Summary ----------
    - name: TrueNAS health summary
      ansible.builtin.debug:
        msg: |
          ============================================================
          TrueNAS SCALE Health — {{ inventory_hostname }}
          ============================================================
          Version : {{ truenas_version.stdout | default('unknown') | trim }}
          Uptime : {{ uptime_pretty.stdout | default('n/a') | trim }}

          --- ZFS Pool Status ---
          {{ zpool_status.stdout | default('unavailable') }}

          --- ZFS Pool List ---
          {{ zpool_list.stdout | default('unavailable') }}

          --- Pool Error Count ---
          {{ pool_errors.stdout | default('0') | trim }} degraded/faulted/offline/removed pool(s)

          --- ZFS Scrub / Scan Status ---
          {{ zpool_scrub.stdout | default('unavailable') }}

          --- Dataset Usage (top-level) ---
          {{ zfs_datasets.stdout | default('unavailable') }}

          --- Physical Disks ---
          {{ disk_list.stdout | default('unavailable') }}

          --- SMART Health ---
          {{ smart_status.stdout | default('unavailable') }}

          --- App Status ---
          {{ app_status.stdout | default('unavailable') }}
          ============================================================

    # ---------- JSON report ----------
    # Written on the controller, one file per day per host; task-level vars
    # assemble the structured payload rendered via to_nice_json.
    - name: Write TrueNAS health JSON report
      ansible.builtin.copy:
        content: "{{ report_data | to_nice_json }}"
        dest: "{{ report_dir }}/truenas_{{ ansible_date_time.date }}.json"
      vars:
        report_data:
          timestamp: "{{ ansible_date_time.iso8601 }}"
          host: "{{ inventory_hostname }}"
          truenas_version: "{{ truenas_version.stdout | default('unknown') | trim }}"
          uptime: "{{ uptime_pretty.stdout | default('n/a') | trim }}"
          zpool_status: "{{ zpool_status.stdout | default('') }}"
          zpool_list: "{{ zpool_list.stdout | default('') }}"
          pool_errors: "{{ pool_errors.stdout | default('0') | trim }}"
          zpool_scrub: "{{ zpool_scrub.stdout | default('') }}"
          zfs_datasets: "{{ zfs_datasets.stdout | default('') }}"
          disk_list: "{{ disk_list.stdout | default('') }}"
          smart_status: "{{ smart_status.stdout | default('') }}"
          app_status: "{{ app_status.stdout | default('') }}"
      delegate_to: localhost
      changed_when: false