homelab-optimized/ansible/playbooks/truenas_health.yml

---
- name: TrueNAS SCALE Health Check
  hosts: truenas-scale
  gather_facts: true
  become: true

  vars:
    report_dir: "/tmp/health_reports"
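
  # Hypothetical inventory sketch for context -- the group name matches
  # `hosts` above, but the host and connection details are illustrative:
  #
  #   [truenas-scale]
  #   nas.example.lan ansible_user=admin
  #
  # Invocation would then look something like:
  #   ansible-playbook -i inventory playbooks/truenas_health.yml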

  tasks:
    # ---------- Report directory ----------
    - name: Ensure health report directory exists
      ansible.builtin.file:
        path: "{{ report_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true
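
    # The directory above is created on the Ansible controller
    # (delegate_to: localhost) and only once per run (run_once), since
    # every host's JSON report is written there at the end of the play.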

    # ---------- System overview ----------
    # Version detection: prefer /etc/version; otherwise midclt, whose
    # stdout surfaces from the elif test itself (hence the no-op body).
    - name: TrueNAS version
      ansible.builtin.shell: |
        if [ -f /etc/version ]; then
          cat /etc/version
        elif midclt call system.version 2>/dev/null; then
          true
        else
          echo "version unavailable"
        fi
      register: truenas_version
      changed_when: false
      failed_when: false

    - name: System uptime
      ansible.builtin.command: uptime -p
      register: uptime_pretty
      changed_when: false
      failed_when: false
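
    # `uptime -p` prints a human-readable duration, e.g. "up 2 weeks,
    # 3 days, 5 hours" (illustrative output).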

    # ---------- ZFS pool health ----------
    - name: ZFS pool status (verbose)
      ansible.builtin.command: zpool status -v
      register: zpool_status
      changed_when: false
      failed_when: false

    - name: ZFS pool list with usage
      ansible.builtin.command: zpool list -H
      register: zpool_list
      changed_when: false
      failed_when: false

    - name: Count degraded or faulted pools
      ansible.builtin.shell: >
        zpool status 2>/dev/null
        | grep -E "state:\s*(DEGRADED|FAULTED|OFFLINE|REMOVED)"
        | wc -l
      register: pool_errors
      changed_when: false
      failed_when: false

    - name: Assert all ZFS pools are ONLINE
      ansible.builtin.assert:
        that:
          - pool_errors.stdout | trim | int == 0
        success_msg: "All ZFS pools ONLINE"
        fail_msg: "DEGRADED, FAULTED, OFFLINE, or REMOVED pool detected"
      ignore_errors: true
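
    # For reference, the grep in the count above keys on the per-pool
    # state line of `zpool status` output, e.g. "state: ONLINE" or
    # "state: DEGRADED".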

    # ---------- ZFS scrub status ----------
    - name: ZFS scrub/scan status per pool
      ansible.builtin.shell: |
        for pool in $(zpool list -H -o name 2>/dev/null); do
          echo "Pool: $pool"
          zpool status "$pool" 2>/dev/null | grep -E "scrub|scan" | head -3
          echo "---"
        done
      register: zpool_scrub
      changed_when: false
      failed_when: false
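
    # A healthy pool's matched line looks roughly like (illustrative):
    #   scan: scrub repaired 0B in 00:42:17 with 0 errors on Sun Apr 19 03:45:01 2026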

    # ---------- Dataset usage ----------
    # -d 1 limits recursion to depth 1: pool roots and their direct children.
    - name: ZFS dataset usage (top-level, up to 20)
      ansible.builtin.shell: >
        zfs list -H -o name,used,avail,refer,mountpoint -d 1 2>/dev/null | head -20
      register: zfs_datasets
      changed_when: false
      failed_when: false

    # ---------- SMART disk status ----------
    # Note: empty output here means lsblk returned no physical disks or is
    # unavailable, not that no disks exist. The SMART loop below re-runs
    # lsblk independently.
    - name: List physical disks
      ansible.builtin.shell: >
        lsblk -d -o NAME,SIZE,MODEL,SERIAL 2>/dev/null
        | grep -v "loop\|sr"
      register: disk_list
      changed_when: false
      failed_when: false

    - name: Check SMART health for each disk
      ansible.builtin.shell: |
        failed=0
        results=""
        for disk in $(lsblk -d -n -o NAME 2>/dev/null | grep -v "loop\|sr"); do
          out=$(smartctl -H /dev/$disk 2>/dev/null | grep -E "SMART overall-health|result:")
          if echo "$out" | grep -qi "FAILED"; then
            failed=$((failed + 1))
            results="$results\n$disk: FAILED ($out)"
          else
            results="$results\n$disk: ${out:-SMART unavailable}"
          fi
        done
        # printf %b expands the \n separators portably; `echo -e` is not
        # POSIX and prints "-e" literally under dash (Debian's /bin/sh).
        printf '%b\n' "SMART failures: $failed$results"
      register: smart_status
      changed_when: false
      failed_when: false
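
    # smartctl -H reports "SMART overall-health self-assessment test
    # result: PASSED" for ATA/NVMe devices, which the grep above matches.
    # SCSI/SAS drives print "SMART Health Status: OK" instead, so the loop
    # reports them as "SMART unavailable".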

    # ---------- TrueNAS apps (k3s / midclt) ----------
    - name: TrueNAS app status
      ansible.builtin.shell: |
        # Prefer k3s pod status (the apps runtime on older SCALE releases),
        # then fall back to the middleware's chart.release.query API.
        out=$(k3s kubectl get pods -A --no-headers 2>/dev/null \
          | awk '{print $4}' | sort | uniq -c | sort -rn 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        out=$(midclt call chart.release.query 2>/dev/null \
          | python3 -c "
        import json,sys
        try:
            data = json.load(sys.stdin)
            [print(f'{a.get(\"id\",\"?\"):30} {a.get(\"status\",\"?\")}') for a in data]
        except Exception:
            pass
        " 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        echo "App runtime not detected"
      register: app_status
      changed_when: false
      failed_when: false
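
    # Docker-based SCALE releases (which dropped k3s and chart.release)
    # will land in the "App runtime not detected" branch. An untested
    # sketch of a third fallback for those systems:
    #   docker ps --format '{{.Names}}\t{{.Status}}' 2>/dev/null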

    # ---------- Summary ----------
    - name: TrueNAS health summary
      ansible.builtin.debug:
        msg: |
          ============================================================
          TrueNAS SCALE Health — {{ inventory_hostname }}
          ============================================================
          Version : {{ truenas_version.stdout | default('unknown') | trim }}
          Uptime  : {{ uptime_pretty.stdout | default('n/a') | trim }}
          --- ZFS Pool Status ---
          {{ zpool_status.stdout | default('unavailable') }}
          --- ZFS Pool List ---
          {{ zpool_list.stdout | default('unavailable') }}
          --- Pool Error Count ---
          {{ pool_errors.stdout | default('0') | trim }} degraded/faulted/offline/removed pool(s)
          --- ZFS Scrub / Scan Status ---
          {{ zpool_scrub.stdout | default('unavailable') }}
          --- Dataset Usage (top-level) ---
          {{ zfs_datasets.stdout | default('unavailable') }}
          --- Physical Disks ---
          {{ disk_list.stdout | default('unavailable') }}
          --- SMART Health ---
          {{ smart_status.stdout | default('unavailable') }}
          --- App Status ---
          {{ app_status.stdout | default('unavailable') }}
          ============================================================

    # ---------- JSON report ----------
    - name: Write TrueNAS health JSON report
      ansible.builtin.copy:
        content: "{{ report_data | to_nice_json }}"
        dest: "{{ report_dir }}/truenas_{{ ansible_date_time.date }}.json"
      vars:
        report_data:
          timestamp: "{{ ansible_date_time.iso8601 }}"
          host: "{{ inventory_hostname }}"
          truenas_version: "{{ truenas_version.stdout | default('unknown') | trim }}"
          uptime: "{{ uptime_pretty.stdout | default('n/a') | trim }}"
          zpool_status: "{{ zpool_status.stdout | default('') }}"
          zpool_list: "{{ zpool_list.stdout | default('') }}"
          pool_errors: "{{ pool_errors.stdout | default('0') | trim }}"
          zpool_scrub: "{{ zpool_scrub.stdout | default('') }}"
          zfs_datasets: "{{ zfs_datasets.stdout | default('') }}"
          disk_list: "{{ disk_list.stdout | default('') }}"
          smart_status: "{{ smart_status.stdout | default('') }}"
          app_status: "{{ app_status.stdout | default('') }}"
      delegate_to: localhost
      changed_when: false
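
    # The resulting controller-side report lands at e.g.
    # /tmp/health_reports/truenas_2026-04-20.json and is shaped roughly
    # like (field values illustrative):
    #   {
    #       "timestamp": "2026-04-20T01:30:00+00:00",
    #       "host": "truenas01",
    #       "truenas_version": "24.04.2",
    #       "pool_errors": "0",
    #       ...
    #   }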