---
# TrueNAS SCALE health-check playbook.
#
# Collects system/ZFS/SMART/app status from each TrueNAS SCALE host,
# prints a human-readable summary, and writes a per-host JSON report
# on the Ansible controller. Every probe uses changed_when/failed_when
# overrides so the play is read-only and keeps going even when a tool
# (zpool, smartctl, k3s, midclt) is missing on a given host.
- name: TrueNAS SCALE Health Check
  hosts: truenas-scale
  gather_facts: true   # ansible_date_time is required by the JSON report task
  become: true         # zpool/smartctl/midclt need root

  vars:
    # Directory on the Ansible controller where JSON reports are written.
    report_dir: "/tmp/health_reports"

  tasks:

    # ---------- Report directory ----------
    - name: Ensure health report directory exists
      ansible.builtin.file:
        path: "{{ report_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true

    # ---------- System overview ----------
    - name: TrueNAS version
      # /etc/version exists on TrueNAS SCALE; otherwise fall back to the
      # middleware client (its stdout is the version string), and finally
      # to a placeholder so the summary always renders.
      ansible.builtin.shell: |
        if [ -f /etc/version ]; then
          cat /etc/version
        elif midclt call system.version 2>/dev/null; then
          true
        else
          echo "version unavailable"
        fi
      register: truenas_version
      changed_when: false
      failed_when: false

    - name: System uptime
      ansible.builtin.command: uptime -p
      register: uptime_pretty
      changed_when: false
      failed_when: false

    # ---------- ZFS pool health ----------
    - name: ZFS pool status (verbose)
      ansible.builtin.command: zpool status -v
      register: zpool_status
      changed_when: false
      failed_when: false

    - name: ZFS pool list with usage
      ansible.builtin.command: zpool list -H
      register: zpool_list
      changed_when: false
      failed_when: false

    - name: Count degraded or faulted pools
      # Counts "state:" lines reporting an unhealthy pool state; 0 means
      # every pool is ONLINE (or zpool is unavailable — stdout is then
      # empty and the assert below treats it as 0).
      ansible.builtin.shell: >
        zpool status 2>/dev/null
        | grep -E "state:\s*(DEGRADED|FAULTED|OFFLINE|REMOVED)"
        | wc -l
      register: pool_errors
      changed_when: false
      failed_when: false

    - name: Assert all ZFS pools are ONLINE
      ansible.builtin.assert:
        that:
          - pool_errors.stdout | trim | int == 0
        success_msg: "All ZFS pools ONLINE"
        fail_msg: "DEGRADED or FAULTED pool detected"
      # Keep the play running so the summary and JSON report still include
      # the failure details.
      ignore_errors: true

    # ---------- ZFS scrub status ----------
    - name: ZFS scrub/scan status per pool
      ansible.builtin.shell: |
        for pool in $(zpool list -H -o name 2>/dev/null); do
          echo "Pool: $pool"
          zpool status "$pool" 2>/dev/null | grep -E "scrub|scan" | head -3
          echo "---"
        done
      register: zpool_scrub
      changed_when: false
      failed_when: false

    # ---------- Dataset usage ----------
    - name: ZFS dataset usage (top-level, up to 20)
      ansible.builtin.shell: >
        zfs list -H -o name,used,avail,refer,mountpoint -d 1 2>/dev/null | head -20
      register: zfs_datasets
      changed_when: false
      failed_when: false

    # ---------- SMART disk status ----------
    # Note: empty output here means lsblk returned no physical disks or is
    # unavailable, not that no disks exist. The SMART loop below re-runs
    # lsblk independently.
    - name: List physical disks
      ansible.builtin.shell: >
        lsblk -d -o NAME,SIZE,MODEL,SERIAL 2>/dev/null
        | grep -v "loop\|sr"
      register: disk_list
      changed_when: false
      failed_when: false

    - name: Check SMART health for each disk
      # Iterates physical disks (excluding loop/optical devices) and tallies
      # SMART overall-health failures. printf '%b' (POSIX) expands the \n
      # sequences accumulated in $results — 'echo -e' is not portable when
      # /bin/sh is dash, as on Debian-based TrueNAS SCALE.
      ansible.builtin.shell: |
        failed=0
        results=""
        for disk in $(lsblk -d -n -o NAME 2>/dev/null | grep -v "loop\|sr"); do
          out=$(smartctl -H /dev/$disk 2>/dev/null | grep -E "SMART overall-health|result:")
          if echo "$out" | grep -qi "FAILED"; then
            failed=$((failed + 1))
            results="$results\n$disk: FAILED ($out)"
          else
            results="$results\n$disk: ${out:-SMART unavailable}"
          fi
        done
        printf 'SMART failures: %s%b\n' "$failed" "$results"
      register: smart_status
      changed_when: false
      failed_when: false

    # ---------- TrueNAS apps (k3s / midclt) ----------
    - name: TrueNAS app status
      # Prefer k3s (pod status histogram); fall back to the middleware
      # chart.release.query API formatted by an inline python3 snippet.
      ansible.builtin.shell: |
        out=$(k3s kubectl get pods -A --no-headers 2>/dev/null \
          | awk '{print $4}' | sort | uniq -c | sort -rn 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        out=$(midclt call chart.release.query 2>/dev/null \
          | python3 -c "
        import json,sys
        try:
            data = json.load(sys.stdin)
            [print(f'{a.get(\"id\",\"?\"):30} {a.get(\"status\",\"?\")}') for a in data]
        except Exception:
            pass
        " 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        echo "App runtime not detected"
      register: app_status
      changed_when: false
      failed_when: false

    # ---------- Summary ----------
    - name: TrueNAS health summary
      ansible.builtin.debug:
        msg: |
          ============================================================
          TrueNAS SCALE Health — {{ inventory_hostname }}
          ============================================================
          Version : {{ truenas_version.stdout | default('unknown') | trim }}
          Uptime  : {{ uptime_pretty.stdout | default('n/a') | trim }}

          --- ZFS Pool Status ---
          {{ zpool_status.stdout | default('unavailable') }}

          --- ZFS Pool List ---
          {{ zpool_list.stdout | default('unavailable') }}

          --- Pool Error Count ---
          {{ pool_errors.stdout | default('0') | trim }} degraded/faulted/offline/removed pool(s)

          --- ZFS Scrub / Scan Status ---
          {{ zpool_scrub.stdout | default('unavailable') }}

          --- Dataset Usage (top-level) ---
          {{ zfs_datasets.stdout | default('unavailable') }}

          --- Physical Disks ---
          {{ disk_list.stdout | default('unavailable') }}

          --- SMART Health ---
          {{ smart_status.stdout | default('unavailable') }}

          --- App Status ---
          {{ app_status.stdout | default('unavailable') }}
          ============================================================

    # ---------- JSON report ----------
    - name: Write TrueNAS health JSON report
      ansible.builtin.copy:
        content: "{{ report_data | to_nice_json }}"
        dest: "{{ report_dir }}/truenas_{{ ansible_date_time.date }}.json"
      vars:
        report_data:
          timestamp: "{{ ansible_date_time.iso8601 }}"
          host: "{{ inventory_hostname }}"
          truenas_version: "{{ truenas_version.stdout | default('unknown') | trim }}"
          uptime: "{{ uptime_pretty.stdout | default('n/a') | trim }}"
          zpool_status: "{{ zpool_status.stdout | default('') }}"
          zpool_list: "{{ zpool_list.stdout | default('') }}"
          pool_errors: "{{ pool_errors.stdout | default('0') | trim }}"
          zpool_scrub: "{{ zpool_scrub.stdout | default('') }}"
          zfs_datasets: "{{ zfs_datasets.stdout | default('') }}"
          disk_list: "{{ disk_list.stdout | default('') }}"
          smart_status: "{{ smart_status.stdout | default('') }}"
          app_status: "{{ app_status.stdout | default('') }}"
      delegate_to: localhost
      # NOTE(review): suppresses "changed" reporting even when the report file
      # is rewritten — presumably deliberate so the health check is always
      # green; confirm before removing.
      changed_when: false