homelab-optimized/ansible/playbooks/truenas_health.yml

---
- name: TrueNAS SCALE Health Check
  hosts: truenas-scale
  gather_facts: true
  become: true

  vars:
    report_dir: "/tmp/health_reports"
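
  # Hypothetical inventory sketch for context -- the group name matches
  # `hosts` above, but the host and connection details are illustrative:
  #
  #   [truenas-scale]
  #   nas.example.lan ansible_user=admin
  #
  # Invocation would then look something like:
  #   ansible-playbook -i inventory playbooks/truenas_health.yml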

  tasks:
    # ---------- Report directory ----------
    - name: Ensure health report directory exists
      ansible.builtin.file:
        path: "{{ report_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true
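
    # The directory above is created on the Ansible controller
    # (delegate_to: localhost) and only once per run (run_once), since
    # every host's JSON report is written there at the end of the play.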

    # ---------- System overview ----------
    # Version detection: prefer /etc/version; otherwise midclt, whose
    # stdout surfaces from the elif test itself (hence the no-op body).
    - name: TrueNAS version
      ansible.builtin.shell: |
        if [ -f /etc/version ]; then
          cat /etc/version
        elif midclt call system.version 2>/dev/null; then
          true
        else
          echo "version unavailable"
        fi
      register: truenas_version
      changed_when: false
      failed_when: false

    - name: System uptime
      ansible.builtin.command: uptime -p
      register: uptime_pretty
      changed_when: false
      failed_when: false
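
    # `uptime -p` prints a human-readable duration, e.g. "up 2 weeks,
    # 3 days, 5 hours" (illustrative output).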

    # ---------- ZFS pool health ----------
    - name: ZFS pool status (verbose)
      ansible.builtin.command: zpool status -v
      register: zpool_status
      changed_when: false
      failed_when: false

    - name: ZFS pool list with usage
      ansible.builtin.command: zpool list -H
      register: zpool_list
      changed_when: false
      failed_when: false

    - name: Count degraded or faulted pools
      ansible.builtin.shell: >
        zpool status 2>/dev/null
        | grep -E "state:\s*(DEGRADED|FAULTED|OFFLINE|REMOVED)"
        | wc -l
      register: pool_errors
      changed_when: false
      failed_when: false

    - name: Assert all ZFS pools are ONLINE
      ansible.builtin.assert:
        that:
          - pool_errors.stdout | trim | int == 0
        success_msg: "All ZFS pools ONLINE"
        fail_msg: "DEGRADED, FAULTED, OFFLINE, or REMOVED pool detected"
      ignore_errors: true
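
    # For reference, the grep in the count above keys on the per-pool
    # state line of `zpool status` output, e.g. "state: ONLINE" or
    # "state: DEGRADED".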

    # ---------- ZFS scrub status ----------
    - name: ZFS scrub/scan status per pool
      ansible.builtin.shell: |
        for pool in $(zpool list -H -o name 2>/dev/null); do
          echo "Pool: $pool"
          zpool status "$pool" 2>/dev/null | grep -E "scrub|scan" | head -3
          echo "---"
        done
      register: zpool_scrub
      changed_when: false
      failed_when: false
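
    # A healthy pool's matched line looks roughly like (illustrative):
    #   scan: scrub repaired 0B in 00:42:17 with 0 errors on Sun Apr 19 03:45:01 2026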

    # ---------- Dataset usage ----------
    # -d 1 limits recursion to depth 1: pool roots and their direct children.
    - name: ZFS dataset usage (top-level, up to 20)
      ansible.builtin.shell: >
        zfs list -H -o name,used,avail,refer,mountpoint -d 1 2>/dev/null | head -20
      register: zfs_datasets
      changed_when: false
      failed_when: false

    # ---------- SMART disk status ----------
    # Note: empty output here means lsblk returned no physical disks or is
    # unavailable, not that no disks exist. The SMART loop below re-runs
    # lsblk independently.
    - name: List physical disks
      ansible.builtin.shell: >
        lsblk -d -o NAME,SIZE,MODEL,SERIAL 2>/dev/null
        | grep -v "loop\|sr"
      register: disk_list
      changed_when: false
      failed_when: false

    - name: Check SMART health for each disk
      ansible.builtin.shell: |
        failed=0
        results=""
        for disk in $(lsblk -d -n -o NAME 2>/dev/null | grep -v "loop\|sr"); do
          out=$(smartctl -H /dev/$disk 2>/dev/null | grep -E "SMART overall-health|result:")
          if echo "$out" | grep -qi "FAILED"; then
            failed=$((failed + 1))
            results="$results\n$disk: FAILED ($out)"
          else
            results="$results\n$disk: ${out:-SMART unavailable}"
          fi
        done
        # printf %b expands the \n separators portably; `echo -e` is not
        # POSIX and prints "-e" literally under dash (Debian's /bin/sh).
        printf '%b\n' "SMART failures: $failed$results"
      register: smart_status
      changed_when: false
      failed_when: false
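
    # smartctl -H reports "SMART overall-health self-assessment test
    # result: PASSED" for ATA/NVMe devices, which the grep above matches.
    # SCSI/SAS drives print "SMART Health Status: OK" instead, so the loop
    # reports them as "SMART unavailable".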

    # ---------- TrueNAS apps (k3s / midclt) ----------
    - name: TrueNAS app status
      ansible.builtin.shell: |
        # Prefer k3s pod status (the apps runtime on older SCALE releases),
        # then fall back to the middleware's chart.release.query API.
        out=$(k3s kubectl get pods -A --no-headers 2>/dev/null \
          | awk '{print $4}' | sort | uniq -c | sort -rn 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        out=$(midclt call chart.release.query 2>/dev/null \
          | python3 -c "
        import json,sys
        try:
            data = json.load(sys.stdin)
            [print(f'{a.get(\"id\",\"?\"):30} {a.get(\"status\",\"?\")}') for a in data]
        except Exception:
            pass
        " 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        echo "App runtime not detected"
      register: app_status
      changed_when: false
      failed_when: false
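
    # Docker-based SCALE releases (which dropped k3s and chart.release)
    # will land in the "App runtime not detected" branch. An untested
    # sketch of a third fallback for those systems:
    #   docker ps --format '{{.Names}}\t{{.Status}}' 2>/dev/null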

    # ---------- Summary ----------
    - name: TrueNAS health summary
      ansible.builtin.debug:
        msg: |
          ============================================================
          TrueNAS SCALE Health — {{ inventory_hostname }}
          ============================================================
          Version : {{ truenas_version.stdout | default('unknown') | trim }}
          Uptime  : {{ uptime_pretty.stdout | default('n/a') | trim }}
          --- ZFS Pool Status ---
          {{ zpool_status.stdout | default('unavailable') }}
          --- ZFS Pool List ---
          {{ zpool_list.stdout | default('unavailable') }}
          --- Pool Error Count ---
          {{ pool_errors.stdout | default('0') | trim }} degraded/faulted/offline/removed pool(s)
          --- ZFS Scrub / Scan Status ---
          {{ zpool_scrub.stdout | default('unavailable') }}
          --- Dataset Usage (top-level) ---
          {{ zfs_datasets.stdout | default('unavailable') }}
          --- Physical Disks ---
          {{ disk_list.stdout | default('unavailable') }}
          --- SMART Health ---
          {{ smart_status.stdout | default('unavailable') }}
          --- App Status ---
          {{ app_status.stdout | default('unavailable') }}
          ============================================================

    # ---------- JSON report ----------
    - name: Write TrueNAS health JSON report
      ansible.builtin.copy:
        content: "{{ report_data | to_nice_json }}"
        dest: "{{ report_dir }}/truenas_{{ ansible_date_time.date }}.json"
      vars:
        report_data:
          timestamp: "{{ ansible_date_time.iso8601 }}"
          host: "{{ inventory_hostname }}"
          truenas_version: "{{ truenas_version.stdout | default('unknown') | trim }}"
          uptime: "{{ uptime_pretty.stdout | default('n/a') | trim }}"
          zpool_status: "{{ zpool_status.stdout | default('') }}"
          zpool_list: "{{ zpool_list.stdout | default('') }}"
          pool_errors: "{{ pool_errors.stdout | default('0') | trim }}"
          zpool_scrub: "{{ zpool_scrub.stdout | default('') }}"
          zfs_datasets: "{{ zfs_datasets.stdout | default('') }}"
          disk_list: "{{ disk_list.stdout | default('') }}"
          smart_status: "{{ smart_status.stdout | default('') }}"
          app_status: "{{ app_status.stdout | default('') }}"
      delegate_to: localhost
      changed_when: false
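
    # The resulting controller-side report lands at e.g.
    # /tmp/health_reports/truenas_2026-04-20.json and is shaped roughly
    # like (field values illustrative):
    #   {
    #       "timestamp": "2026-04-20T01:30:00+00:00",
    #       "host": "truenas01",
    #       "truenas_version": "24.04.2",
    #       "pool_errors": "0",
    #       ...
    #   }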