Sanitized mirror from private repository - 2026-03-18 10:31:50 UTC
ansible/playbooks/truenas_health.yml (new file)

---
- name: TrueNAS SCALE Health Check
  hosts: truenas-scale
  gather_facts: true
  become: true

  vars:
    report_dir: "/tmp/health_reports"

  tasks:

    # ---------- Report directory ----------
    - name: Ensure health report directory exists
      ansible.builtin.file:
        path: "{{ report_dir }}"
        state: directory
        mode: '0755'
      delegate_to: localhost
      run_once: true
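    # Note: delegate_to + run_once means this directory is created once, on the
    # Ansible control node, which is also where the JSON report task at the end
    # of this play writes its output.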

    # ---------- System overview ----------
    - name: TrueNAS version
      ansible.builtin.shell: |
        if [ -f /etc/version ]; then
          cat /etc/version
        elif midclt call system.version 2>/dev/null; then
          true  # midclt already printed the version to stdout
        else
          echo "version unavailable"
        fi
      register: truenas_version
      changed_when: false
      failed_when: false

    - name: System uptime
      ansible.builtin.command: uptime -p
      register: uptime_pretty
      changed_when: false
      failed_when: false

    # ---------- ZFS pool health ----------
    - name: ZFS pool status (verbose)
      ansible.builtin.command: zpool status -v
      register: zpool_status
      changed_when: false
      failed_when: false

    - name: ZFS pool list with usage
      ansible.builtin.command: zpool list -H
      register: zpool_list
      changed_when: false
      failed_when: false

    - name: Count degraded or faulted pools
      ansible.builtin.shell: >
        zpool status 2>/dev/null
        | grep -E "state:[[:space:]]*(DEGRADED|FAULTED|OFFLINE|REMOVED)"
        | wc -l
      register: pool_errors
      changed_when: false
      failed_when: false

    - name: Assert all ZFS pools are ONLINE
      ansible.builtin.assert:
        that:
          - pool_errors.stdout | trim | int == 0
        success_msg: "All ZFS pools ONLINE"
        fail_msg: "DEGRADED or FAULTED pool detected"
      ignore_errors: true
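    # The pattern above matches the per-pool "state:" line in zpool status.
    # A pool in the UNAVAIL state would not be counted, so treat the assert
    # as a coarse check rather than an exhaustive one.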

    # ---------- ZFS scrub status ----------
    - name: ZFS scrub/scan status per pool
      ansible.builtin.shell: |
        for pool in $(zpool list -H -o name 2>/dev/null); do
          echo "Pool: $pool"
          zpool status "$pool" 2>/dev/null | grep -E "scrub|scan" | head -3
          echo "---"
        done
      register: zpool_scrub
      changed_when: false
      failed_when: false

    # ---------- Dataset usage ----------
    - name: ZFS dataset usage (top-level, up to 20)
      ansible.builtin.shell: >
        zfs list -H -o name,used,avail,refer,mountpoint -d 1 2>/dev/null | head -20
      register: zfs_datasets
      changed_when: false
      failed_when: false

    # ---------- SMART disk status ----------
    # Note: empty output here means lsblk returned no physical disks or is
    # unavailable, not that no disks exist. The SMART loop below re-runs
    # lsblk independently.
    - name: List physical disks
      ansible.builtin.shell: >
        lsblk -d -o NAME,SIZE,MODEL,SERIAL 2>/dev/null
        | grep -v "loop\|sr"
      register: disk_list
      changed_when: false
      failed_when: false

    - name: Check SMART health for each disk
      ansible.builtin.shell: |
        failed=0
        results=""
        for disk in $(lsblk -d -n -o NAME 2>/dev/null | grep -v "loop\|sr"); do
          out=$(smartctl -H "/dev/$disk" 2>/dev/null | grep -E "SMART overall-health|result:")
          if echo "$out" | grep -qi "FAILED"; then
            failed=$((failed + 1))
            results="$results\n$disk: FAILED ($out)"
          else
            results="$results\n$disk: ${out:-SMART unavailable}"
          fi
        done
        # printf instead of "echo -e": /bin/sh on SCALE is dash, where echo
        # prints the -e flag literally. %b expands the \n escapes in $results.
        printf "SMART failures: %s%b\n" "$failed" "$results"
      register: smart_status
      changed_when: false
      failed_when: false
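    # smartctl -H can print nothing for disks behind USB bridges or RAID
    # controllers unless a -d device-type option is supplied; such disks show
    # up above as "SMART unavailable" rather than as failures.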

    # ---------- TrueNAS apps (k3s / midclt) ----------
    - name: TrueNAS app status
      ansible.builtin.shell: |
        out=$(k3s kubectl get pods -A --no-headers 2>/dev/null \
          | awk '{print $4}' | sort | uniq -c | sort -rn 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        out=$(midclt call chart.release.query 2>/dev/null \
          | python3 -c "
        import json,sys
        try:
            data = json.load(sys.stdin)
            [print(f'{a.get(\"id\",\"?\"):30} {a.get(\"status\",\"?\")}') for a in data]
        except Exception:
            pass
        " 2>/dev/null)
        if [ -n "$out" ]; then
          echo "$out"
          exit 0
        fi
        echo "App runtime not detected"
      register: app_status
      changed_when: false
      failed_when: false
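    # k3s backs the app catalog on older SCALE releases; 24.10 and later moved
    # apps to Docker, so on those systems both probes can miss and the fallback
    # "App runtime not detected" line is what gets registered.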

    # ---------- Summary ----------
    - name: TrueNAS health summary
      ansible.builtin.debug:
        msg: |
          ============================================================
          TrueNAS SCALE Health - {{ inventory_hostname }}
          ============================================================
          Version : {{ truenas_version.stdout | default('unknown') | trim }}
          Uptime  : {{ uptime_pretty.stdout | default('n/a') | trim }}

          --- ZFS Pool Status ---
          {{ zpool_status.stdout | default('unavailable') }}

          --- ZFS Pool List ---
          {{ zpool_list.stdout | default('unavailable') }}

          --- Pool Error Count ---
          {{ pool_errors.stdout | default('0') | trim }} degraded/faulted/offline/removed pool(s)

          --- ZFS Scrub / Scan Status ---
          {{ zpool_scrub.stdout | default('unavailable') }}

          --- Dataset Usage (top-level) ---
          {{ zfs_datasets.stdout | default('unavailable') }}

          --- Physical Disks ---
          {{ disk_list.stdout | default('unavailable') }}

          --- SMART Health ---
          {{ smart_status.stdout | default('unavailable') }}

          --- App Status ---
          {{ app_status.stdout | default('unavailable') }}
          ============================================================

    # ---------- JSON report ----------
    - name: Write TrueNAS health JSON report
      ansible.builtin.copy:
        content: "{{ report_data | to_nice_json }}"
        dest: "{{ report_dir }}/truenas_{{ inventory_hostname }}_{{ ansible_date_time.date }}.json"
      vars:
        report_data:
          timestamp: "{{ ansible_date_time.iso8601 }}"
          host: "{{ inventory_hostname }}"
          truenas_version: "{{ truenas_version.stdout | default('unknown') | trim }}"
          uptime: "{{ uptime_pretty.stdout | default('n/a') | trim }}"
          zpool_status: "{{ zpool_status.stdout | default('') }}"
          zpool_list: "{{ zpool_list.stdout | default('') }}"
          pool_errors: "{{ pool_errors.stdout | default('0') | trim }}"
          zpool_scrub: "{{ zpool_scrub.stdout | default('') }}"
          zfs_datasets: "{{ zfs_datasets.stdout | default('') }}"
          disk_list: "{{ disk_list.stdout | default('') }}"
          smart_status: "{{ smart_status.stdout | default('') }}"
          app_status: "{{ app_status.stdout | default('') }}"
      delegate_to: localhost
      changed_when: false
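
# Usage (a minimal sketch; the inventory file name and hostname below are
# hypothetical, not part of this repository):
#
#   # inventory.ini
#   [truenas-scale]
#   nas01.example.lan
#
#   ansible-playbook -i inventory.ini ansible/playbooks/truenas_health.yml
#
# Add -K (--ask-become-pass) if privilege escalation needs a password. The
# JSON report is written on the control node under /tmp/health_reports/.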