Sanitized mirror from private repository - 2026-03-21 08:49:32 UTC
This commit is contained in:
410
docs/admin/maintenance.md
Normal file
410
docs/admin/maintenance.md
Normal file
@@ -0,0 +1,410 @@
|
||||
# 🔧 Maintenance Guide
|
||||
|
||||
## Overview
|
||||
|
||||
This guide covers routine maintenance tasks to keep the homelab running smoothly, including updates, cleanup, and health checks.
|
||||
|
||||
---
|
||||
|
||||
## 📅 Maintenance Schedule
|
||||
|
||||
### Daily (Automated)
|
||||
- [ ] Database backups
|
||||
- [ ] Log rotation
|
||||
- [ ] Container health checks
|
||||
- [ ] Certificate monitoring
|
||||
|
||||
### Weekly
|
||||
- [ ] Review container updates (Watchtower reports)
|
||||
- [ ] Check disk space across all hosts
|
||||
- [ ] Review monitoring alerts
|
||||
- [ ] Verify backup integrity
|
||||
|
||||
### Monthly
|
||||
- [ ] Apply container updates
|
||||
- [ ] DSM/Proxmox security updates
|
||||
- [ ] Review and prune unused Docker resources
|
||||
- [ ] Test backup restoration
|
||||
- [ ] Review access logs for anomalies
|
||||
|
||||
### Quarterly
|
||||
- [ ] Full system health audit
|
||||
- [ ] Review and update documentation
|
||||
- [ ] Capacity planning review
|
||||
- [ ] Security audit
|
||||
- [ ] Test disaster recovery procedures
|
||||
|
||||
---
|
||||
|
||||
## 🐳 Docker Maintenance
|
||||
|
||||
### Container Updates
|
||||
|
||||
```bash
|
||||
# Pull the latest version of every local image (this downloads updates; it does not only check for them)
|
||||
docker images --format "{{.Repository}}:{{.Tag}}" | while IFS= read -r img; do
|
||||
docker pull "$img" 2>/dev/null && echo "Updated: $img"
|
||||
done
|
||||
|
||||
# Or use Watchtower for automated updates (checks every Sunday at 4 AM)
|
||||
docker run -d \
|
||||
--name watchtower \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
containrrr/watchtower \
|
||||
--schedule "0 0 4 * * 0" \
|
||||
--cleanup
|
||||
```
|
||||
|
||||
### Prune Unused Resources
|
||||
|
||||
```bash
|
||||
# Remove stopped containers
|
||||
docker container prune -f
|
||||
|
||||
# Remove unused images
|
||||
docker image prune -a -f
|
||||
|
||||
# Remove unused volumes (CAREFUL!)
|
||||
docker volume prune -f
|
||||
|
||||
# Remove unused networks
|
||||
docker network prune -f
|
||||
|
||||
# All-in-one cleanup (CAREFUL: removes ALL unused images and volumes, not just dangling ones)
|
||||
docker system prune -a --volumes -f
|
||||
|
||||
# Check space recovered
|
||||
docker system df
|
||||
```
|
||||
|
||||
### Container Health Checks
|
||||
|
||||
```bash
|
||||
# Check all container statuses
|
||||
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
|
||||
|
||||
# Find unhealthy containers
|
||||
docker ps --filter "health=unhealthy"
|
||||
|
||||
# Restart unhealthy containers
|
||||
docker ps --filter "health=unhealthy" -q | xargs -r docker restart
|
||||
|
||||
# Check container logs for errors
|
||||
for c in $(docker ps -q); do
|
||||
echo "=== $(docker inspect --format '{{.Name}}' $c) ==="
|
||||
docker logs "$c" --tail 20 2>&1 | grep -i "error\|warn\|fail" || echo "No issues"
|
||||
done
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 💾 Storage Maintenance
|
||||
|
||||
### Disk Space Monitoring
|
||||
|
||||
```bash
|
||||
# Check disk usage on all volumes
|
||||
df -h | grep -E "^/dev|volume"
|
||||
|
||||
# Find large files
|
||||
find /volume1/docker -type f -size +1G -exec ls -lh {} \;
|
||||
|
||||
# Find old log files
|
||||
find /volume1 -name "*.log" -mtime +30 -size +100M
|
||||
|
||||
# Check Docker disk usage
|
||||
docker system df -v
|
||||
```
|
||||
|
||||
### Log Management
|
||||
|
||||
```bash
|
||||
# Truncate large container logs
|
||||
for log in $(find /var/lib/docker/containers -name "*-json.log" -size +100M); do
|
||||
echo "Truncating: $log"
|
||||
truncate -s 0 "$log"
|
||||
done
|
||||
|
||||
# Configure log rotation in docker-compose.yml (YAML, not a shell command)
|
||||
services:
|
||||
  myservice:
|
||||
    logging:
|
||||
      driver: "json-file"
|
||||
      options:
|
||||
        max-size: "10m"
|
||||
        max-file: "3"
|
||||
```
|
||||
|
||||
### Database Maintenance
|
||||
|
||||
```bash
|
||||
# PostgreSQL vacuum and analyze
|
||||
docker exec postgres psql -U postgres -c "VACUUM ANALYZE;"
|
||||
|
||||
# PostgreSQL reindex
|
||||
docker exec postgres psql -U postgres -c "REINDEX DATABASE postgres;"
|
||||
|
||||
# Check database size
|
||||
docker exec postgres psql -U postgres -c "
|
||||
SELECT pg_database.datname,
|
||||
pg_size_pretty(pg_database_size(pg_database.datname)) AS size
|
||||
FROM pg_database
|
||||
ORDER BY pg_database_size(pg_database.datname) DESC;"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🖥️ Synology Maintenance
|
||||
|
||||
### DSM Updates
|
||||
|
||||
```bash
|
||||
# Check for updates via CLI
|
||||
synoupgrade --check
|
||||
|
||||
# Or via DSM UI:
|
||||
# Control Panel > Update & Restore > DSM Update
|
||||
```
|
||||
|
||||
### Storage Health
|
||||
|
||||
```bash
|
||||
# Check RAID status
|
||||
cat /proc/mdstat
|
||||
|
||||
# Check disk health
|
||||
syno_hdd_util --all
|
||||
|
||||
# Check for bad sectors
|
||||
smartctl -a /dev/sda | grep -E "Reallocated|Current_Pending"
|
||||
```
|
||||
|
||||
### Package Updates
|
||||
|
||||
```bash
|
||||
# List installed packages
|
||||
synopkg list --name
|
||||
|
||||
# Update all packages
|
||||
synopkg update_all
|
||||
```
|
||||
|
||||
### Index Optimization
|
||||
|
||||
```bash
|
||||
# Rebuild media index (if slow)
|
||||
synoindex -R /volume1/media
|
||||
|
||||
# Or via DSM:
|
||||
# Control Panel > Indexing Service > Re-index
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🌐 Network Maintenance
|
||||
|
||||
### DNS Cache
|
||||
|
||||
```bash
|
||||
# Flush Pi-hole DNS cache
|
||||
docker exec pihole pihole restartdns
|
||||
|
||||
# Check DNS resolution
|
||||
dig @localhost google.com
|
||||
|
||||
# Check Pi-hole stats
|
||||
docker exec pihole pihole -c -e
|
||||
```
|
||||
|
||||
### Certificate Renewal
|
||||
|
||||
```bash
|
||||
# Check certificate expiry
|
||||
echo | openssl s_client -servername example.com -connect example.com:443 2>/dev/null | \
|
||||
openssl x509 -noout -dates
|
||||
|
||||
# Force Let's Encrypt renewal (NPM)
|
||||
# Login to NPM UI > SSL Certificates > Renew
|
||||
|
||||
# Wildcard cert renewal (if using DNS challenge)
|
||||
certbot renew --dns-cloudflare
|
||||
```
|
||||
|
||||
### Tailscale Maintenance
|
||||
|
||||
```bash
|
||||
# Check Tailscale status
|
||||
tailscale status
|
||||
|
||||
# Update Tailscale
|
||||
tailscale update
|
||||
|
||||
# Check for connectivity issues
|
||||
tailscale netcheck
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📊 Monitoring Maintenance
|
||||
|
||||
### Prometheus
|
||||
|
||||
```bash
|
||||
# Check Prometheus targets
|
||||
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
|
||||
|
||||
# Clean old data (if needed)
|
||||
# Prometheus auto-cleans based on retention settings
|
||||
|
||||
# Reload configuration
|
||||
curl -X POST http://localhost:9090/-/reload
|
||||
```
|
||||
|
||||
### Grafana
|
||||
|
||||
```bash
|
||||
# Backup Grafana dashboards (they are stored in Grafana's database; a SQLite file by default)
|
||||
docker cp grafana:/var/lib/grafana/grafana.db "./grafana-backup-$(date +%F).db"
|
||||
|
||||
# Check datasource health
|
||||
curl -s -u "admin:${GRAFANA_PASSWORD}" http://localhost:3000/api/datasources | jq '.[].name'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔄 Update Procedures
|
||||
|
||||
### Safe Update Process
|
||||
|
||||
```bash
|
||||
# 1. Check current state
|
||||
docker ps -a
|
||||
|
||||
# 2. Backup critical data
|
||||
./backup-script.sh
|
||||
|
||||
# 3. Pull new images
|
||||
docker-compose pull
|
||||
|
||||
# 4. Stop services gracefully
|
||||
docker-compose down
|
||||
|
||||
# 5. Start updated services
|
||||
docker-compose up -d
|
||||
|
||||
# 6. Verify health
|
||||
docker ps
|
||||
docker logs <container> --tail 50
|
||||
|
||||
# 7. Monitor for issues
|
||||
# Watch logs for 15-30 minutes
|
||||
```
|
||||
|
||||
### Rollback Procedure
|
||||
|
||||
```bash
|
||||
# If update fails, rollback:
|
||||
|
||||
# 1. Stop broken containers
|
||||
docker-compose down
|
||||
|
||||
# 2. Find previous image
|
||||
docker images | grep <service>
|
||||
|
||||
# 3. Update docker-compose.yml to use old tag
|
||||
# image: service:1.2.3 # Instead of :latest
|
||||
|
||||
# 4. Restart
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🧹 Cleanup Scripts
|
||||
|
||||
### Weekly Cleanup Script
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# weekly-cleanup.sh
|
||||
|
||||
echo "=== Weekly Maintenance $(date) ==="
|
||||
|
||||
# Docker cleanup
|
||||
echo "Cleaning Docker..."
|
||||
docker system prune -f
|
||||
docker volume prune -f  # CAREFUL: permanently deletes volumes not used by any container
|
||||
|
||||
# Log cleanup
|
||||
echo "Cleaning logs..."
|
||||
find /var/log -name "*.gz" -mtime +30 -delete
|
||||
find /volume1/docker -name "*.log" -size +100M -exec truncate -s 0 {} \;
|
||||
|
||||
# Temp file cleanup
|
||||
echo "Cleaning temp files..."
|
||||
find /tmp -type f -mtime +7 -delete 2>/dev/null
|
||||
|
||||
# Report disk space
|
||||
echo "Disk space:"
|
||||
df -h | grep volume
|
||||
|
||||
echo "=== Cleanup Complete ==="
|
||||
```
|
||||
|
||||
### Schedule with Cron
|
||||
|
||||
```bash
|
||||
# /etc/crontab
|
||||
# Weekly cleanup - Sundays at 3 AM
|
||||
0 3 * * 0 root /volume1/scripts/weekly-cleanup.sh >> /var/log/maintenance.log 2>&1
|
||||
|
||||
# Monthly maintenance - 1st of month at 2 AM
|
||||
0 2 1 * * root /volume1/scripts/monthly-maintenance.sh >> /var/log/maintenance.log 2>&1
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 📋 Maintenance Checklist Template
|
||||
|
||||
```markdown
|
||||
## Weekly Maintenance - [DATE]
|
||||
|
||||
### Pre-Maintenance
|
||||
- [ ] Notify family of potential downtime
|
||||
- [ ] Check current backups are recent
|
||||
- [ ] Review any open issues
|
||||
|
||||
### Docker
|
||||
- [ ] Review Watchtower update report
|
||||
- [ ] Check for unhealthy containers
|
||||
- [ ] Prune unused resources
|
||||
|
||||
### Storage
|
||||
- [ ] Check disk space (>20% free)
|
||||
- [ ] Review large files/logs
|
||||
- [ ] Verify RAID health
|
||||
|
||||
### Network
|
||||
- [ ] Check DNS resolution
|
||||
- [ ] Verify Tailscale connectivity
|
||||
- [ ] Check SSL certificates
|
||||
|
||||
### Monitoring
|
||||
- [ ] Review Prometheus alerts
|
||||
- [ ] Check Grafana dashboards
|
||||
- [ ] Verify Uptime Kuma status
|
||||
|
||||
### Post-Maintenance
|
||||
- [ ] Document any changes made
|
||||
- [ ] Update maintenance log
|
||||
- [ ] Test critical services
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🔗 Related Documentation
|
||||
|
||||
- [Backup Strategies](backup-strategies.md)
|
||||
- [Monitoring Setup](monitoring.md)
|
||||
- [Performance Troubleshooting](../troubleshooting/performance.md)
|
||||
- [Disaster Recovery](../troubleshooting/disaster-recovery.md)
|
||||
Reference in New Issue
Block a user