# 🔧 Maintenance Guide
|
|
|
|
## Overview
|
|
|
|
This guide covers routine maintenance tasks to keep the homelab running smoothly, including updates, cleanup, and health checks.
|
|
|
|
---
|
|
|
|
## 📅 Maintenance Schedule
|
|
|
|
### Daily (Automated)
|
|
- [ ] Database backups
|
|
- [ ] Log rotation
|
|
- [ ] Container health checks
|
|
- [ ] Certificate monitoring
|
|
|
|
### Weekly
|
|
- [ ] Review container updates (Watchtower reports)
|
|
- [ ] Check disk space across all hosts
|
|
- [ ] Review monitoring alerts
|
|
- [ ] Verify backup integrity
|
|
|
|
### Monthly
|
|
- [ ] Apply container updates
|
|
- [ ] DSM/Proxmox security updates
|
|
- [ ] Review and prune unused Docker resources
|
|
- [ ] Test backup restoration
|
|
- [ ] Review access logs for anomalies
|
|
|
|
### Quarterly
|
|
- [ ] Full system health audit
|
|
- [ ] Review and update documentation
|
|
- [ ] Capacity planning review
|
|
- [ ] Security audit
|
|
- [ ] Test disaster recovery procedures
|
|
|
|
---
|
|
|
|
## 🐳 Docker Maintenance
|
|
|
|
### Container Updates
|
|
|
|
```bash
|
|
# Check for available updates
|
|
# Pull every locally tagged image and report only those that actually changed.
# Dangling "<none>" entries are skipped because they cannot be pulled.
docker images --format '{{.Repository}}:{{.Tag}}' |
  grep -v '<none>' |
  while IFS= read -r img; do
    # "docker pull" reports "Downloaded newer image" only when a new version
    # was fetched; an already-current image reports "Image is up to date".
    if docker pull "$img" 2>/dev/null | grep -q 'Downloaded newer image'; then
      echo "Updated: $img"
    fi
  done
|
|
|
|
# Or use Watchtower for automated updates.
# Watchtower expects a 6-field cron expression (seconds first), so
# "0 0 4 * * 0" means Sundays at 4 AM. Do not put comments after a trailing
# backslash: that breaks the line continuation and drops the remaining flags.
docker run -d \
  --name watchtower \
  -v /var/run/docker.sock:/var/run/docker.sock \
  containrrr/watchtower \
  --schedule "0 0 4 * * 0" \
  --cleanup
|
|
```
|
|
|
|
### Prune Unused Resources
|
|
|
|
```bash
|
|
# Remove stopped containers
|
|
docker container prune -f
|
|
|
|
# Remove unused images
|
|
docker image prune -a -f
|
|
|
|
# Remove unused volumes (CAREFUL!)
|
|
docker volume prune -f
|
|
|
|
# Remove unused networks
|
|
docker network prune -f
|
|
|
|
# All-in-one cleanup
|
|
docker system prune -a --volumes -f
|
|
|
|
# Check space recovered
|
|
docker system df
|
|
```
|
|
|
|
### Container Health Checks
|
|
|
|
```bash
|
|
# Check all container statuses
|
|
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
|
|
|
|
# Find unhealthy containers
|
|
docker ps --filter "health=unhealthy"
|
|
|
|
# Restart unhealthy containers
|
|
docker ps --filter "health=unhealthy" -q | xargs -r docker restart
|
|
|
|
# Check container logs for errors
|
|
for c in $(docker ps -q); do
|
|
echo "=== $(docker inspect --format '{{.Name}}' $c) ==="
|
|
docker logs "$c" --tail 20 2>&1 | grep -i "error\|warn\|fail" || echo "No issues"
|
|
done
|
|
```
|
|
|
|
---
|
|
|
|
## 💾 Storage Maintenance
|
|
|
|
### Disk Space Monitoring
|
|
|
|
```bash
|
|
# Check disk usage on all volumes
|
|
df -h | grep -E "^/dev|volume"
|
|
|
|
# Find large files
|
|
find /volume1/docker -type f -size +1G -exec ls -lh {} \;
|
|
|
|
# Find old log files
|
|
find /volume1 -name "*.log" -mtime +30 -size +100M
|
|
|
|
# Check Docker disk usage
|
|
docker system df -v
|
|
```
|
|
|
|
### Log Management
|
|
|
|
```bash
|
|
# Truncate large container logs
|
|
# Stream NUL-delimited paths from find so file names are never word-split or
# glob-expanded by the shell; -type f restricts the match to regular files.
find /var/lib/docker/containers -type f -name '*-json.log' -size +100M -print0 |
  while IFS= read -r -d '' log; do
    echo "Truncating: $log"
    truncate -s 0 "$log"
  done
|
|
|
|
```

Configure log rotation in `docker-compose.yml`:

```yaml
services:
  myservice:
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
|
|
```
|
|
|
|
### Database Maintenance
|
|
|
|
```bash
|
|
# PostgreSQL vacuum and analyze
|
|
docker exec postgres psql -U postgres -c "VACUUM ANALYZE;"
|
|
|
|
# PostgreSQL reindex
|
|
docker exec postgres psql -U postgres -c "REINDEX DATABASE postgres;"
|
|
|
|
# Check database size
|
|
docker exec postgres psql -U postgres -c "
|
|
SELECT pg_database.datname,
|
|
pg_size_pretty(pg_database_size(pg_database.datname)) AS size
|
|
FROM pg_database
|
|
ORDER BY pg_database_size(pg_database.datname) DESC;"
|
|
```
|
|
|
|
---
|
|
|
|
## 🖥️ Synology Maintenance
|
|
|
|
### DSM Updates
|
|
|
|
```bash
|
|
# Check for updates via CLI
|
|
synoupgrade --check
|
|
|
|
# Or via DSM UI:
|
|
# Control Panel > Update & Restore > DSM Update
|
|
```
|
|
|
|
### Storage Health
|
|
|
|
```bash
|
|
# Check RAID status
|
|
cat /proc/mdstat
|
|
|
|
# Check disk health
|
|
syno_hdd_util --all
|
|
|
|
# Check for bad sectors
|
|
smartctl -a /dev/sda | grep -E "Reallocated|Current_Pending"
|
|
```
|
|
|
|
### Package Updates
|
|
|
|
```bash
|
|
# List installed packages
|
|
synopkg list --name
|
|
|
|
# Update all packages
|
|
synopkg update_all
|
|
```
|
|
|
|
### Index Optimization
|
|
|
|
```bash
|
|
# Rebuild media index (if slow)
|
|
synoindex -R /volume1/media
|
|
|
|
# Or via DSM:
|
|
# Control Panel > Indexing Service > Re-index
|
|
```
|
|
|
|
---
|
|
|
|
## 🌐 Network Maintenance
|
|
|
|
### DNS Cache
|
|
|
|
```bash
|
|
# Flush Pi-hole DNS cache
|
|
docker exec pihole pihole restartdns
|
|
|
|
# Check DNS resolution
|
|
dig @localhost google.com
|
|
|
|
# Check Pi-hole stats
|
|
docker exec pihole pihole -c -e
|
|
```
|
|
|
|
### Certificate Renewal
|
|
|
|
```bash
|
|
# Check certificate expiry
|
|
echo | openssl s_client -servername example.com -connect example.com:443 2>/dev/null | \
|
|
openssl x509 -noout -dates
|
|
|
|
# Force Let's Encrypt renewal (NPM)
|
|
# Login to NPM UI > SSL Certificates > Renew
|
|
|
|
# Wildcard cert renewal (if using DNS challenge)
|
|
certbot renew --dns-cloudflare
|
|
```
|
|
|
|
### Tailscale Maintenance
|
|
|
|
```bash
|
|
# Check Tailscale status
|
|
tailscale status
|
|
|
|
# Update Tailscale
|
|
tailscale update
|
|
|
|
# Check for connectivity issues
|
|
tailscale netcheck
|
|
```
|
|
|
|
---
|
|
|
|
## 📊 Monitoring Maintenance
|
|
|
|
### Prometheus
|
|
|
|
```bash
|
|
# Check Prometheus targets
|
|
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health: .health}'
|
|
|
|
# Clean old data (if needed)
|
|
# Prometheus auto-cleans based on retention settings
|
|
|
|
# Reload configuration
|
|
curl -X POST http://localhost:9090/-/reload
|
|
```
|
|
|
|
### Grafana
|
|
|
|
```bash
|
|
# Backup Grafana dashboards: export one JSON file per dashboard through the
# HTTP API (the search endpoint lists dashboards, the uid endpoint returns
# the full dashboard model).
mkdir -p ./grafana-dashboards-backup
curl -fsS -u "admin:${GRAFANA_PASSWORD}" "http://localhost:3000/api/search?type=dash-db" |
  jq -r '.[].uid' |
  while IFS= read -r uid; do
    curl -fsS -u "admin:${GRAFANA_PASSWORD}" \
      "http://localhost:3000/api/dashboards/uid/${uid}" \
      > "./grafana-dashboards-backup/${uid}.json"
  done
|
|
|
|
# Check datasource health (lists the configured datasource names).
# Credentials go through -u rather than the URL so that passwords containing
# characters such as "@", ":" or "/" do not corrupt the request URL.
curl -s -u "admin:${GRAFANA_PASSWORD}" http://localhost:3000/api/datasources | jq '.[].name'
|
|
```
|
|
|
|
---
|
|
|
|
## 🔄 Update Procedures
|
|
|
|
### Safe Update Process
|
|
|
|
```bash
|
|
# 1. Check current state
|
|
docker ps -a
|
|
|
|
# 2. Backup critical data
|
|
./backup-script.sh
|
|
|
|
# 3. Pull new images
|
|
docker-compose pull
|
|
|
|
# 4. Stop services gracefully
|
|
docker-compose down
|
|
|
|
# 5. Start updated services
|
|
docker-compose up -d
|
|
|
|
# 6. Verify health
|
|
docker ps
|
|
docker logs <container> --tail 50
|
|
|
|
# 7. Monitor for issues
|
|
# Watch logs for 15-30 minutes
|
|
```
|
|
|
|
### Rollback Procedure
|
|
|
|
```bash
|
|
# If update fails, rollback:
|
|
|
|
# 1. Stop broken containers
|
|
docker-compose down
|
|
|
|
# 2. Find previous image
|
|
docker images | grep <service>
|
|
|
|
# 3. Update docker-compose.yml to use old tag
|
|
# image: service:1.2.3 # Instead of :latest
|
|
|
|
# 4. Restart
|
|
docker-compose up -d
|
|
```
|
|
|
|
---
|
|
|
|
## 🧹 Cleanup Scripts
|
|
|
|
### Weekly Cleanup Script
|
|
|
|
```bash
|
|
#!/bin/bash
# weekly-cleanup.sh
#
# Weekly housekeeping: prunes unused Docker resources, removes old compressed
# logs, empties oversized application logs, deletes stale temp files, and
# prints the remaining disk space for the /volume* mounts.
# Output goes to stdout/stderr so the caller (e.g. cron) can redirect it.

echo "=== Weekly Maintenance $(date) ==="

# Docker cleanup
echo "Cleaning Docker..."
docker system prune -f
# NOTE(review): this deletes volumes that no container currently references.
# Confirm no needed data lives in a volume whose container has been removed.
docker volume prune -f

# Log cleanup
echo "Cleaning logs..."
# -type f keeps both commands on regular files only, so a directory or
# symlink that happens to match the name pattern is never deleted/truncated.
find /var/log -type f -name "*.gz" -mtime +30 -delete
find /volume1/docker -type f -name "*.log" -size +100M -exec truncate -s 0 {} \;

# Temp file cleanup
echo "Cleaning temp files..."
# Errors are discarded on purpose: files in /tmp may vanish or be unreadable
# while find is walking the tree.
find /tmp -type f -mtime +7 -delete 2>/dev/null

# Report disk space
echo "Disk space:"
df -h | grep volume

echo "=== Cleanup Complete ==="
|
|
```
|
|
|
|
### Schedule with Cron
|
|
|
|
```bash
|
|
# /etc/crontab
|
|
# Weekly cleanup - Sundays at 3 AM
|
|
0 3 * * 0 root /volume1/scripts/weekly-cleanup.sh >> /var/log/maintenance.log 2>&1
|
|
|
|
# Monthly maintenance - 1st of month at 2 AM
|
|
0 2 1 * * root /volume1/scripts/monthly-maintenance.sh >> /var/log/maintenance.log 2>&1
|
|
```
|
|
|
|
---
|
|
|
|
## 📋 Maintenance Checklist Template
|
|
|
|
```markdown
|
|
## Weekly Maintenance - [DATE]
|
|
|
|
### Pre-Maintenance
|
|
- [ ] Notify family of potential downtime
|
|
- [ ] Check current backups are recent
|
|
- [ ] Review any open issues
|
|
|
|
### Docker
|
|
- [ ] Review Watchtower update report
|
|
- [ ] Check for unhealthy containers
|
|
- [ ] Prune unused resources
|
|
|
|
### Storage
|
|
- [ ] Check disk space (>20% free)
|
|
- [ ] Review large files/logs
|
|
- [ ] Verify RAID health
|
|
|
|
### Network
|
|
- [ ] Check DNS resolution
|
|
- [ ] Verify Tailscale connectivity
|
|
- [ ] Check SSL certificates
|
|
|
|
### Monitoring
|
|
- [ ] Review Prometheus alerts
|
|
- [ ] Check Grafana dashboards
|
|
- [ ] Verify Uptime Kuma status
|
|
|
|
### Post-Maintenance
|
|
- [ ] Document any changes made
|
|
- [ ] Update maintenance log
|
|
- [ ] Test critical services
|
|
```
|
|
|
|
---
|
|
|
|
## 🔗 Related Documentation
|
|
|
|
- [Backup Strategies](backup-strategies.md)
|
|
- [Monitoring Setup](monitoring.md)
|
|
- [Performance Troubleshooting](../troubleshooting/performance.md)
|
|
- [Disaster Recovery](../troubleshooting/disaster-recovery.md)
|