Sanitized mirror from private repository - 2026-04-06 21:14:57 UTC
This commit is contained in:
350
docs/troubleshooting/diagnostics.md
Normal file
350
docs/troubleshooting/diagnostics.md
Normal file
@@ -0,0 +1,350 @@
|
||||
# Diagnostic Tools and Procedures
|
||||
|
||||
This guide covers tools and procedures for diagnosing issues in the homelab infrastructure.
|
||||
|
||||
## Quick Diagnostic Checklist
|
||||
|
||||
### 1. Service Health Check
|
||||
```bash
|
||||
# Check if service is running
|
||||
docker ps | grep service-name
|
||||
|
||||
# Check service logs
|
||||
docker logs service-name --tail 50 -f
|
||||
|
||||
# Check resource usage
|
||||
docker stats service-name
|
||||
```
|
||||
|
||||
### 2. Network Connectivity
|
||||
```bash
|
||||
# Test basic connectivity
|
||||
ping target-host
|
||||
|
||||
# Test specific port
|
||||
telnet target-host port
|
||||
# or
|
||||
nc -zv target-host port
|
||||
|
||||
# Check DNS resolution
|
||||
nslookup domain-name
|
||||
dig domain-name
|
||||
```
|
||||
|
||||
### 3. Storage and Disk Space
|
||||
```bash
|
||||
# Check disk usage
|
||||
df -h
|
||||
|
||||
# Check specific volume usage
|
||||
du -sh /volume1/docker/
|
||||
|
||||
# Check inode usage
|
||||
df -i
|
||||
```
|
||||
|
||||
## Host-Specific Diagnostics
|
||||
|
||||
### Synology NAS (Atlantis/Calypso/Setillo)
|
||||
|
||||
#### System Health
|
||||
```bash
|
||||
# SSH to Synology
|
||||
ssh admin@atlantis.vish.local
|
||||
|
||||
# Check system status
|
||||
# WARNING: do not run `syno_poweroff_task -d` as a status check - it stops services and unmounts volumes
uptime
|
||||
cat /proc/uptime
|
||||
|
||||
# Check storage health
|
||||
cat /proc/mdstat
|
||||
smartctl -a /dev/sda
|
||||
```
|
||||
|
||||
#### Docker Issues
|
||||
```bash
|
||||
# Check Docker daemon
|
||||
sudo systemctl status docker
|
||||
|
||||
# Check available space for Docker
|
||||
df -h /volume2/@docker
|
||||
|
||||
# Restart Docker daemon (if needed)
|
||||
sudo systemctl restart docker
|
||||
```
|
||||
|
||||
### Proxmox VMs
|
||||
|
||||
#### VM Health Check
|
||||
```bash
|
||||
# On Proxmox host
|
||||
qm list
|
||||
qm status VM-ID
|
||||
|
||||
# Check VM resources
|
||||
qm config VM-ID
|
||||
```
|
||||
|
||||
#### Inside VM Diagnostics
|
||||
```bash
|
||||
# Check system resources
|
||||
htop
|
||||
free -h
|
||||
iostat -x 1
|
||||
|
||||
# Check Docker health
|
||||
docker system df
|
||||
# Note: `docker system prune` has no --dry-run flag; use the verbose df output to see what is reclaimable
docker system df -v
|
||||
```
|
||||
|
||||
### Physical Hosts (Anubis/Guava/Concord NUC)
|
||||
|
||||
#### Hardware Diagnostics
|
||||
```bash
|
||||
# Check CPU temperature
|
||||
sensors
|
||||
|
||||
# Check memory
|
||||
free -h
|
||||
cat /proc/meminfo
|
||||
|
||||
# Check disk health
|
||||
smartctl -a /dev/sda
|
||||
```
|
||||
|
||||
## Service-Specific Diagnostics
|
||||
|
||||
### Portainer Issues
|
||||
```bash
|
||||
# Check Portainer logs
|
||||
docker logs portainer
|
||||
|
||||
# Verify API connectivity
|
||||
curl -k https://portainer-host:9443/api/system/status
|
||||
|
||||
# Check endpoint connectivity
|
||||
curl -k -H "X-API-Key: YOUR_API_KEY" https://portainer-host:9443/api/endpoints
|
||||
```
|
||||
|
||||
### Monitoring Stack (Prometheus/Grafana)
|
||||
```bash
|
||||
# Check Prometheus targets
|
||||
curl http://prometheus-host:9090/api/v1/targets
|
||||
|
||||
# Check Grafana health
|
||||
curl http://grafana-host:3000/api/health
|
||||
|
||||
# Verify data source connectivity
|
||||
curl -H "Authorization: Bearer YOUR_API_TOKEN" http://grafana-host:3000/api/datasources
|
||||
```
|
||||
|
||||
### Media Stack (Plex/Arr Suite)
|
||||
```bash
|
||||
# Check Plex transcoding
|
||||
tail -f /config/Library/Application\ Support/Plex\ Media\ Server/Logs/Plex\ Media\ Server.log
|
||||
|
||||
# Check arr app logs
|
||||
docker logs sonarr --tail 100
|
||||
docker logs radarr --tail 100
|
||||
|
||||
# Check download client connectivity
|
||||
curl "http://sabnzbd-host:8080/api?mode=version"
|
||||
```
|
||||
|
||||
## Network Diagnostics
|
||||
|
||||
### Internal Network Issues
|
||||
```bash
|
||||
# Check routing table
|
||||
ip route show
|
||||
|
||||
# Check network interfaces
|
||||
ip addr show
|
||||
|
||||
# Test inter-host connectivity
|
||||
ping -c 4 other-host.local
|
||||
```
|
||||
|
||||
### External Access Issues
|
||||
```bash
|
||||
# Check port forwarding
|
||||
nmap -p PORT external-ip
|
||||
|
||||
# Test from outside network
|
||||
curl -I https://your-domain.com
|
||||
|
||||
# Check DNS propagation
|
||||
dig your-domain.com @8.8.8.8
|
||||
```
|
||||
|
||||
### VPN Diagnostics
|
||||
```bash
|
||||
# Wireguard status
|
||||
wg show
|
||||
|
||||
# Tailscale status
|
||||
tailscale status
|
||||
tailscale ping other-device
|
||||
```
|
||||
|
||||
## Performance Diagnostics
|
||||
|
||||
### System Performance
|
||||
```bash
|
||||
# CPU usage over time
|
||||
sar -u 1 10
|
||||
|
||||
# Memory usage patterns
|
||||
sar -r 1 10
|
||||
|
||||
# Disk I/O patterns
|
||||
iotop -a
|
||||
|
||||
# Network usage
|
||||
iftop
|
||||
```
|
||||
|
||||
### Docker Performance
|
||||
```bash
|
||||
# Container resource usage
|
||||
docker stats --no-stream
|
||||
|
||||
# Check for resource limits
|
||||
docker inspect --format 'Memory={{.HostConfig.Memory}} NanoCpus={{.HostConfig.NanoCpus}} CpuShares={{.HostConfig.CpuShares}}' container-name
|
||||
|
||||
# Analyze container logs for errors
|
||||
docker logs container-name 2>&1 | grep -i error
|
||||
```
|
||||
|
||||
## Database Diagnostics
|
||||
|
||||
### PostgreSQL
|
||||
```bash
|
||||
# Connect to database (run the SQL statements below at the psql prompt, not in the shell)
|
||||
docker exec -it postgres-container psql -U username -d database
|
||||
|
||||
# Check database size
|
||||
SELECT pg_size_pretty(pg_database_size('database_name'));
|
||||
|
||||
# Check active connections
|
||||
SELECT count(*) FROM pg_stat_activity;
|
||||
|
||||
# Check slow queries
|
||||
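# Requires the pg_stat_statements extension; on PostgreSQL 12 and earlier the column is mean_time
SELECT query, mean_exec_time, calls FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 10;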
|
||||
```
|
||||
|
||||
### Redis
|
||||
```bash
|
||||
# Connect to Redis (run the commands below at the redis-cli prompt, not in the shell)
|
||||
docker exec -it redis-container redis-cli
|
||||
|
||||
# Check memory usage
|
||||
INFO memory
|
||||
|
||||
# Check connected clients
|
||||
INFO clients
|
||||
|
||||
# Monitor commands (streams every command the server processes and reduces throughput; stop with Ctrl-C)
|
||||
MONITOR
|
||||
```
|
||||
|
||||
## Log Analysis
|
||||
|
||||
### Centralized Logging
|
||||
```bash
|
||||
# Search logs with grep
|
||||
grep -r "error" /var/log/
|
||||
|
||||
# Use journalctl for systemd services
|
||||
journalctl -u docker.service -f
|
||||
|
||||
# Analyze Docker logs
|
||||
docker logs --since="1h" container-name 2>&1 | grep ERROR
|
||||
```
|
||||
|
||||
### Log Rotation Issues
|
||||
```bash
|
||||
# Check log sizes
|
||||
find /var/log -name "*.log" -exec ls -lh {} \; | sort -k5 -hr
|
||||
|
||||
# Check logrotate configuration
|
||||
cat /etc/logrotate.conf
|
||||
ls -la /etc/logrotate.d/
|
||||
```
|
||||
|
||||
## Automated Diagnostics
|
||||
|
||||
### Health Check Scripts
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Basic health check script
|
||||
|
||||
echo "=== System Health Check ==="
|
||||
echo "Uptime: $(uptime)"
|
||||
echo "Disk Usage:"
|
||||
df -h | grep -E "(/$|/volume)"
|
||||
echo "Memory Usage:"
|
||||
free -h
|
||||
echo "Docker Status:"
|
||||
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
|
||||
```
|
||||
|
||||
### Monitoring Integration
|
||||
- Use Grafana dashboards for visual diagnostics
|
||||
- Set up Prometheus alerts for proactive monitoring
|
||||
- Configure ntfy notifications for critical issues
|
||||
|
||||
## Common Diagnostic Scenarios
|
||||
|
||||
### Service Won't Start
|
||||
1. Check Docker daemon status
|
||||
2. Verify compose file syntax
|
||||
3. Check port conflicts
|
||||
4. Verify volume mounts exist
|
||||
5. Check resource availability
|
||||
|
||||
### Slow Performance
|
||||
1. Check CPU/memory usage
|
||||
2. Analyze disk I/O patterns
|
||||
3. Check network latency
|
||||
4. Review container resource limits
|
||||
5. Analyze application logs
|
||||
|
||||
### Network Connectivity Issues
|
||||
1. Test basic ping connectivity
|
||||
2. Check port accessibility
|
||||
3. Verify DNS resolution
|
||||
4. Check firewall rules
|
||||
5. Test VPN connectivity
|
||||
|
||||
### Storage Issues
|
||||
1. Check disk space availability
|
||||
2. Verify mount points
|
||||
3. Check file permissions
|
||||
4. Test disk health with SMART
|
||||
5. Review storage performance
|
||||
|
||||
## Emergency Diagnostic Commands
|
||||
|
||||
Quick commands for emergency situations:
|
||||
|
||||
```bash
|
||||
# System overview
|
||||
htop
|
||||
|
||||
# Network connections
|
||||
ss -tulpn
|
||||
|
||||
# Disk usage by directory
|
||||
du -sh /* | sort -hr
|
||||
|
||||
# Recent system messages
|
||||
dmesg | tail -20
|
||||
|
||||
# Docker system overview
|
||||
docker system df && docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Size}}"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*For specific service troubleshooting, see individual service documentation in `docs/services/individual/`*
|
||||
Reference in New Issue
Block a user