v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test
- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate, monitor, notify, report, assets, setup) gated behind logging.level: "debug" - Add /api/debug/dump endpoint returning full controller state JSON (debug only) - Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub, restic repos, metrics DB) with pass/warn/fail summary - New packages: internal/selftest, internal/util - Constructor/signature changes: debug bool params, logger params on RunHealthCheck and BuildReport, smart watchdog probe logging Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@ package monitor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
@@ -22,26 +23,44 @@ type HealthReport struct {
|
||||
}
|
||||
|
||||
// RunHealthCheck runs system checks and returns a diagnostic report.
|
||||
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath) *HealthReport {
|
||||
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath, logger *log.Logger) *HealthReport {
|
||||
report := &HealthReport{
|
||||
Status: "ok",
|
||||
Timestamp: time.Now(),
|
||||
}
|
||||
|
||||
debug := cfg.Logging.Level == "debug" && logger != nil
|
||||
|
||||
hddPath := cfg.Paths.HDDPath
|
||||
if len(storagePaths) > 0 {
|
||||
hddPath = storagePaths[0].Path
|
||||
}
|
||||
sysInfo := system.GetInfo(hddPath, cpuCollector)
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Raw values: disk=%.1f%%, hdd=%.1f%% (configured=%v), mem=%.1f%% (%dMB/%dMB), cpu=%.1f%%, temp=%.1f°C (%s)",
|
||||
sysInfo.DiskPercent, sysInfo.HDDPercent, sysInfo.HDDConfigured,
|
||||
sysInfo.MemPercent, sysInfo.UsedMemMB, sysInfo.TotalMemMB,
|
||||
sysInfo.CPUPercent, sysInfo.TemperatureCelsius, sysInfo.TemperatureSource)
|
||||
}
|
||||
|
||||
// 1. Disk usage (SSD)
|
||||
if sysInfo.DiskPercent > 0 {
|
||||
if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
|
||||
report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", sysInfo.DiskPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] SSD disk: CRITICAL (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskCritPercent)
|
||||
}
|
||||
} else if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
|
||||
report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", sysInfo.DiskPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] SSD disk: WARN (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskWarnPercent)
|
||||
}
|
||||
} else {
|
||||
report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", sysInfo.DiskPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] SSD disk: OK (%.0f%%)", sysInfo.DiskPercent)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,8 +77,14 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
|
||||
if sysInfo.MemPercent > 0 {
|
||||
if sysInfo.MemPercent >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent) {
|
||||
report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", sysInfo.MemPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Memory: WARN (%.0f%% >= %d%%)", sysInfo.MemPercent, cfg.Monitoring.Thresholds.MemoryWarnPercent)
|
||||
}
|
||||
} else {
|
||||
report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", sysInfo.MemPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Memory: OK (%.0f%%)", sysInfo.MemPercent)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,8 +92,14 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
|
||||
if sysInfo.CPUPercent > 0 {
|
||||
if sysInfo.CPUPercent >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent) {
|
||||
report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", sysInfo.CPUPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] CPU: WARN (%.0f%% >= %d%%)", sysInfo.CPUPercent, cfg.Monitoring.Thresholds.CPUWarnPercent)
|
||||
}
|
||||
} else {
|
||||
report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", sysInfo.CPUPercent))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] CPU: OK (%.0f%%)", sysInfo.CPUPercent)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,23 +107,45 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
|
||||
if sysInfo.TemperatureCelsius > 0 {
|
||||
if sysInfo.TemperatureCelsius >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius) {
|
||||
report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", sysInfo.TemperatureCelsius, sysInfo.TemperatureSource))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Temperature: WARN (%.0f°C >= %d°C)", sysInfo.TemperatureCelsius, cfg.Monitoring.Thresholds.TemperatureWarnCelsius)
|
||||
}
|
||||
} else {
|
||||
report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", sysInfo.TemperatureCelsius))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Temperature: OK (%.0f°C)", sysInfo.TemperatureCelsius)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Docker health
|
||||
if err := checkDocker(); err != nil {
|
||||
report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", err))
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Docker daemon: FAIL (%v)", err)
|
||||
}
|
||||
} else {
|
||||
report.Info = append(report.Info, "Docker: reachable")
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Docker daemon: OK")
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Protected containers
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Checking %d protected containers: %v", len(cfg.Stacks.Protected), cfg.Stacks.Protected)
|
||||
}
|
||||
missingProtected := checkProtectedContainers(cfg.Stacks.Protected)
|
||||
for _, name := range missingProtected {
|
||||
report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name))
|
||||
}
|
||||
if debug {
|
||||
if len(missingProtected) > 0 {
|
||||
logger.Printf("[DEBUG] [HEALTH] Protected containers missing: %v", missingProtected)
|
||||
} else {
|
||||
logger.Printf("[DEBUG] [HEALTH] All protected containers running")
|
||||
}
|
||||
}
|
||||
|
||||
// 7. Storage paths
|
||||
storageIssues, storageWarnings := checkStoragePaths(storagePaths)
|
||||
@@ -106,6 +159,11 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
|
||||
report.Status = "warn"
|
||||
}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] [HEALTH] Final status: %s (issues=%d, warnings=%d, info=%d)",
|
||||
report.Status, len(report.Issues), len(report.Warnings), len(report.Info))
|
||||
}
|
||||
|
||||
return report
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user