v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test

- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate, monitor, notify, report, assets, setup) gated behind logging.level: "debug" - Add /api/debug/dump endpoint returning full controller state JSON (debug only) - Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub, restic repos, metrics DB) with pass/warn/fail summary - New packages: internal/selftest, internal/util - Constructor/signature changes: debug bool params, logger params on RunHealthCheck and BuildReport, smart watchdog probe logging Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 18:32:26 +01:00
parent 6f02536243
commit be7803c0ac
30 changed files with 1281 additions and 67 deletions
@@ -2,6 +2,7 @@ package monitor

 import (
 	"fmt"
+	"log"
 	"os"
 	"os/exec"
 	"strings"
@@ -22,26 +23,44 @@ type HealthReport struct {
 }

 // RunHealthCheck runs system checks and returns a diagnostic report.
-func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath) *HealthReport {
+func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath, logger *log.Logger) *HealthReport {
 	report := &HealthReport{
 		Status:    "ok",
 		Timestamp: time.Now(),
 	}

+	debug := cfg.Logging.Level == "debug" && logger != nil
+
 	hddPath := cfg.Paths.HDDPath
 	if len(storagePaths) > 0 {
 		hddPath = storagePaths[0].Path
 	}
 	sysInfo := system.GetInfo(hddPath, cpuCollector)

+	if debug {
+		logger.Printf("[DEBUG] [HEALTH] Raw values: disk=%.1f%%, hdd=%.1f%% (configured=%v), mem=%.1f%% (%dMB/%dMB), cpu=%.1f%%, temp=%.1f°C (%s)",
+			sysInfo.DiskPercent, sysInfo.HDDPercent, sysInfo.HDDConfigured,
+			sysInfo.MemPercent, sysInfo.UsedMemMB, sysInfo.TotalMemMB,
+			sysInfo.CPUPercent, sysInfo.TemperatureCelsius, sysInfo.TemperatureSource)
+	}
+
 	// 1. Disk usage (SSD)
 	if sysInfo.DiskPercent > 0 {
 		if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
 			report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", sysInfo.DiskPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] SSD disk: CRITICAL (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskCritPercent)
+			}
 		} else if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
 			report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", sysInfo.DiskPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] SSD disk: WARN (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskWarnPercent)
+			}
 		} else {
 			report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", sysInfo.DiskPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] SSD disk: OK (%.0f%%)", sysInfo.DiskPercent)
+			}
 		}
 	}

@@ -58,8 +77,14 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
 	if sysInfo.MemPercent > 0 {
 		if sysInfo.MemPercent >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent) {
 			report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", sysInfo.MemPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] Memory: WARN (%.0f%% >= %d%%)", sysInfo.MemPercent, cfg.Monitoring.Thresholds.MemoryWarnPercent)
+			}
 		} else {
 			report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", sysInfo.MemPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] Memory: OK (%.0f%%)", sysInfo.MemPercent)
+			}
 		}
 	}

@@ -67,8 +92,14 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
 	if sysInfo.CPUPercent > 0 {
 		if sysInfo.CPUPercent >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent) {
 			report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", sysInfo.CPUPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] CPU: WARN (%.0f%% >= %d%%)", sysInfo.CPUPercent, cfg.Monitoring.Thresholds.CPUWarnPercent)
+			}
 		} else {
 			report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", sysInfo.CPUPercent))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] CPU: OK (%.0f%%)", sysInfo.CPUPercent)
+			}
 		}
 	}

@@ -76,23 +107,45 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
 	if sysInfo.TemperatureCelsius > 0 {
 		if sysInfo.TemperatureCelsius >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius) {
 			report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", sysInfo.TemperatureCelsius, sysInfo.TemperatureSource))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] Temperature: WARN (%.0f°C >= %d°C)", sysInfo.TemperatureCelsius, cfg.Monitoring.Thresholds.TemperatureWarnCelsius)
+			}
 		} else {
 			report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", sysInfo.TemperatureCelsius))
+			if debug {
+				logger.Printf("[DEBUG] [HEALTH] Temperature: OK (%.0f°C)", sysInfo.TemperatureCelsius)
+			}
 		}
 	}

 	// 5. Docker health
 	if err := checkDocker(); err != nil {
 		report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", err))
+		if debug {
+			logger.Printf("[DEBUG] [HEALTH] Docker daemon: FAIL (%v)", err)
+		}
 	} else {
 		report.Info = append(report.Info, "Docker: reachable")
+		if debug {
+			logger.Printf("[DEBUG] [HEALTH] Docker daemon: OK")
+		}
 	}

 	// 6. Protected containers
+	if debug {
+		logger.Printf("[DEBUG] [HEALTH] Checking %d protected containers: %v", len(cfg.Stacks.Protected), cfg.Stacks.Protected)
+	}
 	missingProtected := checkProtectedContainers(cfg.Stacks.Protected)
 	for _, name := range missingProtected {
 		report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name))
 	}
+	if debug {
+		if len(missingProtected) > 0 {
+			logger.Printf("[DEBUG] [HEALTH] Protected containers missing: %v", missingProtected)
+		} else {
+			logger.Printf("[DEBUG] [HEALTH] All protected containers running")
+		}
+	}

 	// 7. Storage paths
 	storageIssues, storageWarnings := checkStoragePaths(storagePaths)
@@ -106,6 +159,11 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora
 		report.Status = "warn"
 	}

+	if debug {
+		logger.Printf("[DEBUG] [HEALTH] Final status: %s (issues=%d, warnings=%d, info=%d)",
+			report.Status, len(report.Issues), len(report.Warnings), len(report.Info))
+	}
+
 	return report
 }

@@ -58,6 +58,11 @@ type pathProbeState struct {
 	lastStatus          string // "connected", "disconnected"
 	lastProbeTime       time.Time
 	probeInterval       time.Duration
+	// Debug counters for summary logging
+	probeCount      int
+	probeOKCount    int
+	lastSummaryTime time.Time
+	totalLatency    time.Duration
 }

 // StorageWatchdog monitors registered storage paths and reacts to disconnection/reconnection.
@@ -95,6 +100,9 @@ func NewStorageWatchdog(
 	}
 }

+// isDebug returns true if the logging level is set to "debug".
+func (w *StorageWatchdog) isDebug() bool { return w.cfg.Logging.Level == "debug" }
+
 // SetAlertRefresh sets the callback to trigger alert refresh.
 func (w *StorageWatchdog) SetAlertRefresh(fn func()) {
 	w.alertRefresh = fn
@@ -164,17 +172,45 @@ func (w *StorageWatchdog) getOrCreateState(path string) *pathProbeState {

 // handleConnectedProbe probes a connected drive and triggers disconnect if needed.
 func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *pathProbeState) {
+	probeStart := time.Now()
 	result := system.ProbeStoragePath(sp.Path)
+	probeLatency := time.Since(probeStart)
+
+	if w.isDebug() {
+		state.probeCount++
+		state.totalLatency += probeLatency
+	}
+
 	if result.Status == system.ProbeConnected {
 		if state.consecutiveFailures > 0 {
 			w.logger.Printf("[DEBUG] [STORAGE] Probe recovered for %s after %d failures", sp.Path, state.consecutiveFailures)
 		}
 		state.consecutiveFailures = 0
 		state.lastStatus = "connected"
+		if w.isDebug() {
+			state.probeOKCount++
+			// Every 60 probes (~5 minutes at 5s interval): emit summary
+			if state.probeCount >= 60 {
+				avgLatency := state.totalLatency / time.Duration(state.probeCount)
+				w.logger.Printf("[DEBUG] [STORAGE] Storage watchdog: %s — %d/%d probes OK (last 5m, avg %dms)",
+					sp.Path, state.probeOKCount, state.probeCount, avgLatency.Milliseconds())
+				state.probeCount = 0
+				state.probeOKCount = 0
+				state.totalLatency = 0
+				state.lastSummaryTime = time.Now()
+			}
+		}
 		return
 	}

 	state.consecutiveFailures++
+
+	// Debug: log immediately on unexpected failure (was connected, now failing)
+	if w.isDebug() && state.lastStatus == "connected" {
+		w.logger.Printf("[DEBUG] [STORAGE] Storage probe failed for %s (%d/%d before disconnect): %v",
+			sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
+	}
+
 	w.logger.Printf("[WARN] [STORAGE] Probe failed for %s (%d/%d): %v",
 		sp.Path, state.consecutiveFailures, probeThreshold, result.Err)

@@ -239,6 +275,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
 		return
 	}

+	if w.isDebug() {
+		w.logger.Printf("[DEBUG] [STORAGE] Reconnect check for %s: UUID=%s, mountPath=%s, isAttachWizard=%v",
+			sp.Path, uuid, mountPath, isAttachWizard)
+	}
+
 	// Check if the UUID block device is present
 	uuidPath := filepath.Join(hostDevUUIDPath, uuid)
 	if _, err := os.Stat(uuidPath); err != nil {
@@ -251,6 +292,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
 	}
 	w.logger.Printf("[INFO] [STORAGE] Drive reconnected (UUID found), attempting remount: %s (%s)", sp.Path, label)

+	if w.isDebug() {
+		w.logger.Printf("[DEBUG] [STORAGE] UUID %s found at %s, mounting %s (raw=%s, attachWizard=%v)",
+			uuid, uuidPath, sp.Path, rawPath, isAttachWizard)
+	}
+
 	// Attempt remount
 	if err := w.remount(sp.Path, rawPath, isAttachWizard); err != nil {
 		w.logger.Printf("[ERROR] [STORAGE] Remount failed for %s: %v", sp.Path, err)
@@ -261,9 +307,17 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
 	verifyResult := system.ProbeStoragePath(sp.Path)
 	if verifyResult.Status != system.ProbeConnected {
 		w.logger.Printf("[ERROR] [STORAGE] Post-remount probe failed for %s: %v", sp.Path, verifyResult.Err)
+		if w.isDebug() {
+			w.logger.Printf("[DEBUG] [STORAGE] Post-mount verification failed for %s: status=%v, err=%v",
+				sp.Path, verifyResult.Status, verifyResult.Err)
+		}
 		return
 	}

+	if w.isDebug() {
+		w.logger.Printf("[DEBUG] [STORAGE] Post-mount verification succeeded for %s", sp.Path)
+	}
+
 	w.logger.Printf("[INFO] [STORAGE] Drive successfully remounted: %s (%s)", sp.Path, label)

 	// Clean stale restic locks