v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test
- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate, monitor, notify, report, assets, setup) gated behind logging.level: "debug"
- Add /api/debug/dump endpoint returning full controller state JSON (debug only)
- Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub, restic repos, metrics DB) with pass/warn/fail summary
- New packages: internal/selftest, internal/util
- Constructor/signature changes: debug bool params, logger params on RunHealthCheck and BuildReport, smart watchdog probe logging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -58,6 +58,11 @@ type pathProbeState struct {
|
||||
lastStatus string // "connected", "disconnected"
|
||||
lastProbeTime time.Time
|
||||
probeInterval time.Duration
|
||||
// Debug counters for summary logging
|
||||
probeCount int
|
||||
probeOKCount int
|
||||
lastSummaryTime time.Time
|
||||
totalLatency time.Duration
|
||||
}
|
||||
|
||||
// StorageWatchdog monitors registered storage paths and reacts to disconnection/reconnection.
|
||||
@@ -95,6 +100,9 @@ func NewStorageWatchdog(
|
||||
}
|
||||
}
|
||||
|
||||
// isDebug returns true if the logging level is set to "debug".
|
||||
func (w *StorageWatchdog) isDebug() bool {
	// Debug output is on only when the configured level is exactly "debug";
	// the comparison is case-sensitive and any other value disables it.
	level := w.cfg.Logging.Level
	return level == "debug"
}
|
||||
|
||||
// SetAlertRefresh sets the callback to trigger alert refresh.
|
||||
func (w *StorageWatchdog) SetAlertRefresh(fn func()) {
|
||||
w.alertRefresh = fn
|
||||
@@ -164,17 +172,45 @@ func (w *StorageWatchdog) getOrCreateState(path string) *pathProbeState {
|
||||
|
||||
// handleConnectedProbe probes a connected drive and triggers disconnect if needed.
|
||||
func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *pathProbeState) {
|
||||
probeStart := time.Now()
|
||||
result := system.ProbeStoragePath(sp.Path)
|
||||
probeLatency := time.Since(probeStart)
|
||||
|
||||
if w.isDebug() {
|
||||
state.probeCount++
|
||||
state.totalLatency += probeLatency
|
||||
}
|
||||
|
||||
if result.Status == system.ProbeConnected {
|
||||
if state.consecutiveFailures > 0 {
|
||||
w.logger.Printf("[DEBUG] [STORAGE] Probe recovered for %s after %d failures", sp.Path, state.consecutiveFailures)
|
||||
}
|
||||
state.consecutiveFailures = 0
|
||||
state.lastStatus = "connected"
|
||||
if w.isDebug() {
|
||||
state.probeOKCount++
|
||||
// Every 60 probes (~5 minutes at 5s interval): emit summary
|
||||
if state.probeCount >= 60 {
|
||||
avgLatency := state.totalLatency / time.Duration(state.probeCount)
|
||||
w.logger.Printf("[DEBUG] [STORAGE] Storage watchdog: %s — %d/%d probes OK (last 5m, avg %dms)",
|
||||
sp.Path, state.probeOKCount, state.probeCount, avgLatency.Milliseconds())
|
||||
state.probeCount = 0
|
||||
state.probeOKCount = 0
|
||||
state.totalLatency = 0
|
||||
state.lastSummaryTime = time.Now()
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
state.consecutiveFailures++
|
||||
|
||||
// Debug: log immediately on unexpected failure (was connected, now failing)
|
||||
if w.isDebug() && state.lastStatus == "connected" {
|
||||
w.logger.Printf("[DEBUG] [STORAGE] Storage probe failed for %s (%d/%d before disconnect): %v",
|
||||
sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
|
||||
}
|
||||
|
||||
w.logger.Printf("[WARN] [STORAGE] Probe failed for %s (%d/%d): %v",
|
||||
sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
|
||||
|
||||
@@ -239,6 +275,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
|
||||
return
|
||||
}
|
||||
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [STORAGE] Reconnect check for %s: UUID=%s, mountPath=%s, isAttachWizard=%v",
|
||||
sp.Path, uuid, mountPath, isAttachWizard)
|
||||
}
|
||||
|
||||
// Check if the UUID block device is present
|
||||
uuidPath := filepath.Join(hostDevUUIDPath, uuid)
|
||||
if _, err := os.Stat(uuidPath); err != nil {
|
||||
@@ -251,6 +292,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
|
||||
}
|
||||
w.logger.Printf("[INFO] [STORAGE] Drive reconnected (UUID found), attempting remount: %s (%s)", sp.Path, label)
|
||||
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [STORAGE] UUID %s found at %s, mounting %s (raw=%s, attachWizard=%v)",
|
||||
uuid, uuidPath, sp.Path, rawPath, isAttachWizard)
|
||||
}
|
||||
|
||||
// Attempt remount
|
||||
if err := w.remount(sp.Path, rawPath, isAttachWizard); err != nil {
|
||||
w.logger.Printf("[ERROR] [STORAGE] Remount failed for %s: %v", sp.Path, err)
|
||||
@@ -261,9 +307,17 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
|
||||
verifyResult := system.ProbeStoragePath(sp.Path)
|
||||
if verifyResult.Status != system.ProbeConnected {
|
||||
w.logger.Printf("[ERROR] [STORAGE] Post-remount probe failed for %s: %v", sp.Path, verifyResult.Err)
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [STORAGE] Post-mount verification failed for %s: status=%v, err=%v",
|
||||
sp.Path, verifyResult.Status, verifyResult.Err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [STORAGE] Post-mount verification succeeded for %s", sp.Path)
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [STORAGE] Drive successfully remounted: %s (%s)", sp.Path, label)
|
||||
|
||||
// Clean stale restic locks
|
||||
|
||||
Reference in New Issue
Block a user