package monitor

import (
	"context"
	"fmt"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)

const (
	// probeThreshold is the number of consecutive probe failures before declaring disconnected.
	probeThreshold = 3

	// defaultProbeInterval is the normal probe interval for connected drives.
	defaultProbeInterval = 5 * time.Second

	// disconnectedProbeInterval is the slower probe interval for disconnected drives
	// (checking for UUID reappearance, not I/O probing).
	disconnectedProbeInterval = 30 * time.Second

	// hostFstabPath is where the host's fstab is mounted inside the container.
	hostFstabPath = "/host-fstab"

	// hostDevUUIDPath is where the host's /dev/disk/by-uuid is accessible.
	hostDevUUIDPath = "/host-dev/disk/by-uuid"

	// primaryResticSubpath is the relative path to the primary restic repo under a drive.
	primaryResticSubpath = "backups/primary/restic"
)

// WatchdogStackInfo holds minimal stack info for the watchdog.
type WatchdogStackInfo struct {
	// Name identifies the stack; it is the value passed back to the
	// WatchdogStackProvider methods.
	Name string
}

// WatchdogStackProvider provides stack operations needed by the watchdog.
// Defined here to avoid circular imports with the backup package.
type WatchdogStackProvider interface {
	// ListDeployedStacks returns the currently deployed stacks.
	ListDeployedStacks() []WatchdogStackInfo
	// GetStackHDDPath returns the HDD path configured for the named stack.
	// The watchdog skips stacks for which this returns an empty string.
	GetStackHDDPath(name string) string
	// StopStack stops the named stack.
	StopStack(name string) error
	// StartStack starts the named stack.
	StartStack(name string) error
}

// pathProbeState tracks in-memory probe state for a single storage path.
type pathProbeState struct {
	// consecutiveFailures counts failed probes since the last success; it is
	// compared against probeThreshold to decide when a drive is disconnected.
	consecutiveFailures int
	lastStatus          string // "connected", "disconnected"
	// lastProbeTime is stamped by Check and used together with probeInterval
	// to rate-limit per-path work.
	lastProbeTime time.Time
	// probeInterval is the minimum time between two checks of this path.
	probeInterval time.Duration

	// Debug counters for summary logging. They are only updated while debug
	// logging is enabled and are reset whenever a summary line is emitted.
	probeCount      int
	probeOKCount    int
	lastSummaryTime time.Time
	totalLatency    time.Duration
}

// StorageWatchdog monitors registered storage paths and reacts to disconnection/reconnection.
type StorageWatchdog struct { settings *settings.Settings stackProvider WatchdogStackProvider notifier *notify.Notifier cfg *config.Config logger *log.Logger // Callbacks to break import cycles — set via SetXxx methods after construction alertRefresh func() pushHubReport func() unlockRepo func(ctx context.Context, repoPath string) error mu sync.Mutex pathState map[string]*pathProbeState // Debug simulation state simulatedMu sync.RWMutex simulatedPaths map[string]bool } // NewStorageWatchdog creates a new storage watchdog. func NewStorageWatchdog( sett *settings.Settings, stackProvider WatchdogStackProvider, notifier *notify.Notifier, cfg *config.Config, logger *log.Logger, ) *StorageWatchdog { return &StorageWatchdog{ settings: sett, stackProvider: stackProvider, notifier: notifier, cfg: cfg, logger: logger, pathState: make(map[string]*pathProbeState), simulatedPaths: make(map[string]bool), } } // isDebug returns true if the logging level is set to "debug". func (w *StorageWatchdog) isDebug() bool { return w.cfg.Logging.Level == "debug" } // SetAlertRefresh sets the callback to trigger alert refresh. func (w *StorageWatchdog) SetAlertRefresh(fn func()) { w.alertRefresh = fn } // SetHubReportPusher sets the callback to push an immediate hub report. func (w *StorageWatchdog) SetHubReportPusher(fn func()) { w.pushHubReport = fn } // SetRepoUnlocker sets the callback to unlock a restic repo on reconnect. func (w *StorageWatchdog) SetRepoUnlocker(fn func(ctx context.Context, repoPath string) error) { w.unlockRepo = fn } // Check probes all registered storage paths and reacts to state changes. // Called by the scheduler every 5 seconds. 
func (w *StorageWatchdog) Check(ctx context.Context) error { paths := w.settings.GetStoragePaths() if len(paths) == 0 { return nil } for _, sp := range paths { select { case <-ctx.Done(): return ctx.Err() default: } state := w.getOrCreateState(sp.Path) // Rate-limit per-path probes if time.Since(state.lastProbeTime) < state.probeInterval { continue } state.lastProbeTime = time.Now() // Skip decommissioned drives entirely — no apps reference them if sp.Decommissioned { continue } // Skip simulated-disconnected paths (handled by debug UI) if w.isSimulated(sp.Path) { continue } if sp.Disconnected { w.handleReconnectCheck(ctx, sp) } else { w.handleConnectedProbe(sp, state) } } return nil } // getOrCreateState returns the in-memory probe state for a path, creating if needed. func (w *StorageWatchdog) getOrCreateState(path string) *pathProbeState { w.mu.Lock() defer w.mu.Unlock() if s, ok := w.pathState[path]; ok { return s } s := &pathProbeState{ lastStatus: "connected", probeInterval: defaultProbeInterval, } w.pathState[path] = s return s } // handleConnectedProbe probes a connected drive and triggers disconnect if needed. 
func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *pathProbeState) { probeStart := time.Now() result := system.ProbeStoragePath(sp.Path) probeLatency := time.Since(probeStart) if w.isDebug() { state.probeCount++ state.totalLatency += probeLatency } if result.Status == system.ProbeConnected { if state.consecutiveFailures > 0 { w.logger.Printf("[DEBUG] [STORAGE] Probe recovered for %s after %d failures", sp.Path, state.consecutiveFailures) } state.consecutiveFailures = 0 state.lastStatus = "connected" if w.isDebug() { state.probeOKCount++ // Every 60 probes (~5 minutes at 5s interval): emit summary if state.probeCount >= 60 { avgLatency := state.totalLatency / time.Duration(state.probeCount) w.logger.Printf("[DEBUG] [STORAGE] Storage watchdog: %s — %d/%d probes OK (last 5m, avg %dms)", sp.Path, state.probeOKCount, state.probeCount, avgLatency.Milliseconds()) state.probeCount = 0 state.probeOKCount = 0 state.totalLatency = 0 state.lastSummaryTime = time.Now() } } return } state.consecutiveFailures++ // Debug: log immediately on unexpected failure (was connected, now failing) if w.isDebug() && state.lastStatus == "connected" { w.logger.Printf("[DEBUG] [STORAGE] Storage probe failed for %s (%d/%d before disconnect): %v", sp.Path, state.consecutiveFailures, probeThreshold, result.Err) } w.logger.Printf("[WARN] [STORAGE] Probe failed for %s (%d/%d): %v", sp.Path, state.consecutiveFailures, probeThreshold, result.Err) if state.consecutiveFailures >= probeThreshold { w.handleDisconnect(sp, state, result) } } // handleDisconnect reacts to a confirmed drive disconnection. func (w *StorageWatchdog) handleDisconnect(sp settings.StoragePath, state *pathProbeState, probe system.ProbeResult) { label := sp.Label if label == "" { label = sp.Path } w.logger.Printf("[ERROR] [STORAGE] Drive disconnected: %s (%s)", sp.Path, label) // 1. Find and stop affected stacks stoppedStacks := w.stopAffectedStacks(sp.Path) // 2. 
Mark disconnected in settings (persists to settings.json) if err := w.settings.SetDisconnected(sp.Path, true, stoppedStacks); err != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to mark disconnected: %v", err) } // 3. Lazy unmount stale mount (if probe timed out — mount is likely hanging) if probe.Status == system.ProbeTimeout { w.lazyUnmount(sp.Path) } // 4. Update in-memory state state.lastStatus = "disconnected" state.probeInterval = disconnectedProbeInterval state.consecutiveFailures = 0 // 5. Trigger alert refresh if w.alertRefresh != nil { w.alertRefresh() } // 6. Send notification w.notifier.NotifyStorageDisconnected(label, stoppedStacks) // 7. Push immediate hub report if w.pushHubReport != nil { go w.pushHubReport() } } // handleReconnectCheck checks if a disconnected drive has been reconnected. func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.StoragePath) { // Find the UUID for this path from fstab // For attach-wizard drives, the UUID is on the raw mount, not the bind mount mountPath := sp.Path rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, sp.Path) if isAttachWizard { mountPath = rawPath } uuid := system.ParseFstabUUID(hostFstabPath, mountPath) if uuid == "" { // No UUID in fstab — can't detect reconnection automatically return } if w.isDebug() { w.logger.Printf("[DEBUG] [STORAGE] Reconnect check for %s: UUID=%s, mountPath=%s, isAttachWizard=%v", sp.Path, uuid, mountPath, isAttachWizard) } // Check if the UUID block device is present uuidPath := filepath.Join(hostDevUUIDPath, uuid) if _, err := os.Stat(uuidPath); err != nil { return // Drive not reconnected yet } label := sp.Label if label == "" { label = sp.Path } w.logger.Printf("[INFO] [STORAGE] Drive reconnected (UUID found), attempting remount: %s (%s)", sp.Path, label) if w.isDebug() { w.logger.Printf("[DEBUG] [STORAGE] UUID %s found at %s, mounting %s (raw=%s, attachWizard=%v)", uuid, uuidPath, sp.Path, rawPath, isAttachWizard) } // Attempt 
remount if err := w.remount(sp.Path, rawPath, isAttachWizard); err != nil { w.logger.Printf("[ERROR] [STORAGE] Remount failed for %s: %v", sp.Path, err) return // Try again next cycle } // Verify with a probe verifyResult := system.ProbeStoragePath(sp.Path) if verifyResult.Status != system.ProbeConnected { w.logger.Printf("[ERROR] [STORAGE] Post-remount probe failed for %s: %v", sp.Path, verifyResult.Err) if w.isDebug() { w.logger.Printf("[DEBUG] [STORAGE] Post-mount verification failed for %s: status=%v, err=%v", sp.Path, verifyResult.Status, verifyResult.Err) } return } if w.isDebug() { w.logger.Printf("[DEBUG] [STORAGE] Post-mount verification succeeded for %s", sp.Path) } w.logger.Printf("[INFO] [STORAGE] Drive successfully remounted: %s (%s)", sp.Path, label) // Clean stale restic locks w.cleanResticLocks(ctx, sp.Path) // Validate stopped stacks — filter to only actually stopped ones filteredStacks := w.filterStoppedStacks(sp.StoppedStacks) // Clear disconnected but preserve StoppedStacks for the restart UI if err := w.settings.SetDisconnected(sp.Path, false, filteredStacks); err != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to clear disconnected: %v", err) } // Update in-memory state state := w.getOrCreateState(sp.Path) state.lastStatus = "connected" state.probeInterval = defaultProbeInterval state.consecutiveFailures = 0 // Trigger alert refresh if w.alertRefresh != nil { w.alertRefresh() } // Send notification w.notifier.NotifyStorageReconnected(label) // Push immediate hub report if w.pushHubReport != nil { go w.pushHubReport() } } // stopAffectedStacks stops all deployed stacks whose HDD_PATH matches the disconnected drive. 
func (w *StorageWatchdog) stopAffectedStacks(drivePath string) []string { if w.stackProvider == nil { return nil } var stopped []string cleanDrive := filepath.Clean(drivePath) for _, stack := range w.stackProvider.ListDeployedStacks() { hddPath := w.stackProvider.GetStackHDDPath(stack.Name) if hddPath == "" { continue } cleanHDD := filepath.Clean(hddPath) if cleanHDD != cleanDrive && !strings.HasPrefix(cleanHDD, cleanDrive+"/") { continue } // Don't stop protected stacks if w.cfg.IsProtectedStack(stack.Name) { w.logger.Printf("[WARN] [STORAGE] Skipping protected stack: %s", stack.Name) continue } w.logger.Printf("[INFO] [STORAGE] Stopping stack %s (drive disconnected: %s)", stack.Name, drivePath) if err := w.stackProvider.StopStack(stack.Name); err != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to stop stack %s: %v", stack.Name, err) continue // Don't add to stopped list if stop failed } stopped = append(stopped, stack.Name) } if len(stopped) > 0 { w.logger.Printf("[INFO] [STORAGE] Stopped %d stack(s) due to drive disconnect: %v", len(stopped), stopped) } return stopped } // lazyUnmount performs a lazy unmount of a path and its raw mount (if attach-wizard). 
func (w *StorageWatchdog) lazyUnmount(path string) { // For attach-wizard, unmount bind first, then raw rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, path) // Unmount the bind/main path cmd := exec.Command("umount", "-l", path) if out, err := cmd.CombinedOutput(); err != nil { w.logger.Printf("[WARN] [STORAGE] umount -l %s: %v (%s)", path, err, strings.TrimSpace(string(out))) } else { w.logger.Printf("[INFO] [STORAGE] Lazy unmounted: %s", path) } // Then unmount the raw path if it's an attach-wizard drive if isAttachWizard && rawPath != "" { cmd = exec.Command("umount", "-l", rawPath) if out, err := cmd.CombinedOutput(); err != nil { w.logger.Printf("[WARN] [STORAGE] umount -l %s: %v (%s)", rawPath, err, strings.TrimSpace(string(out))) } else { w.logger.Printf("[INFO] [STORAGE] Lazy unmounted raw: %s", rawPath) } } } // remount attempts to remount a storage path using fstab entries. func (w *StorageWatchdog) remount(path, rawPath string, isAttachWizard bool) error { // Clean any stale mount entries first exec.Command("umount", "-l", path).Run() if isAttachWizard && rawPath != "" { exec.Command("umount", "-l", rawPath).Run() } if isAttachWizard && rawPath != "" { // Mount raw first, then bind cmd := exec.Command("mount", "-T", hostFstabPath, rawPath) if out, err := cmd.CombinedOutput(); err != nil { return fmt.Errorf("mount raw %s: %v (%s)", rawPath, err, strings.TrimSpace(string(out))) } w.logger.Printf("[INFO] [STORAGE] Mounted raw: %s", rawPath) cmd = exec.Command("mount", "-T", hostFstabPath, path) if out, err := cmd.CombinedOutput(); err != nil { return fmt.Errorf("mount bind %s: %v (%s)", path, err, strings.TrimSpace(string(out))) } w.logger.Printf("[INFO] [STORAGE] Mounted bind: %s", path) } else { cmd := exec.Command("mount", "-T", hostFstabPath, path) if out, err := cmd.CombinedOutput(); err != nil { return fmt.Errorf("mount %s: %v (%s)", path, err, strings.TrimSpace(string(out))) } w.logger.Printf("[INFO] [STORAGE] Mounted: %s", path) 
} return nil } // cleanResticLocks runs restic unlock on the primary repo for a drive path. func (w *StorageWatchdog) cleanResticLocks(ctx context.Context, drivePath string) { repoPath := filepath.Join(drivePath, primaryResticSubpath) locksDir := filepath.Join(repoPath, "locks") entries, err := os.ReadDir(locksDir) if err != nil || len(entries) == 0 { return // No locks dir or no lock files } w.logger.Printf("[INFO] [STORAGE] Found %d restic lock file(s) in %s, running unlock", len(entries), repoPath) if w.unlockRepo != nil { if err := w.unlockRepo(ctx, repoPath); err != nil { w.logger.Printf("[WARN] [STORAGE] Restic unlock failed for %s: %v", repoPath, err) } } } // filterStoppedStacks validates that stacks in the list still exist as deployed stacks. func (w *StorageWatchdog) filterStoppedStacks(stackNames []string) []string { if w.stackProvider == nil || len(stackNames) == 0 { return nil } deployed := make(map[string]bool) for _, s := range w.stackProvider.ListDeployedStacks() { deployed[s.Name] = true } var result []string for _, name := range stackNames { if deployed[name] { result = append(result, name) } } return result } // SafeDisconnect performs a safe disconnect of a storage path. // Stops affected apps, syncs filesystem, and unmounts the drive. func (w *StorageWatchdog) SafeDisconnect(ctx context.Context, path string) (stoppedStacks []string, err error) { sp := w.findStoragePath(path) if sp == nil { return nil, fmt.Errorf("storage path %q not found", path) } if sp.Disconnected { return nil, fmt.Errorf("drive already disconnected") } if sp.Decommissioned { return nil, fmt.Errorf("drive is decommissioned — no apps to stop") } label := sp.Label if label == "" { label = sp.Path } w.logger.Printf("[INFO] [STORAGE] Safe disconnect requested: %s (%s)", path, label) // 1. Stop affected stacks stoppedStacks = w.stopAffectedStacks(path) // 2. Sync filesystem exec.Command("sync").Run() // 3. 
Unmount rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, path) // Unmount bind/main cmd := exec.Command("umount", path) if out, umountErr := cmd.CombinedOutput(); umountErr != nil { // Try lazy unmount as fallback w.logger.Printf("[WARN] [STORAGE] umount %s failed, trying lazy: %v", path, umountErr) cmd = exec.Command("umount", "-l", path) if out, umountErr = cmd.CombinedOutput(); umountErr != nil { return stoppedStacks, fmt.Errorf("umount %s failed: %v (%s)", path, umountErr, strings.TrimSpace(string(out))) } } // Unmount raw if attach-wizard if isAttachWizard && rawPath != "" { cmd = exec.Command("umount", rawPath) if out, umountErr := cmd.CombinedOutput(); umountErr != nil { cmd = exec.Command("umount", "-l", rawPath) if out, umountErr = cmd.CombinedOutput(); umountErr != nil { w.logger.Printf("[WARN] [STORAGE] umount raw %s failed: %v (%s)", rawPath, umountErr, strings.TrimSpace(string(out))) } } } // 4. Mark disconnected if setErr := w.settings.SetDisconnected(path, true, stoppedStacks); setErr != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to mark disconnected: %v", setErr) } // 5. Update in-memory state state := w.getOrCreateState(path) state.lastStatus = "disconnected" state.probeInterval = disconnectedProbeInterval state.consecutiveFailures = 0 // 6. Trigger alert refresh if w.alertRefresh != nil { w.alertRefresh() } // 7. Notify and push hub report w.notifier.Notify("storage_safe_disconnect", "info", fmt.Sprintf("Meghajtó biztonságosan leválasztva: %s", label), "") if w.pushHubReport != nil { go w.pushHubReport() } w.logger.Printf("[INFO] [STORAGE] Safe disconnect completed: %s — drive can be removed", path) return stoppedStacks, nil } // Reconnect attempts to remount a disconnected storage path. 
func (w *StorageWatchdog) Reconnect(ctx context.Context, path string) (stoppedStacks []string, err error) { sp := w.findStoragePath(path) if sp == nil { return nil, fmt.Errorf("storage path %q not found", path) } if !sp.Disconnected { return nil, fmt.Errorf("drive is not disconnected") } label := sp.Label if label == "" { label = sp.Path } // Check UUID availability mountPath := sp.Path rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, sp.Path) if isAttachWizard { mountPath = rawPath } uuid := system.ParseFstabUUID(hostFstabPath, mountPath) if uuid != "" { uuidPath := filepath.Join(hostDevUUIDPath, uuid) if _, statErr := os.Stat(uuidPath); statErr != nil { return nil, fmt.Errorf("drive not detected (UUID %s not found) — ensure the drive is physically connected", uuid) } } // Attempt remount if mountErr := w.remount(path, rawPath, isAttachWizard); mountErr != nil { return nil, fmt.Errorf("mount failed: %w", mountErr) } // Verify verifyResult := system.ProbeStoragePath(path) if verifyResult.Status != system.ProbeConnected { return nil, fmt.Errorf("mount appeared to succeed but probe failed: %v", verifyResult.Err) } // Clean restic locks w.cleanResticLocks(ctx, path) // Validate stopped stacks filteredStacks := w.filterStoppedStacks(sp.StoppedStacks) // Clear disconnected, preserve stopped stacks for restart UI if setErr := w.settings.SetDisconnected(path, false, filteredStacks); setErr != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to clear disconnected: %v", setErr) } // Update in-memory state state := w.getOrCreateState(path) state.lastStatus = "connected" state.probeInterval = defaultProbeInterval state.consecutiveFailures = 0 // Trigger alert refresh if w.alertRefresh != nil { w.alertRefresh() } // Notify w.notifier.NotifyStorageReconnected(label) if w.pushHubReport != nil { go w.pushHubReport() } w.logger.Printf("[INFO] [STORAGE] Reconnect completed: %s", path) return filteredStacks, nil } // RestartStoppedApps restarts apps that were 
auto-stopped due to a drive disconnect. func (w *StorageWatchdog) RestartStoppedApps(path string) (started, failed []string) { sp := w.findStoragePath(path) if sp == nil || sp.Disconnected { return nil, nil } stacks := w.settings.GetStoppedStacks(path) if len(stacks) == 0 { return nil, nil } for _, name := range stacks { w.logger.Printf("[INFO] [STORAGE] Starting stack %s (drive reconnected: %s)", name, path) if err := w.stackProvider.StartStack(name); err != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to start stack %s: %v", name, err) failed = append(failed, name) } else { started = append(started, name) } } // Clear stopped stacks list if err := w.settings.ClearStoppedStacks(path); err != nil { w.logger.Printf("[ERROR] [STORAGE] Failed to clear stopped stacks: %v", err) } return started, failed } // ── Debug simulation methods ───────────────────────────────────────── // isSimulated returns true if the path is in simulated-disconnect state. func (w *StorageWatchdog) isSimulated(path string) bool { w.simulatedMu.RLock() defer w.simulatedMu.RUnlock() return w.simulatedPaths[path] } // SimulateDisconnect simulates a drive disconnection without actually unmounting. // Runs disconnect steps 1,2,4,5,6,7 (skips step 3: lazyUnmount). // Returns the list of stopped stacks. 
func (w *StorageWatchdog) SimulateDisconnect(ctx context.Context, path string) ([]string, error) { sp := w.findStoragePath(path) if sp == nil { return nil, fmt.Errorf("storage path %q not found", path) } if sp.Disconnected { return nil, fmt.Errorf("drive already disconnected") } if sp.Decommissioned { return nil, fmt.Errorf("drive is decommissioned") } label := sp.Label if label == "" { label = sp.Path } w.logger.Printf("[INFO] [STORAGE] [DEBUG-SIM] Simulating disconnect: %s (%s)", path, label) // Mark as simulated so the watchdog skips probing this path w.simulatedMu.Lock() w.simulatedPaths[path] = true w.simulatedMu.Unlock() // Step 1: Stop affected stacks stoppedStacks := w.stopAffectedStacks(path) // Step 2: Mark disconnected in settings if err := w.settings.SetDisconnected(path, true, stoppedStacks); err != nil { w.logger.Printf("[ERROR] [STORAGE] [DEBUG-SIM] Failed to mark disconnected: %v", err) } // Step 3: SKIPPED (no lazyUnmount — drive stays physically mounted) // Step 4: Update in-memory state state := w.getOrCreateState(path) state.lastStatus = "disconnected" state.probeInterval = disconnectedProbeInterval state.consecutiveFailures = 0 // Step 5: Trigger alert refresh if w.alertRefresh != nil { w.alertRefresh() } // Step 6: Send notification w.notifier.NotifyStorageDisconnected(label, stoppedStacks) // Step 7: Push hub report if w.pushHubReport != nil { go w.pushHubReport() } w.logger.Printf("[INFO] [STORAGE] [DEBUG-SIM] Disconnect simulated: %s — %d stack(s) stopped", path, len(stoppedStacks)) return stoppedStacks, nil } // SimulateReconnect undoes a simulated disconnection. 
func (w *StorageWatchdog) SimulateReconnect(ctx context.Context, path string) error { if !w.isSimulated(path) { return fmt.Errorf("path %q is not in simulated-disconnect state", path) } sp := w.findStoragePath(path) if sp == nil { return fmt.Errorf("storage path %q not found", path) } label := sp.Label if label == "" { label = sp.Path } w.logger.Printf("[INFO] [STORAGE] [DEBUG-SIM] Simulating reconnect: %s (%s)", path, label) // Remove from simulated set w.simulatedMu.Lock() delete(w.simulatedPaths, path) w.simulatedMu.Unlock() // Verify drive is actually still mounted (it should be since we never unmounted) verifyResult := system.ProbeStoragePath(path) if verifyResult.Status != system.ProbeConnected { return fmt.Errorf("drive probe failed after simulation clear: %v", verifyResult.Err) } // Clean restic locks w.cleanResticLocks(ctx, path) // Validate stopped stacks filteredStacks := w.filterStoppedStacks(sp.StoppedStacks) // Clear disconnected, preserve stopped stacks for restart UI if err := w.settings.SetDisconnected(path, false, filteredStacks); err != nil { w.logger.Printf("[ERROR] [STORAGE] [DEBUG-SIM] Failed to clear disconnected: %v", err) } // Update in-memory state state := w.getOrCreateState(path) state.lastStatus = "connected" state.probeInterval = defaultProbeInterval state.consecutiveFailures = 0 // Trigger alert refresh if w.alertRefresh != nil { w.alertRefresh() } // Send notification w.notifier.NotifyStorageReconnected(label) if w.pushHubReport != nil { go w.pushHubReport() } w.logger.Printf("[INFO] [STORAGE] [DEBUG-SIM] Reconnect simulated: %s", path) return nil } // PathDebugStatus holds per-path probe state for the debug page. 
type PathDebugStatus struct { Path string `json:"path"` Label string `json:"label"` Status string `json:"status"` Simulated bool `json:"simulated"` ProbeOK bool `json:"probe_ok"` DebounceCount int `json:"debounce_count"` DebounceMax int `json:"debounce_max"` LastProbe time.Time `json:"last_probe"` AvgLatencyMs float64 `json:"avg_latency_ms"` ProbeCount int `json:"probe_count"` ProbeOKCount int `json:"probe_ok_count"` } // GetDebugStatus returns per-path probe state for the debug page. func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus { paths := w.settings.GetStoragePaths() result := make([]PathDebugStatus, 0, len(paths)) w.mu.Lock() defer w.mu.Unlock() for _, sp := range paths { if sp.Decommissioned { continue } ds := PathDebugStatus{ Path: sp.Path, Label: sp.Label, DebounceMax: probeThreshold, } if sp.Disconnected { ds.Status = "disconnected" } else { ds.Status = "connected" } ds.Simulated = w.isSimulatedLocked(sp.Path) if state, ok := w.pathState[sp.Path]; ok { ds.DebounceCount = state.consecutiveFailures ds.LastProbe = state.lastProbeTime ds.ProbeOK = state.lastStatus == "connected" ds.ProbeCount = state.probeCount ds.ProbeOKCount = state.probeOKCount if state.probeCount > 0 { ds.AvgLatencyMs = float64(state.totalLatency.Milliseconds()) / float64(state.probeCount) } } result = append(result, ds) } return result } // isSimulatedLocked checks simulation state without acquiring simulatedMu // (caller must hold w.mu or be ok with a racy read for debug display). func (w *StorageWatchdog) isSimulatedLocked(path string) bool { w.simulatedMu.RLock() defer w.simulatedMu.RUnlock() return w.simulatedPaths[path] } // findStoragePath returns the storage path entry for a given path, or nil. func (w *StorageWatchdog) findStoragePath(path string) *settings.StoragePath { for _, sp := range w.settings.GetStoragePaths() { if sp.Path == path { return &sp } } return nil }