fix: P0+P1 critical bug fixes across controller (24 files)
Concurrency fixes:
- Deep-copy stacks in GetStack/GetStacks to prevent shared state mutation (C04)
- Add per-state mutex to watchdog pathProbeState (C05)
- Guard MetricsCollector.Start() with sync.Once against double-start (C06)
- Hold diskJobMu across entire raw mount operation (C07)
- Add mutex to SetEncryptionKey (C08), MigrateEncryption write lock (H03)
- Use sync.Once for sync.Stop() channel close (H08)
- Set syncing=true before releasing lock in TriggerSync (H09)
- Deep-copy lastDBDump/lastBackup in GetFullStatus (H11)
- Add WaitGroup for stderr goroutine in MigrateDrive (H19)
- Add mutex to SetBackupRunningCheck (M18)

Security fixes:
- Validate Bearer token against Hub API key in CSRF middleware (H16)
- Validate backup paths start with expected prefix in RemoveStack (M12)
- Guard uuid[:8] slice with length check (H20)
- Parse fstab fields exactly for mount target matching (H21)

Bug fixes:
- Use decrypted env vars for compose deploy (C01)
- Log decrypt failures in DecryptMap instead of swallowing (C02)
- Move Deployed=false inside lock in runComposeDeploy (C03)
- Fix activeDrives() to skip disconnected drives (H02)
- Fix Snapshot() stderr extraction from exec.ExitError (H01)
- Check unlockCmd.Run() error in restic (H01)
- Buffer template rendering via bytes.Buffer (H07)
- Thread context.Context through cloudflare client (H10)
- Fix leaf-name collision detection in cross-drive backup (H15)
- Add nil check for crossDriveRunner (H17)
- Use strings.TrimSpace instead of slice on command output (H18)
- Make SaveAppConfig atomic with write-to-tmp+rename (H04)
- Pass encKey on deploy failure SaveAppConfig (H05)
- Fix IPv6 address format in TCP health probe

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -54,6 +54,7 @@ type WatchdogStackProvider interface {
|
||||
|
||||
// pathProbeState tracks in-memory probe state for a single storage path.
|
||||
type pathProbeState struct {
|
||||
mu sync.Mutex
|
||||
consecutiveFailures int
|
||||
lastStatus string // "connected", "disconnected"
|
||||
lastProbeTime time.Time
|
||||
@@ -141,10 +142,13 @@ func (w *StorageWatchdog) Check(ctx context.Context) error {
|
||||
state := w.getOrCreateState(sp.Path)
|
||||
|
||||
// Rate-limit per-path probes
|
||||
state.mu.Lock()
|
||||
if time.Since(state.lastProbeTime) < state.probeInterval {
|
||||
state.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
state.lastProbeTime = time.Now()
|
||||
state.mu.Unlock()
|
||||
|
||||
// Skip decommissioned drives entirely — no apps reference them
|
||||
if sp.Decommissioned {
|
||||
@@ -186,6 +190,9 @@ func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *p
|
||||
result := system.ProbeStoragePath(sp.Path)
|
||||
probeLatency := time.Since(probeStart)
|
||||
|
||||
state.mu.Lock()
|
||||
defer state.mu.Unlock()
|
||||
|
||||
if w.isDebug() {
|
||||
state.probeCount++
|
||||
state.totalLatency += probeLatency
|
||||
@@ -225,7 +232,9 @@ func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *p
|
||||
sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
|
||||
|
||||
if state.consecutiveFailures >= probeThreshold {
|
||||
state.mu.Unlock()
|
||||
w.handleDisconnect(sp, state, result)
|
||||
state.mu.Lock() // re-acquire for deferred Unlock
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,9 +260,11 @@ func (w *StorageWatchdog) handleDisconnect(sp settings.StoragePath, state *pathP
|
||||
}
|
||||
|
||||
// 4. Update in-memory state
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "disconnected"
|
||||
state.probeInterval = disconnectedProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// 5. Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
@@ -343,9 +354,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
|
||||
|
||||
// Update in-memory state
|
||||
state := w.getOrCreateState(sp.Path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "connected"
|
||||
state.probeInterval = defaultProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
@@ -551,9 +564,11 @@ func (w *StorageWatchdog) SafeDisconnect(ctx context.Context, path string) (stop
|
||||
|
||||
// 5. Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "disconnected"
|
||||
state.probeInterval = disconnectedProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// 6. Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
@@ -624,9 +639,11 @@ func (w *StorageWatchdog) Reconnect(ctx context.Context, path string) (stoppedSt
|
||||
|
||||
// Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "connected"
|
||||
state.probeInterval = defaultProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
@@ -720,9 +737,11 @@ func (w *StorageWatchdog) SimulateDisconnect(ctx context.Context, path string) (
|
||||
|
||||
// Step 4: Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "disconnected"
|
||||
state.probeInterval = disconnectedProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Step 5: Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
@@ -782,9 +801,11 @@ func (w *StorageWatchdog) SimulateReconnect(ctx context.Context, path string) er
|
||||
|
||||
// Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "connected"
|
||||
state.probeInterval = defaultProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
@@ -841,6 +862,7 @@ func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus {
|
||||
ds.Simulated = w.isSimulatedLocked(sp.Path)
|
||||
|
||||
if state, ok := w.pathState[sp.Path]; ok {
|
||||
state.mu.Lock()
|
||||
ds.DebounceCount = state.consecutiveFailures
|
||||
ds.LastProbe = state.lastProbeTime
|
||||
ds.ProbeOK = state.lastStatus == "connected"
|
||||
@@ -849,6 +871,7 @@ func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus {
|
||||
if state.probeCount > 0 {
|
||||
ds.AvgLatencyMs = float64(state.totalLatency.Milliseconds()) / float64(state.probeCount)
|
||||
}
|
||||
state.mu.Unlock()
|
||||
}
|
||||
result = append(result, ds)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user