fix: P0+P1 critical bug fixes across controller (23 files)

Concurrency fixes:
- Deep-copy stacks in GetStack/GetStacks to prevent shared state mutation (C04)
- Add per-state mutex to watchdog pathProbeState (C05)
- Guard MetricsCollector.Start() with sync.Once against double-start (C06)
- Hold diskJobMu across entire raw mount operation (C07)
- Add mutex to SetEncryptionKey (C08); add write lock to MigrateEncryption (H03)
- Use sync.Once for sync.Stop() channel close (H08)
- Set syncing=true before releasing lock in TriggerSync (H09)
- Deep-copy lastDBDump/lastBackup in GetFullStatus (H11)
- Add WaitGroup for stderr goroutine in MigrateDrive (H19)
- Add mutex to SetBackupRunningCheck (M18)

Security fixes:
- Validate Bearer token against Hub API key in CSRF middleware (H16)
- Validate backup paths start with expected prefix in RemoveStack (M12)
- Guard uuid[:8] slice with length check (H20)
- Parse fstab fields exactly for mount target matching (H21)

Bug fixes:
- Use decrypted env vars for compose deploy (C01)
- Log decrypt failures in DecryptMap instead of swallowing (C02)
- Move Deployed=false inside lock in runComposeDeploy (C03)
- Fix activeDrives() to skip disconnected drives (H02)
- Fix Snapshot() stderr extraction from exec.ExitError (H01)
- Check unlockCmd.Run() error in restic (H01)
- Buffer template rendering via bytes.Buffer (H07)
- Thread context.Context through cloudflare client (H10)
- Fix leaf-name collision detection in cross-drive backup (H15)
- Add nil check for crossDriveRunner (H17)
- Use strings.TrimSpace instead of slice on command output (H18)
- Make SaveAppConfig atomic with write-to-tmp+rename (H04)
- Pass encKey on deploy failure SaveAppConfig (H05)
- Fix IPv6 address format in TCP health probe

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 13:39:45 +01:00
parent 2ad743b66f
commit 8b8c04a487
23 changed files with 248 additions and 83 deletions
+23
View File
@@ -54,6 +54,7 @@ type WatchdogStackProvider interface {
// pathProbeState tracks in-memory probe state for a single storage path.
type pathProbeState struct {
mu sync.Mutex
consecutiveFailures int
lastStatus string // "connected", "disconnected"
lastProbeTime time.Time
@@ -141,10 +142,13 @@ func (w *StorageWatchdog) Check(ctx context.Context) error {
state := w.getOrCreateState(sp.Path)
// Rate-limit per-path probes
state.mu.Lock()
if time.Since(state.lastProbeTime) < state.probeInterval {
state.mu.Unlock()
continue
}
state.lastProbeTime = time.Now()
state.mu.Unlock()
// Skip decommissioned drives entirely — no apps reference them
if sp.Decommissioned {
@@ -186,6 +190,9 @@ func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *p
result := system.ProbeStoragePath(sp.Path)
probeLatency := time.Since(probeStart)
state.mu.Lock()
defer state.mu.Unlock()
if w.isDebug() {
state.probeCount++
state.totalLatency += probeLatency
@@ -225,7 +232,9 @@ func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *p
sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
if state.consecutiveFailures >= probeThreshold {
state.mu.Unlock()
w.handleDisconnect(sp, state, result)
state.mu.Lock() // re-acquire for deferred Unlock
}
}
@@ -251,9 +260,11 @@ func (w *StorageWatchdog) handleDisconnect(sp settings.StoragePath, state *pathP
}
// 4. Update in-memory state
state.mu.Lock()
state.lastStatus = "disconnected"
state.probeInterval = disconnectedProbeInterval
state.consecutiveFailures = 0
state.mu.Unlock()
// 5. Trigger alert refresh
if w.alertRefresh != nil {
@@ -343,9 +354,11 @@ func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.
// Update in-memory state
state := w.getOrCreateState(sp.Path)
state.mu.Lock()
state.lastStatus = "connected"
state.probeInterval = defaultProbeInterval
state.consecutiveFailures = 0
state.mu.Unlock()
// Trigger alert refresh
if w.alertRefresh != nil {
@@ -551,9 +564,11 @@ func (w *StorageWatchdog) SafeDisconnect(ctx context.Context, path string) (stop
// 5. Update in-memory state
state := w.getOrCreateState(path)
state.mu.Lock()
state.lastStatus = "disconnected"
state.probeInterval = disconnectedProbeInterval
state.consecutiveFailures = 0
state.mu.Unlock()
// 6. Trigger alert refresh
if w.alertRefresh != nil {
@@ -624,9 +639,11 @@ func (w *StorageWatchdog) Reconnect(ctx context.Context, path string) (stoppedSt
// Update in-memory state
state := w.getOrCreateState(path)
state.mu.Lock()
state.lastStatus = "connected"
state.probeInterval = defaultProbeInterval
state.consecutiveFailures = 0
state.mu.Unlock()
// Trigger alert refresh
if w.alertRefresh != nil {
@@ -720,9 +737,11 @@ func (w *StorageWatchdog) SimulateDisconnect(ctx context.Context, path string) (
// Step 4: Update in-memory state
state := w.getOrCreateState(path)
state.mu.Lock()
state.lastStatus = "disconnected"
state.probeInterval = disconnectedProbeInterval
state.consecutiveFailures = 0
state.mu.Unlock()
// Step 5: Trigger alert refresh
if w.alertRefresh != nil {
@@ -782,9 +801,11 @@ func (w *StorageWatchdog) SimulateReconnect(ctx context.Context, path string) er
// Update in-memory state
state := w.getOrCreateState(path)
state.mu.Lock()
state.lastStatus = "connected"
state.probeInterval = defaultProbeInterval
state.consecutiveFailures = 0
state.mu.Unlock()
// Trigger alert refresh
if w.alertRefresh != nil {
@@ -841,6 +862,7 @@ func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus {
ds.Simulated = w.isSimulatedLocked(sp.Path)
if state, ok := w.pathState[sp.Path]; ok {
state.mu.Lock()
ds.DebounceCount = state.consecutiveFailures
ds.LastProbe = state.lastProbeTime
ds.ProbeOK = state.lastStatus == "connected"
@@ -849,6 +871,7 @@ func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus {
if state.probeCount > 0 {
ds.AvgLatencyMs = float64(state.totalLatency.Milliseconds()) / float64(state.probeCount)
}
state.mu.Unlock()
}
result = append(result, ds)
}