fix: P0+P1 critical bug fixes across controller (23 files)

Concurrency fixes:
- Deep-copy stacks in GetStack/GetStacks to prevent shared state mutation (C04)
- Add per-state mutex to watchdog pathProbeState (C05)
- Guard MetricsCollector.Start() with sync.Once against double-start (C06)
- Hold diskJobMu across entire raw mount operation (C07)
- Add mutex to SetEncryptionKey (C08) and take a write lock in MigrateEncryption (H03)
- Use sync.Once for sync.Stop() channel close (H08)
- Set syncing=true before releasing lock in TriggerSync (H09)
- Deep-copy lastDBDump/lastBackup in GetFullStatus (H11)
- Add WaitGroup for stderr goroutine in MigrateDrive (H19)
- Add mutex to SetBackupRunningCheck (M18)

Security fixes:
- Validate Bearer token against Hub API key in CSRF middleware (H16)
- Validate backup paths start with expected prefix in RemoveStack (M12)
- Guard uuid[:8] slice with length check (H20)
- Parse fstab fields exactly for mount target matching (H21)

Bug fixes:
- Use decrypted env vars for compose deploy (C01)
- Log decrypt failures in DecryptMap instead of swallowing (C02)
- Move Deployed=false inside lock in runComposeDeploy (C03)
- Fix activeDrives() to skip disconnected and decommissioned drives (H02)
- Fix Snapshot() stderr extraction from exec.ExitError (H01)
- Check unlockCmd.Run() error in restic (H01)
- Buffer template rendering via bytes.Buffer (H07)
- Thread context.Context through cloudflare client (H10)
- Fix leaf-name collision detection in cross-drive backup (H15)
- Add nil check for crossDriveRunner (H17)
- Use strings.TrimSpace instead of slice on command output (H18)
- Make SaveAppConfig atomic with write-to-tmp+rename (H04)
- Pass encKey on deploy failure SaveAppConfig (H05)
- Fix IPv6 address format in TCP health probe

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 13:39:45 +01:00
parent 2ad743b66f
commit 8b8c04a487
23 changed files with 248 additions and 83 deletions
+21 -7
View File
@@ -203,20 +203,22 @@ func (m *Manager) groupStacksByDrive() map[string][]StackSummary {
}
// activeDrives returns the sorted names of the drives for which
// groupStacksByDrive reports at least one stack.
//
// A drive is left out of the result when the settings store flags it as
// disconnected or decommissioned. If m.settings is nil no filtering takes
// place and every grouped drive is returned. The returned slice is nil when
// no drive qualifies.
func (m *Manager) activeDrives() []string {
	groups := m.groupStacksByDrive()

	var drives []string
	var skipped []string
	for d := range groups {
		if m.settings != nil && (m.settings.IsDisconnected(d) || m.settings.IsDecommissioned(d)) {
			skipped = append(skipped, d)
			continue
		}
		drives = append(drives, d)
	}

	// Map iteration order is random; sort so callers and logs see a stable order.
	sort.Strings(drives)

	if m.isDebug() {
		m.logger.Printf("[DEBUG] activeDrives: %d active (%s), %d skipped (disconnected/decommissioned)",
			len(drives), strings.Join(drives, ", "), len(skipped))
	}
	return drives
}
@@ -1211,11 +1213,10 @@ func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupSta
}
// No cache yet — return a minimal status (first page load before cache is populated)
return &FullBackupStatus{
// Deep-copy lastDBDump and lastBackup to prevent callers from mutating shared state.
status := &FullBackupStatus{
Enabled: m.cfg.Backup.Enabled,
Running: m.running,
LastDBDump: m.lastDBDump,
LastBackup: m.lastBackup,
DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
ResticSchedule: m.cfg.Backup.ResticSchedule,
PruneSchedule: m.cfg.Backup.PruneSchedule,
@@ -1225,6 +1226,19 @@ func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupSta
LastCheckTime: m.lastCheckTime,
LastCheckOK: m.lastCheckOK,
}
if m.lastDBDump != nil {
copyDump := *m.lastDBDump
if len(m.lastDBDump.Results) > 0 {
copyDump.Results = make([]DumpResult, len(m.lastDBDump.Results))
copy(copyDump.Results, m.lastDBDump.Results)
}
status.LastDBDump = &copyDump
}
if m.lastBackup != nil {
copyBackup := *m.lastBackup
status.LastBackup = &copyBackup
}
return status
}
// isDebug returns true if logging level is "debug".
+12 -6
View File
@@ -372,7 +372,8 @@ func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBa
return fmt.Errorf("creating rsync dest dir: %w", err)
}
for i, srcMount := range mounts {
seen := make(map[string]bool)
for _, srcMount := range mounts {
var dstPath string
if len(mounts) == 1 {
// Single mount: rsync directly into the stack folder (no extra nesting)
@@ -380,13 +381,18 @@ func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBa
} else {
// Multiple mounts: use the leaf directory name as subfolder
leaf := filepath.Base(srcMount)
dstPath = filepath.Join(destDir, leaf)
// Disambiguate duplicate leaf names (e.g. two mounts both named "data")
if i > 0 {
if _, err := os.Stat(dstPath); err == nil {
dstPath = filepath.Join(destDir, fmt.Sprintf("%s_%d", leaf, i))
if seen[leaf] {
// Disambiguate duplicate leaf names (e.g. two mounts both named "data")
for j := 2; ; j++ {
candidate := fmt.Sprintf("%s_%d", leaf, j)
if !seen[candidate] {
leaf = candidate
break
}
}
}
seen[leaf] = true
dstPath = filepath.Join(destDir, leaf)
}
if err := os.MkdirAll(dstPath, 0755); err != nil {
return fmt.Errorf("creating rsync destination: %w", err)
+7 -2
View File
@@ -134,12 +134,17 @@ func (r *ResticManager) Snapshot(repoPath string, paths []string, tags []string)
cmd := r.command(ctx, repoPath, args...)
out, err := cmd.Output()
if err != nil {
// Check for stale lock
// Check for stale lock — restic writes lock errors to stderr, not stdout
errStr := string(out)
if exitErr, ok := err.(*exec.ExitError); ok {
errStr += string(exitErr.Stderr)
}
if strings.Contains(errStr, "lock") || strings.Contains(errStr, "locked") {
r.logger.Printf("[WARN] Restic repo locked — attempting unlock")
unlockCmd := r.command(ctx, repoPath, "unlock")
unlockCmd.Run()
if unlockErr := unlockCmd.Run(); unlockErr != nil {
r.logger.Printf("[WARN] Restic unlock failed: %v", unlockErr)
}
// Retry once
cmd = r.command(ctx, repoPath, args...)
out, err = cmd.Output()