package backup import ( "context" "fmt" "log" "os" "os/exec" "path/filepath" "strings" "sync" "time" "gitea.dooplex.hu/admin/felhom-controller/internal/config" "gitea.dooplex.hu/admin/felhom-controller/internal/settings" ) // Manager orchestrates app-data backups: database dumps and Docker-volume tars. // // Disk-tier backup (restic, cross-drive, drive-recovery, infra-backup) has been // moved out of the controller into the host agent (slice 8C). This Manager now // only owns the app-data domain. type Manager struct { cfg *config.Config logger *log.Logger settings *settings.Settings stackProvider StackDataProvider systemDataPath string // fallback drive for SSD-only apps version string // controller version, stamped into recovery-unit manifests mu sync.Mutex lastDBDump *DBDumpStatus running bool // Cached status for page rendering (refreshed periodically) cachedStatus *FullBackupStatus cacheTime time.Time } // FullBackupStatus contains everything the backup page needs. type FullBackupStatus struct { Enabled bool Running bool // DB Dumps LastDBDump *DBDumpStatus DumpFiles []DumpFileInfo DiscoveredDBs []DiscoveredDB // Schedule DBDumpSchedule string NextDBDump time.Time // App data backup AppDataInfo []AppBackupInfo // Flash messages (set by handlers, passed through redirect) FlashSuccess string FlashError string } // DBDumpStatus holds the last DB dump result. type DBDumpStatus struct { LastRun time.Time Results []DumpResult Success bool Duration time.Duration } // NewManager creates a new backup manager. func NewManager(cfg *config.Config, sett *settings.Settings, logger *log.Logger) *Manager { if cfg.Paths.SystemDataPath == "" { logger.Printf("[WARN] [backup] SystemDataPath is empty in config — SSD-only apps will not have correct backup paths") } return &Manager{ cfg: cfg, logger: logger, settings: sett, systemDataPath: cfg.Paths.SystemDataPath, } } // GetAppDrivePath returns the drive path for an app. // Uses HDD_PATH from app.yaml if set, otherwise falls back to system data path. func (m *Manager) GetAppDrivePath(stackName string) string { if m.stackProvider != nil { if hddPath := m.stackProvider.GetStackHDDPath(stackName); hddPath != "" { return hddPath } } if m.systemDataPath == "" { m.logger.Printf("[ERROR] [backup] systemDataPath is empty — cannot determine drive for %s", stackName) } return m.systemDataPath } // namespaceRoot maps an app's drive path to its felhom-data namespace ROOT (the dir that directly // holds backups/ and appdata/). A drive-resident app's in-guest mount IS the namespace already // (Model A, slice 10 — the agent binds /felhom-data onto the guest mountpoint), so it is used // as-is; only the SSD-only system-data fallback gets the felhom-data subdir appended. This is what // keeps a drive-resident app's backups single-nested instead of .../felhom-data/felhom-data/... . func (m *Manager) namespaceRoot(drivePath string) string { return NamespaceRoot(drivePath, drivePath != m.systemDataPath) } // AppNamespaceRoot returns the felhom-data namespace root for a stack's keep-side backups, resolving // HDD-vs-system provenance internally. For callers outside this package that only know the stack // name (e.g. the API router) so they don't double-nest the felhom-data segment. func (m *Manager) AppNamespaceRoot(stackName string) string { drivePath := m.GetAppDrivePath(stackName) if drivePath == "" { return "" } return m.namespaceRoot(drivePath) } // groupStacksByDrive groups deployed stacks by their home drive path. func (m *Manager) groupStacksByDrive() map[string][]StackSummary { if m.stackProvider == nil { return nil } result := make(map[string][]StackSummary) for _, stack := range m.stackProvider.ListDeployedStacks() { drive := m.GetAppDrivePath(stack.Name) result[drive] = append(result[drive], stack) } if m.isDebug() { for drive, stacks := range result { names := make([]string, len(stacks)) for i, s := range stacks { names[i] = s.Name } m.logger.Printf("[DEBUG] groupStacksByDrive: %s → [%s]", drive, strings.Join(names, ", ")) } } return result } // RunDBDumps discovers and dumps all databases to per-drive, per-app paths. func (m *Manager) RunDBDumps(ctx context.Context) error { if err := m.acquireRunning(); err != nil { return err } defer m.releaseRunning() return m.runDBDumpsInternal(ctx) } // runDBDumpsInternal is the implementation of RunDBDumps. Caller must hold the running flag. func (m *Manager) runDBDumpsInternal(ctx context.Context) error { start := time.Now() m.logger.Printf("[INFO] [backup] Starting database dump run") dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug()) if err != nil { m.logger.Printf("[ERROR] [backup] Database discovery failed: %v", err) return err } if len(dbs) == 0 { m.logger.Printf("[INFO] [backup] No database containers found") m.mu.Lock() m.lastDBDump = &DBDumpStatus{ LastRun: time.Now(), Success: true, Duration: time.Since(start), } m.mu.Unlock() return nil } m.logger.Printf("[INFO] [backup] Discovered %d database(s): %s", len(dbs), dbNames(dbs)) // Dump each DB to its app's drive path var results []DumpResult allOK := true var summary []string var totalSize int64 for _, db := range dbs { drivePath := m.GetAppDrivePath(db.StackName) // Skip if drive is disconnected or decommissioned if m.settings != nil && m.settings.IsDisconnected(drivePath) { m.logger.Printf("[WARN] [backup] Skipping DB dump for %s — drive disconnected: %s", db.StackName, drivePath) summary = append(summary, fmt.Sprintf("SKIP %s (drive disconnected)", db.ContainerName)) continue } if m.settings != nil && m.settings.IsDecommissioned(drivePath) { m.logger.Printf("[WARN] [backup] Skipping DB dump for %s — drive decommissioned: %s", db.StackName, drivePath) summary = append(summary, fmt.Sprintf("SKIP %s (drive decommissioned)", db.ContainerName)) continue } dumpDir := AppDBDumpPath(m.namespaceRoot(drivePath), db.StackName) result := DumpOne(ctx, db, dumpDir, m.logger, m.isDebug()) results = append(results, result) if result.Error != nil { allOK = false summary = append(summary, fmt.Sprintf("FAIL %s: %v", result.DB.ContainerName, result.Error)) m.logger.Printf("[ERROR] [backup] DB dump failed for %s: %v", result.DB.ContainerName, result.Error) } else { totalSize += result.Size summary = append(summary, fmt.Sprintf("OK %s (%s)", result.DB.ContainerName, humanizeBytes(result.Size))) // Persist validation result to settings.json if m.settings != nil && result.FilePath != "" { filename := filepath.Base(result.FilePath) cache := settings.DBValidationCache{ ValidatedAt: time.Now().Format(time.RFC3339), TableCount: result.Validation.TableCount, HasHeader: result.Validation.Valid, } if !result.Validation.Valid { cache.Error = result.Validation.Error } if err := m.settings.SetDBValidation(filename, cache); err != nil { m.logger.Printf("[WARN] [backup] Failed to cache validation for %s: %v", filename, err) } } } } duration := time.Since(start) m.mu.Lock() m.lastDBDump = &DBDumpStatus{ LastRun: time.Now(), Results: results, Success: allOK, Duration: duration, } m.mu.Unlock() if allOK { m.logger.Printf("[INFO] [backup] DB dump completed: %d databases, %s total (%s)", len(results), humanizeBytes(totalSize), duration.Round(time.Millisecond)) } else { // Still refresh recovery units below — a partial DB failure shouldn't leave units stale. m.logger.Printf("[WARN] [backup] some database dumps failed; refreshing recovery units anyway") } // Phase 2: refresh each deployed app's self-contained recovery unit (compose + manifest). m.captureAllRecoveryUnits() if !allOK { return fmt.Errorf("some database dumps failed") } return nil } // DumpAppVolumes exports Docker named volumes to tar files for the given stack. // Tars are written to AppVolumeDumpPath(drivePath, stackName)/. // Uses "docker run alpine tar" (same pattern as appexport). func (m *Manager) DumpAppVolumes(stackName string) error { if m.stackProvider == nil { return nil } volumes := m.stackProvider.GetDockerVolumes(stackName) if len(volumes) == 0 { return nil } drivePath := m.GetAppDrivePath(stackName) if drivePath == "" { return fmt.Errorf("cannot determine drive path for %s", stackName) } dumpDir := AppVolumeDumpPath(m.namespaceRoot(drivePath), stackName) if err := os.MkdirAll(dumpDir, 0755); err != nil { return fmt.Errorf("creating volume dump dir: %w", err) } var dumpErrors []string for _, volName := range volumes { tarPath := filepath.Join(dumpDir, volName+".tar") if m.isDebug() { m.logger.Printf("[DEBUG] [backup] Dumping volume %s for %s", volName, stackName) } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) cmd := exec.CommandContext(ctx, "docker", "run", "--rm", "-v", volName+":/vol:ro", "-v", dumpDir+":/out", "alpine", "tar", "cf", "/out/"+volName+".tar", "-C", "/vol", ".") out, err := cmd.CombinedOutput() cancel() if err != nil { m.logger.Printf("[WARN] [backup] Volume dump failed for %s/%s: %s — %v", stackName, volName, strings.TrimSpace(string(out)), err) os.Remove(tarPath) dumpErrors = append(dumpErrors, volName) continue } if info, _ := os.Stat(tarPath); info != nil { m.logger.Printf("[INFO] [backup] Volume dump: %s/%s → %s", stackName, volName, humanizeBytes(info.Size())) } } // Clean up tars for volumes that no longer exist entries, _ := os.ReadDir(dumpDir) activeVols := make(map[string]bool) for _, v := range volumes { activeVols[v+".tar"] = true } for _, e := range entries { if !activeVols[e.Name()] && strings.HasSuffix(e.Name(), ".tar") { os.Remove(filepath.Join(dumpDir, e.Name())) if m.isDebug() { m.logger.Printf("[DEBUG] [backup] Removed stale volume dump: %s/%s", stackName, e.Name()) } } } if len(dumpErrors) > 0 { return fmt.Errorf("volume dump failed for: %s", strings.Join(dumpErrors, ", ")) } return nil } // DumpAppVolumesSafe stops the stack before dumping volumes and restarts after. // Prevents inconsistent tars of live database volumes (e.g. PostgreSQL). // Protected stacks that reject StopStack will return an error — callers handle as warning. func (m *Manager) DumpAppVolumesSafe(stackName string) error { if m.stackProvider == nil { return fmt.Errorf("no stack provider") } m.logger.Printf("[INFO] [backup] Stopping %s for safe volume dump", stackName) if err := m.stackProvider.StopStack(stackName); err != nil { return fmt.Errorf("could not stop %s for volume dump: %w", stackName, err) } dumpErr := m.DumpAppVolumes(stackName) m.logger.Printf("[INFO] [backup] Restarting %s after volume dump", stackName) startErr := m.stackProvider.StartStack(stackName) if startErr != nil { m.logger.Printf("[ERROR] [backup] Failed to restart %s after volume dump: %v", stackName, startErr) } // Surface both errors — callers must know if the app is left stopped if dumpErr != nil && startErr != nil { return fmt.Errorf("volume dump failed for %s: %v; restart also failed: %v", stackName, dumpErr, startErr) } if startErr != nil { return fmt.Errorf("volume dump OK but restart failed for %s: %w", stackName, startErr) } return dumpErr } // GetStatus returns the current DB-dump status. func (m *Manager) GetStatus() *DBDumpStatus { m.mu.Lock() defer m.mu.Unlock() return m.lastDBDump } // IsRunning returns whether a backup or restore is currently in progress. func (m *Manager) IsRunning() bool { m.mu.Lock() defer m.mu.Unlock() return m.running } // acquireRunning atomically sets the running flag. Returns error if already running. func (m *Manager) acquireRunning() error { m.mu.Lock() defer m.mu.Unlock() if m.running { return fmt.Errorf("backup already in progress") } m.running = true return nil } // releaseRunning clears the running flag. func (m *Manager) releaseRunning() { m.mu.Lock() m.running = false m.mu.Unlock() } // SetStackProvider sets the stack data provider for app data discovery. // Write is protected by mutex since stackProvider is read by concurrent goroutines. func (m *Manager) SetStackProvider(provider StackDataProvider) { m.mu.Lock() m.stackProvider = provider m.mu.Unlock() } // GetStackHDDMounts returns HDD mount paths for the named stack via the stack provider. func (m *Manager) GetStackHDDMounts(name string) []string { if m.stackProvider == nil { return nil } return m.stackProvider.GetStackHDDMounts(name) } // DumpStackDB runs a database dump for containers belonging to a specific stack. // Dumps to the stack's home drive: /backups/primary//db-dumps/. func (m *Manager) DumpStackDB(ctx context.Context, stackName string) error { dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug()) if err != nil { return fmt.Errorf("database discovery failed: %w", err) } var stackDBs []DiscoveredDB for _, db := range dbs { if db.StackName == stackName { stackDBs = append(stackDBs, db) } } if len(stackDBs) == 0 { m.logger.Printf("[DEBUG] No databases found for stack %s — skipping pre-backup dump", stackName) return nil } drivePath := m.GetAppDrivePath(stackName) if drivePath == "" || !filepath.IsAbs(drivePath) { return fmt.Errorf("cannot determine absolute drive path for %s (systemDataPath not configured?)", stackName) } dumpDir := AppDBDumpPath(m.namespaceRoot(drivePath), stackName) m.logger.Printf("[INFO] [backup] Running pre-backup DB dump for %s (%d database(s)) → %s", stackName, len(stackDBs), dumpDir) for _, db := range stackDBs { result := DumpOne(ctx, db, dumpDir, m.logger, m.isDebug()) if result.Error != nil { return fmt.Errorf("DB dump failed for %s: %w", result.DB.ContainerName, result.Error) } m.logger.Printf("[INFO] [backup] Pre-backup DB dump OK: %s (%s)", result.DB.ContainerName, humanizeBytes(result.Size)) // Persist validation to settings if m.settings != nil && result.FilePath != "" { filename := filepath.Base(result.FilePath) cache := settings.DBValidationCache{ ValidatedAt: time.Now().Format(time.RFC3339), TableCount: result.Validation.TableCount, HasHeader: result.Validation.Valid, } if !result.Validation.Valid { cache.Error = result.Validation.Error } _ = m.settings.SetDBValidation(filename, cache) } } return nil } // listAllDumpFiles scans per-drive per-stack DB dump directories. func (m *Manager) listAllDumpFiles() []DumpFileInfo { var allFiles []DumpFileInfo for drive, stacks := range m.groupStacksByDrive() { for _, stack := range stacks { dumpDir := AppDBDumpPath(m.namespaceRoot(drive), stack.Name) if files, err := ListDumpFiles(dumpDir); err == nil { allFiles = append(allFiles, files...) } } } m.logger.Printf("[INFO] [backup] Found %d DB dump files across drives", len(allFiles)) return allFiles } // RefreshCache updates the cached full status. Called by scheduler every 5 minutes. func (m *Manager) RefreshCache(nextDBDump time.Time) { status := &FullBackupStatus{ Enabled: m.cfg.Backup.Enabled, DBDumpSchedule: m.cfg.Backup.DBDumpSchedule, NextDBDump: nextDBDump, } // Scan dump files from per-drive per-stack paths files := m.listAllDumpFiles() status.DumpFiles = files ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug()); err == nil { status.DiscoveredDBs = dbs } // Discover app data — all deployed stacks, backup is mandatory if m.stackProvider != nil { status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs) // Phase 2: keep each app's recovery unit current with its definition. Idempotent // (checksum-skip), so this periodic refresh only writes when the config actually changed, // and ensures units exist shortly after startup without waiting for the daily DB dump. m.captureAllRecoveryUnits() } // Fill in dynamic fields under lock. m.mu.Lock() status.Running = m.running status.LastDBDump = m.lastDBDump // Cross-check lastDBDump results inside lock to prevent torn writes. if m.lastDBDump != nil && len(files) > 0 { fileValidation := make(map[string]DumpValidation) // keyed by filename for _, f := range files { fileValidation[f.FileName] = f.Validation } for i, r := range m.lastDBDump.Results { if !r.Validation.Valid && r.Validation.Error == "" && r.FilePath != "" { filename := filepath.Base(r.FilePath) if fv, ok := fileValidation[filename]; ok { m.lastDBDump.Results[i].Validation = fv m.logger.Printf("[INFO] [backup] Re-validated %s from disk: valid=%v tables=%d", filename, fv.Valid, fv.TableCount) } } } } m.cachedStatus = status m.cacheTime = time.Now() m.mu.Unlock() m.logger.Printf("[INFO] [backup] Backup status cache refreshed") } // GetFullStatus returns the cached backup status for page rendering. // Returns instantly — no subprocess calls. // Returns a deep copy so callers can safely append to slice fields without // polluting the cache. func (m *Manager) GetFullStatus(nextDBDump time.Time) *FullBackupStatus { m.mu.Lock() defer m.mu.Unlock() if m.cachedStatus != nil { status := *m.cachedStatus status.AppDataInfo = make([]AppBackupInfo, len(m.cachedStatus.AppDataInfo)) copy(status.AppDataInfo, m.cachedStatus.AppDataInfo) // Update dynamic fields that don't need subprocess calls status.Running = m.running status.NextDBDump = nextDBDump // Deep-copy lastDBDump so callers cannot mutate shared state. if m.lastDBDump != nil { copyDump := *m.lastDBDump if len(m.lastDBDump.Results) > 0 { copyDump.Results = make([]DumpResult, len(m.lastDBDump.Results)) copy(copyDump.Results, m.lastDBDump.Results) } status.LastDBDump = ©Dump } // Synthesize LastDBDump from DumpFiles on disk if not in memory if status.LastDBDump == nil && len(status.DumpFiles) > 0 { var results []DumpResult var latestTime time.Time for _, f := range status.DumpFiles { results = append(results, DumpResult{ DB: DiscoveredDB{StackName: f.StackName, DBType: f.DBType, ContainerName: f.StackName}, FilePath: f.FileName, Size: f.Size, Validation: f.Validation, }) if f.ModTime.After(latestTime) { latestTime = f.ModTime } } status.LastDBDump = &DBDumpStatus{ LastRun: latestTime, Results: results, Success: true, } } return &status } // No cache yet — return a minimal status (first page load before cache is populated) status := &FullBackupStatus{ Enabled: m.cfg.Backup.Enabled, Running: m.running, DBDumpSchedule: m.cfg.Backup.DBDumpSchedule, NextDBDump: nextDBDump, } if m.lastDBDump != nil { copyDump := *m.lastDBDump if len(m.lastDBDump.Results) > 0 { copyDump.Results = make([]DumpResult, len(m.lastDBDump.Results)) copy(copyDump.Results, m.lastDBDump.Results) } status.LastDBDump = ©Dump } return status } // isDebug returns true if logging level is "debug". func (m *Manager) isDebug() bool { return m.cfg != nil && m.cfg.Logging.Level == "debug" } func dbNames(dbs []DiscoveredDB) string { var names []string for _, db := range dbs { names = append(names, fmt.Sprintf("%s(%s)", db.ContainerName, db.DBType)) } return strings.Join(names, ", ") }