Files
felhom-controller/controller/internal/backup/backup.go
T
admin 63484a0bd4 v0.51.0: offsite-backup UI (felhom-pbs DR) + Model-A double-nest fix
- Backups page: whole-guest backup shown as real DR — target label "Biztonsági szerver –
  külön hardver (PBS)"; app-data "Távoli mentés" card now reflects the PBS offsite tier
  (guestBackupView.Offsite) instead of "nincs beállítva".
- Model-A double-nest fix: appbackup path helpers take a felhom-data NAMESPACE ROOT (no
  internal felhom-data join); backup.Manager.namespaceRoot/AppNamespaceRoot resolve
  HDD-vs-systemDataPath provenance so a drive-resident app's backups land single-nested
  (<drive>/backups/... on the guest = <drive>/felhom-data/backups/... on the host) instead
  of .../felhom-data/felhom-data/.... Writes, deletion (GetStackBackupData/RemoveStack/
  ProtectedHDDPaths), wipe-warning scan, and export updated coherently; legacy double-nest
  dirs kept protected. New appbackup test asserts no doubled segment.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-12 20:26:52 +02:00

595 lines
18 KiB
Go

package backup
import (
"context"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
)
// Manager orchestrates app-data backups: database dumps and Docker-volume tars.
//
// Disk-tier backup (restic, cross-drive, drive-recovery, infra-backup) has been
// moved out of the controller into the host agent (slice 8C). This Manager now
// only owns the app-data domain.
//
// A Manager embeds a sync.Mutex and must therefore not be copied; NewManager
// returns a pointer and every method uses a pointer receiver.
type Manager struct {
	cfg      *config.Config     // controller configuration; the Paths, Backup and Logging sections are read in this file
	logger   *log.Logger        // destination for every log line emitted by the Manager
	settings *settings.Settings // persisted settings (drive state, DB validation cache); call sites nil-check it
	// stackProvider supplies per-stack data (HDD path, Docker volumes, stop/start).
	// It is assigned by SetStackProvider and is nil until then, so call sites nil-check it.
	stackProvider  StackDataProvider
	systemDataPath string // fallback drive for SSD-only apps

	mu         sync.Mutex    // held when lastDBDump, running, cachedStatus or cacheTime are read or written
	lastDBDump *DBDumpStatus // outcome of the most recent full dump run; nil until one has finished
	running    bool          // set by acquireRunning, cleared by releaseRunning

	// Cached status for page rendering (refreshed periodically)
	cachedStatus *FullBackupStatus
	cacheTime    time.Time // when cachedStatus was last rebuilt by RefreshCache
}
// FullBackupStatus contains everything the backup page needs.
// It is assembled by Manager.RefreshCache and handed out by Manager.GetFullStatus.
type FullBackupStatus struct {
	Enabled bool // copied from cfg.Backup.Enabled
	Running bool // value of the Manager's running flag when the status was produced

	// DB Dumps
	LastDBDump    *DBDumpStatus  // most recent dump run; nil when none is known
	DumpFiles     []DumpFileInfo // dump files found in the per-stack dump directories
	DiscoveredDBs []DiscoveredDB // databases returned by DiscoverDatabases; left empty if discovery failed

	// Schedule
	DBDumpSchedule string    // copied from cfg.Backup.DBDumpSchedule
	NextDBDump     time.Time // next scheduled dump, as supplied by the caller

	// App data backup
	AppDataInfo []AppBackupInfo // result of DiscoverAppData; only filled when a stack provider is set

	// Flash messages (set by handlers, passed through redirect)
	FlashSuccess string
	FlashError   string
}
// DBDumpStatus holds the last DB dump result.
type DBDumpStatus struct {
	LastRun  time.Time     // when the run finished; for a status synthesized from disk, the newest dump file's mod time
	Results  []DumpResult  // per-database outcomes; empty when no database containers were found
	Success  bool          // true when none of the recorded results carries an error
	Duration time.Duration // elapsed time of the run; left zero for a status synthesized from disk
}
// NewManager builds a Manager from the controller configuration, the settings
// store and a logger.
//
// cfg.Paths.SystemDataPath is captured as the fallback drive for apps that have
// no HDD path. When it is empty a warning is logged, because SSD-only apps then
// have no usable backup location. No stack provider is attached here; use
// SetStackProvider for that.
func NewManager(cfg *config.Config, sett *settings.Settings, logger *log.Logger) *Manager {
	systemPath := cfg.Paths.SystemDataPath
	if systemPath == "" {
		logger.Printf("[WARN] [backup] SystemDataPath is empty in config — SSD-only apps will not have correct backup paths")
	}

	mgr := new(Manager)
	mgr.cfg = cfg
	mgr.logger = logger
	mgr.settings = sett
	mgr.systemDataPath = systemPath
	return mgr
}
// GetAppDrivePath resolves the drive an app's backups belong to.
//
// The stack provider's HDD path wins when a provider is attached and reports a
// non-empty path for stackName. Otherwise the system data path is returned; if
// that fallback is empty too, an error is logged and "" is returned.
func (m *Manager) GetAppDrivePath(stackName string) string {
	var hdd string
	if m.stackProvider != nil {
		hdd = m.stackProvider.GetStackHDDPath(stackName)
	}
	if hdd != "" {
		return hdd
	}

	fallback := m.systemDataPath
	if fallback == "" {
		m.logger.Printf("[ERROR] [backup] systemDataPath is empty — cannot determine drive for %s", stackName)
	}
	return fallback
}
// namespaceRoot turns an app's drive path into its felhom-data namespace ROOT,
// i.e. the directory that directly holds backups/ and appdata/.
//
// Every path other than the system-data fallback is treated as drive-resident.
// Under Model A (slice 10) the agent binds <drive>/felhom-data onto the guest
// mountpoint, so such a path already IS the namespace and is used as-is; only
// the SSD-only system-data fallback gets the felhom-data subdirectory appended.
// That distinction is what keeps a drive-resident app's backups single-nested
// rather than .../felhom-data/felhom-data/... .
func (m *Manager) namespaceRoot(drivePath string) string {
	driveResident := drivePath != m.systemDataPath
	return NamespaceRoot(drivePath, driveResident)
}
// AppNamespaceRoot returns the felhom-data namespace root for a stack's
// keep-side backups, or "" when the stack's drive path cannot be determined.
//
// It resolves HDD-vs-system provenance internally, so callers outside this
// package that only know the stack name (e.g. the API router) do not have to
// and cannot accidentally double-nest the felhom-data segment.
func (m *Manager) AppNamespaceRoot(stackName string) string {
	if drive := m.GetAppDrivePath(stackName); drive != "" {
		return m.namespaceRoot(drive)
	}
	return ""
}
// groupStacksByDrive buckets every deployed stack under the drive path that
// GetAppDrivePath reports for it. It returns nil when no stack provider is set.
// With debug logging enabled, each bucket is logged with its stack names.
func (m *Manager) groupStacksByDrive() map[string][]StackSummary {
	provider := m.stackProvider
	if provider == nil {
		return nil
	}

	byDrive := map[string][]StackSummary{}
	for _, st := range provider.ListDeployedStacks() {
		key := m.GetAppDrivePath(st.Name)
		byDrive[key] = append(byDrive[key], st)
	}

	if !m.isDebug() {
		return byDrive
	}
	for drive, members := range byDrive {
		names := make([]string, 0, len(members))
		for _, member := range members {
			names = append(names, member.Name)
		}
		m.logger.Printf("[DEBUG] groupStacksByDrive: %s → [%s]", drive, strings.Join(names, ", "))
	}
	return byDrive
}
// RunDBDumps discovers and dumps all databases to per-drive, per-app paths.
//
// It takes the Manager's running flag for the duration of the run and releases
// it on return. If the flag is already held, the acquire error is returned and
// nothing is dumped.
func (m *Manager) RunDBDumps(ctx context.Context) error {
	err := m.acquireRunning()
	if err != nil {
		return err
	}
	defer m.releaseRunning()

	return m.runDBDumpsInternal(ctx)
}
// runDBDumpsInternal is the implementation of RunDBDumps. Caller must hold the running flag.
//
// It discovers every database container, dumps each one into the db-dump
// directory of its app's namespace root, stores the aggregate outcome in
// m.lastDBDump and caches the validation of each successful dump in settings.
//
// Databases whose drive is marked disconnected or decommissioned are skipped
// and do not count as failures. A database whose drive path cannot be resolved
// to an absolute path is recorded as a failed result (the same guard
// DumpStackDB applies) instead of being dumped relative to the working
// directory.
//
// It returns the discovery error if discovery fails, a generic error if at
// least one dump failed, and nil otherwise.
func (m *Manager) runDBDumpsInternal(ctx context.Context) error {
	start := time.Now()
	m.logger.Printf("[INFO] [backup] Starting database dump run")

	dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug())
	if err != nil {
		m.logger.Printf("[ERROR] [backup] Database discovery failed: %v", err)
		return err
	}
	if len(dbs) == 0 {
		m.logger.Printf("[INFO] [backup] No database containers found")
		m.mu.Lock()
		m.lastDBDump = &DBDumpStatus{
			LastRun:  time.Now(),
			Success:  true,
			Duration: time.Since(start),
		}
		m.mu.Unlock()
		return nil
	}
	m.logger.Printf("[INFO] [backup] Discovered %d database(s): %s", len(dbs), dbNames(dbs))

	// Dump each DB to its app's drive path
	var (
		results   []DumpResult
		totalSize int64
	)
	allOK := true
	for _, db := range dbs {
		drivePath := m.GetAppDrivePath(db.StackName)

		// Without an absolute drive path the dump directory would be resolved
		// relative to the process working directory. Record a failure instead.
		if drivePath == "" || !filepath.IsAbs(drivePath) {
			pathErr := fmt.Errorf("cannot determine absolute drive path for %s (systemDataPath not configured?)", db.StackName)
			m.logger.Printf("[ERROR] [backup] DB dump failed for %s: %v", db.ContainerName, pathErr)
			results = append(results, DumpResult{DB: db, Error: pathErr})
			allOK = false
			continue
		}

		// Skip if drive is disconnected or decommissioned
		if m.settings != nil && m.settings.IsDisconnected(drivePath) {
			m.logger.Printf("[WARN] [backup] Skipping DB dump for %s — drive disconnected: %s", db.StackName, drivePath)
			continue
		}
		if m.settings != nil && m.settings.IsDecommissioned(drivePath) {
			m.logger.Printf("[WARN] [backup] Skipping DB dump for %s — drive decommissioned: %s", db.StackName, drivePath)
			continue
		}

		dumpDir := AppDBDumpPath(m.namespaceRoot(drivePath), db.StackName)
		result := DumpOne(ctx, db, dumpDir, m.logger, m.isDebug())
		results = append(results, result)
		if result.Error != nil {
			allOK = false
			m.logger.Printf("[ERROR] [backup] DB dump failed for %s: %v", result.DB.ContainerName, result.Error)
			continue
		}
		totalSize += result.Size

		// Persist validation result to settings.json
		if m.settings == nil || result.FilePath == "" {
			continue
		}
		filename := filepath.Base(result.FilePath)
		cache := settings.DBValidationCache{
			ValidatedAt: time.Now().Format(time.RFC3339),
			TableCount:  result.Validation.TableCount,
			HasHeader:   result.Validation.Valid,
		}
		if !result.Validation.Valid {
			cache.Error = result.Validation.Error
		}
		// A cache write failure does not invalidate the dump itself, so it is only logged.
		if err := m.settings.SetDBValidation(filename, cache); err != nil {
			m.logger.Printf("[WARN] [backup] Failed to cache validation for %s: %v", filename, err)
		}
	}

	duration := time.Since(start)
	m.mu.Lock()
	m.lastDBDump = &DBDumpStatus{
		LastRun:  time.Now(),
		Results:  results,
		Success:  allOK,
		Duration: duration,
	}
	m.mu.Unlock()

	// Individual failures were already logged above; the caller gets one aggregate error.
	if !allOK {
		return fmt.Errorf("some database dumps failed")
	}
	m.logger.Printf("[INFO] [backup] DB dump completed: %d databases, %s total (%s)",
		len(results), humanizeBytes(totalSize), duration.Round(time.Millisecond))
	return nil
}
// DumpAppVolumes exports Docker named volumes to tar files for the given stack.
// Tars are written to AppVolumeDumpPath(m.namespaceRoot(drivePath), stackName)/<volume>.tar.
// Uses "docker run alpine tar" (same pattern as appexport).
//
// Each volume is first written to <volume>.tar.tmp and renamed onto the final
// name only after tar exits successfully, so a failed or timed-out dump leaves
// the previous good tar untouched instead of truncating and deleting it. Every
// volume gets its own 10-minute timeout. Afterwards, .tar files in the dump
// directory that do not belong to a current volume are removed.
//
// It returns nil when there is no stack provider or the stack has no volumes,
// an error when the drive path is empty or the dump directory cannot be
// created, and an error naming every volume whose dump failed otherwise.
func (m *Manager) DumpAppVolumes(stackName string) error {
	if m.stackProvider == nil {
		return nil
	}
	volumes := m.stackProvider.GetDockerVolumes(stackName)
	if len(volumes) == 0 {
		return nil
	}
	drivePath := m.GetAppDrivePath(stackName)
	if drivePath == "" {
		return fmt.Errorf("cannot determine drive path for %s", stackName)
	}
	dumpDir := AppVolumeDumpPath(m.namespaceRoot(drivePath), stackName)
	if err := os.MkdirAll(dumpDir, 0755); err != nil {
		return fmt.Errorf("creating volume dump dir: %w", err)
	}

	var dumpErrors []string
	for _, volName := range volumes {
		tarName := volName + ".tar"
		tmpName := tarName + ".tmp"
		tarPath := filepath.Join(dumpDir, tarName)
		tmpPath := filepath.Join(dumpDir, tmpName)
		if m.isDebug() {
			m.logger.Printf("[DEBUG] [backup] Dumping volume %s for %s", volName, stackName)
		}

		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		cmd := exec.CommandContext(ctx, "docker", "run", "--rm",
			"-v", volName+":/vol:ro",
			"-v", dumpDir+":/out",
			"alpine", "tar", "cf", "/out/"+tmpName, "-C", "/vol", ".")
		out, err := cmd.CombinedOutput()
		cancel() // released per iteration; a defer would pile up until return
		if err != nil {
			m.logger.Printf("[WARN] [backup] Volume dump failed for %s/%s: %s — %v",
				stackName, volName, strings.TrimSpace(string(out)), err)
			os.Remove(tmpPath) // best effort: drop the partial file, keep the previous tar
			dumpErrors = append(dumpErrors, volName)
			continue
		}

		// Publish the finished tar; the rename replaces the old tar in one step.
		if err := os.Rename(tmpPath, tarPath); err != nil {
			m.logger.Printf("[WARN] [backup] Volume dump failed for %s/%s: %s — %v",
				stackName, volName, "could not move finished tar into place", err)
			os.Remove(tmpPath)
			dumpErrors = append(dumpErrors, volName)
			continue
		}
		if info, _ := os.Stat(tarPath); info != nil {
			m.logger.Printf("[INFO] [backup] Volume dump: %s/%s → %s", stackName, volName, humanizeBytes(info.Size()))
		}
	}

	// Clean up tars for volumes that no longer exist
	entries, err := os.ReadDir(dumpDir)
	if err != nil {
		m.logger.Printf("[WARN] [backup] Cannot scan %s for stale volume dumps: %v", dumpDir, err)
	}
	activeVols := make(map[string]bool, len(volumes))
	for _, v := range volumes {
		activeVols[v+".tar"] = true
	}
	for _, e := range entries {
		name := e.Name()
		if activeVols[name] || !strings.HasSuffix(name, ".tar") {
			continue
		}
		if err := os.Remove(filepath.Join(dumpDir, name)); err != nil {
			m.logger.Printf("[WARN] [backup] Cannot remove stale volume dump %s/%s: %v", stackName, name, err)
			continue
		}
		if m.isDebug() {
			m.logger.Printf("[DEBUG] [backup] Removed stale volume dump: %s/%s", stackName, name)
		}
	}

	if len(dumpErrors) > 0 {
		return fmt.Errorf("volume dump failed for: %s", strings.Join(dumpErrors, ", "))
	}
	return nil
}
// DumpAppVolumesSafe stops the stack, dumps its volumes and starts it again.
// Stopping first prevents inconsistent tars of live database volumes
// (e.g. PostgreSQL).
//
// If the stack cannot be stopped (protected stacks reject StopStack) nothing is
// dumped and the stop error is returned wrapped; callers treat that as a
// warning. A restart is always attempted after the dump. A restart failure is
// logged and always reflected in the returned error, so callers know the app
// may be left stopped; otherwise the dump result is returned as-is.
func (m *Manager) DumpAppVolumesSafe(stackName string) error {
	if m.stackProvider == nil {
		return fmt.Errorf("no stack provider")
	}

	m.logger.Printf("[INFO] [backup] Stopping %s for safe volume dump", stackName)
	stopErr := m.stackProvider.StopStack(stackName)
	if stopErr != nil {
		return fmt.Errorf("could not stop %s for volume dump: %w", stackName, stopErr)
	}

	dumpErr := m.DumpAppVolumes(stackName)

	m.logger.Printf("[INFO] [backup] Restarting %s after volume dump", stackName)
	startErr := m.stackProvider.StartStack(stackName)
	if startErr == nil {
		return dumpErr
	}

	// Restart failed: report it no matter how the dump itself went.
	m.logger.Printf("[ERROR] [backup] Failed to restart %s after volume dump: %v", stackName, startErr)
	if dumpErr == nil {
		return fmt.Errorf("volume dump OK but restart failed for %s: %w", stackName, startErr)
	}
	return fmt.Errorf("volume dump failed for %s: %v; restart also failed: %v", stackName, dumpErr, startErr)
}
// GetStatus returns a snapshot of the last DB-dump status, or nil when no dump
// run has been recorded yet.
//
// The result is a copy, including its Results slice. RefreshCache rewrites
// entries of m.lastDBDump.Results while holding m.mu, so handing out the
// internal pointer would let callers read that data without the lock. It also
// means callers cannot modify the Manager's state through the returned value.
func (m *Manager) GetStatus() *DBDumpStatus {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.lastDBDump == nil {
		return nil
	}
	snapshot := *m.lastDBDump
	if len(m.lastDBDump.Results) > 0 {
		snapshot.Results = make([]DumpResult, len(m.lastDBDump.Results))
		copy(snapshot.Results, m.lastDBDump.Results)
	}
	return &snapshot
}
// IsRunning reports whether the Manager's running flag is currently set, i.e.
// whether some operation has acquired it via acquireRunning (RunDBDumps does)
// and not yet released it.
func (m *Manager) IsRunning() bool {
	m.mu.Lock()
	busy := m.running
	m.mu.Unlock()
	return busy
}
// acquireRunning claims the running flag. The check and the set happen under
// m.mu, so only one caller can win. It returns an error, leaving the flag
// untouched, when the flag is already set.
func (m *Manager) acquireRunning() error {
	m.mu.Lock()
	alreadyRunning := m.running
	if !alreadyRunning {
		m.running = true
	}
	m.mu.Unlock()

	if alreadyRunning {
		return fmt.Errorf("backup already in progress")
	}
	return nil
}
// releaseRunning resets the running flag under m.mu. It does not check whether
// the flag was set.
func (m *Manager) releaseRunning() {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.running = false
}
// SetStackProvider attaches the stack data provider used for app data
// discovery, HDD path lookup and volume dumps. The assignment is made while
// holding m.mu.
//
// NOTE(review): the readers of m.stackProvider in this file (for example
// GetAppDrivePath and GetStackHDDMounts) do not take m.mu, so the lock by itself
// does not order this write against them. Presumably the provider is set once
// during start-up before those readers run — confirm against the caller.
func (m *Manager) SetStackProvider(provider StackDataProvider) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.stackProvider = provider
}
// GetStackHDDMounts forwards to the stack provider's GetStackHDDMounts for the
// named stack. It returns nil when no stack provider has been set.
func (m *Manager) GetStackHDDMounts(name string) []string {
	if provider := m.stackProvider; provider != nil {
		return provider.GetStackHDDMounts(name)
	}
	return nil
}
// DumpStackDB runs a database dump for the containers belonging to one stack.
// Dumps go to the directory AppDBDumpPath returns for the namespace root of the
// stack's home drive.
//
// It returns nil without dumping anything when discovery finds no database for
// the stack. It returns an error when discovery fails, when the stack's drive
// path is empty or not absolute, or at the first database whose dump fails
// (remaining databases are then not attempted). The validation result of each
// successful dump is cached in settings; a failure to write that cache is
// logged as a warning and does not fail the dump.
func (m *Manager) DumpStackDB(ctx context.Context, stackName string) error {
	dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug())
	if err != nil {
		return fmt.Errorf("database discovery failed: %w", err)
	}

	var stackDBs []DiscoveredDB
	for _, db := range dbs {
		if db.StackName == stackName {
			stackDBs = append(stackDBs, db)
		}
	}
	if len(stackDBs) == 0 {
		// Debug-only, like every other [DEBUG] line emitted by the Manager.
		if m.isDebug() {
			m.logger.Printf("[DEBUG] No databases found for stack %s — skipping pre-backup dump", stackName)
		}
		return nil
	}

	drivePath := m.GetAppDrivePath(stackName)
	if drivePath == "" || !filepath.IsAbs(drivePath) {
		return fmt.Errorf("cannot determine absolute drive path for %s (systemDataPath not configured?)", stackName)
	}
	dumpDir := AppDBDumpPath(m.namespaceRoot(drivePath), stackName)
	m.logger.Printf("[INFO] [backup] Running pre-backup DB dump for %s (%d database(s)) → %s", stackName, len(stackDBs), dumpDir)

	for _, db := range stackDBs {
		result := DumpOne(ctx, db, dumpDir, m.logger, m.isDebug())
		if result.Error != nil {
			return fmt.Errorf("DB dump failed for %s: %w", result.DB.ContainerName, result.Error)
		}
		m.logger.Printf("[INFO] [backup] Pre-backup DB dump OK: %s (%s)", result.DB.ContainerName, humanizeBytes(result.Size))

		// Persist validation to settings
		if m.settings == nil || result.FilePath == "" {
			continue
		}
		filename := filepath.Base(result.FilePath)
		cache := settings.DBValidationCache{
			ValidatedAt: time.Now().Format(time.RFC3339),
			TableCount:  result.Validation.TableCount,
			HasHeader:   result.Validation.Valid,
		}
		if !result.Validation.Valid {
			cache.Error = result.Validation.Error
		}
		// The dump itself succeeded, so a cache write failure is reported but not returned.
		if err := m.settings.SetDBValidation(filename, cache); err != nil {
			m.logger.Printf("[WARN] [backup] Failed to cache validation for %s: %v", filename, err)
		}
	}
	return nil
}
// listAllDumpFiles collects the DB dump files of every deployed stack by
// listing each stack's dump directory on its drive. Directories that
// ListDumpFiles cannot list are skipped silently. The total count is logged.
func (m *Manager) listAllDumpFiles() []DumpFileInfo {
	var found []DumpFileInfo
	for drive, stacks := range m.groupStacksByDrive() {
		for i := range stacks {
			dir := AppDBDumpPath(m.namespaceRoot(drive), stacks[i].Name)
			files, err := ListDumpFiles(dir)
			if err != nil {
				continue
			}
			found = append(found, files...)
		}
	}
	m.logger.Printf("[INFO] [backup] Found %d DB dump files across drives", len(found))
	return found
}
// RefreshCache rebuilds the cached full status. Called by scheduler every 5 minutes.
//
// The slow parts run without the lock: scanning dump files, discovering
// databases (bounded to 10 seconds) and discovering app data. Only the final
// step holds m.mu. There the running flag and last dump status are read, the
// last dump's results are reconciled against the files on disk, and the new
// status replaces the cached one.
//
// Reconciliation: a result that is not valid, carries no validation error and
// has a file path takes over the validation of the on-disk file with the same
// base name, if one was found.
func (m *Manager) RefreshCache(nextDBDump time.Time) {
	// Scan dump files from per-drive per-stack paths
	files := m.listAllDumpFiles()

	status := &FullBackupStatus{
		Enabled:        m.cfg.Backup.Enabled,
		DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
		NextDBDump:     nextDBDump,
		DumpFiles:      files,
	}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug())
	if err == nil {
		status.DiscoveredDBs = dbs
	}

	// Discover app data — all deployed stacks, backup is mandatory
	if m.stackProvider != nil {
		status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs)
	}

	// Fill in dynamic fields under lock.
	m.mu.Lock()
	status.Running = m.running
	status.LastDBDump = m.lastDBDump

	// Cross-check lastDBDump results inside lock to prevent torn writes.
	if last := m.lastDBDump; last != nil && len(files) > 0 {
		onDisk := make(map[string]DumpValidation) // keyed by filename
		for _, f := range files {
			onDisk[f.FileName] = f.Validation
		}
		for i := range last.Results {
			res := &last.Results[i]
			if res.Validation.Valid || res.Validation.Error != "" || res.FilePath == "" {
				continue
			}
			filename := filepath.Base(res.FilePath)
			fv, ok := onDisk[filename]
			if !ok {
				continue
			}
			res.Validation = fv
			m.logger.Printf("[INFO] [backup] Re-validated %s from disk: valid=%v tables=%d",
				filename, fv.Valid, fv.TableCount)
		}
	}

	m.cachedStatus = status
	m.cacheTime = time.Now()
	m.mu.Unlock()

	m.logger.Printf("[INFO] [backup] Backup status cache refreshed")
}
// GetFullStatus returns the cached backup status for page rendering.
// Returns instantly — no subprocess calls.
//
// The result is detached from the cache: every slice field (AppDataInfo,
// DumpFiles, DiscoveredDBs) and the LastDBDump value with its Results are
// copied, so callers can append to or overwrite them without touching the
// cache or another caller's copy. Slice elements are copied by value.
//
// Running and NextDBDump are always taken from the live flag and the argument.
// When no dump status is held in memory but dump files exist on disk,
// LastDBDump is synthesized from those files. The newest modification time
// becomes LastRun and Success is set to true. Before the first RefreshCache a
// minimal status without file or discovery data is returned.
func (m *Manager) GetFullStatus(nextDBDump time.Time) *FullBackupStatus {
	m.mu.Lock()
	defer m.mu.Unlock()

	// cloneLastDump copies the in-memory dump status so callers cannot mutate shared state.
	cloneLastDump := func() *DBDumpStatus {
		if m.lastDBDump == nil {
			return nil
		}
		dup := *m.lastDBDump
		if len(m.lastDBDump.Results) > 0 {
			dup.Results = make([]DumpResult, len(m.lastDBDump.Results))
			copy(dup.Results, m.lastDBDump.Results)
		}
		return &dup
	}

	if m.cachedStatus == nil {
		// No cache yet — return a minimal status (first page load before cache is populated)
		return &FullBackupStatus{
			Enabled:        m.cfg.Backup.Enabled,
			Running:        m.running,
			DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
			NextDBDump:     nextDBDump,
			LastDBDump:     cloneLastDump(),
		}
	}

	status := *m.cachedStatus
	status.AppDataInfo = make([]AppBackupInfo, len(m.cachedStatus.AppDataInfo))
	copy(status.AppDataInfo, m.cachedStatus.AppDataInfo)
	// The struct copy above still shares these backing arrays with the cache,
	// so copy them too. Nil stays nil.
	if src := m.cachedStatus.DumpFiles; src != nil {
		status.DumpFiles = make([]DumpFileInfo, len(src))
		copy(status.DumpFiles, src)
	}
	if src := m.cachedStatus.DiscoveredDBs; src != nil {
		status.DiscoveredDBs = make([]DiscoveredDB, len(src))
		copy(status.DiscoveredDBs, src)
	}

	// Update dynamic fields that don't need subprocess calls
	status.Running = m.running
	status.NextDBDump = nextDBDump
	if dump := cloneLastDump(); dump != nil {
		status.LastDBDump = dump
	}

	// Synthesize LastDBDump from DumpFiles on disk if not in memory
	if status.LastDBDump == nil && len(status.DumpFiles) > 0 {
		results := make([]DumpResult, 0, len(status.DumpFiles))
		var latestTime time.Time
		for _, f := range status.DumpFiles {
			results = append(results, DumpResult{
				DB:         DiscoveredDB{StackName: f.StackName, DBType: f.DBType, ContainerName: f.StackName},
				FilePath:   f.FileName,
				Size:       f.Size,
				Validation: f.Validation,
			})
			if f.ModTime.After(latestTime) {
				latestTime = f.ModTime
			}
		}
		status.LastDBDump = &DBDumpStatus{
			LastRun: latestTime,
			Results: results,
			Success: true,
		}
	}
	return &status
}
// isDebug reports whether the configured logging level is exactly "debug".
func (m *Manager) isDebug() bool {
	level := m.cfg.Logging.Level
	return level == "debug"
}
// dbNames renders the discovered databases as a comma-separated list of
// "container(type)" entries for log output. An empty input yields "".
func dbNames(dbs []DiscoveredDB) string {
	var b strings.Builder
	for i, db := range dbs {
		if i > 0 {
			b.WriteString(", ")
		}
		fmt.Fprintf(&b, "%s(%s)", db.ContainerName, db.DBType)
	}
	return b.String()
}