From fb11c3b75a2e42033c66e4cd7b718601784342f9 Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Sat, 28 Feb 2026 08:56:48 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20backup=20safety=20=E2=80=94=20stop-befo?= =?UTF-8?q?re-dump,=20streaming=20restore,=20health=20check,=20per-app=20r?= =?UTF-8?q?estic,=20infra=20configs=20(v0.34.0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 13 +++++ controller/README.md | 16 +++--- controller/cmd/controller/main.go | 9 ++++ controller/internal/backup/appdata.go | 1 + controller/internal/backup/backup.go | 62 ++++++++++++++++------ controller/internal/backup/crossdrive.go | 3 +- controller/internal/backup/restore.go | 41 +++++++++++++- controller/internal/report/infra_backup.go | 35 +++++++++--- 8 files changed, 147 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60d41e7..16ef755 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ ## Changelog +### v0.34.0 — Backup safety: stop-before-dump, streaming restore, health check, per-app restic, infra configs (2026-02-28) + +#### Changed +- **backup/backup.go**: `DumpAppVolumesSafe()` stops stack before volume dump, restarts after — prevents inconsistent tars of live database volumes (PostgreSQL, MariaDB, SQLite) +- **backup/backup.go**: `backupDrive()` includes per-app stack config dirs instead of full StacksDir; `controller.yaml` only on system drive — reduces snapshot duplication across drives +- **backup/crossdrive.go**: `VolumeDumper` interface extended with `DumpAppVolumesSafe()`; cross-drive backup uses safe variant for pre-backup volume dumps +- **backup/restore.go**: Tier 2 DB dump copy uses streaming `copyFile()` (io.Copy + atomic rename) instead of `os.ReadFile`/`os.WriteFile` — eliminates full-file memory allocation for large dumps +- **backup/restore.go**: Post-restore health check via `waitForHealthy()` polls container state (with docker ps 
refresh) for up to 90s after restore + +#### Added +- **backup/appdata.go**: `RefreshAndIsRunning()` on `StackDataProvider` interface for reliable post-restore state checks (forces docker ps refresh before reading state) +- **report/infra_backup.go**: `InfraStack` now includes `DockerComposeB64`, `AppYamlB64`, `FelhomYamlB64` — actual stack config files for disaster recovery (derived from `GetStackComposePath`, no signature change) + ### v0.33.0 — Docker volume backup + Tier 2 restore + restore dropdown fixes (2026-02-27) #### Added diff --git a/controller/README.md b/controller/README.md index 24d965d..79321f9 100644 --- a/controller/README.md +++ b/controller/README.md @@ -336,6 +336,7 @@ Path computation is centralized in `backup/paths.go` via the `FelhomDataDir = "f **Phase 1b — Docker Volume Dumps** (`internal/backup/backup.go`, runs after DB dumps) - Iterates all deployed stacks that have Docker named volumes (`GetDockerVolumes()`) +- **v0.34.0:** Each stack is stopped before dump, restarted after (`DumpAppVolumesSafe()`) — prevents inconsistent tars of live databases. Protected stacks (traefik, etc.) that reject StopStack are skipped with a warning. 
- For each volume: `docker run --rm -v :/vol:ro -v :/out alpine tar cf /out/.tar -C /vol .` - 10-minute timeout per volume; warnings on failure (non-fatal) - Stale tars cleaned up (volumes that no longer exist) @@ -347,12 +348,12 @@ Path computation is centralized in `backup/paths.go` via the `FelhomDataDir = "f - Apps are **grouped by drive** via `groupStacksByDrive()` — each drive's apps are backed up to that drive's restic repo - App drive resolution: `GetStackHDDPath()` (from `StackDataProvider`) → falls back to `SystemDataPath` - Auto-generated repository password (32 random bytes, base64url), shared across all repos, synced to hub -- **Paths included in every per-drive snapshot:** +- **Paths included in each per-drive snapshot (v0.34.0: per-app scoped):** - Per-app DB dump dirs on that drive - Per-app Docker volume dump dirs (`volume-dumps/*.tar`) - Per-app HDD mount paths (user data) - - Stacks dir (compose.yml + app.yaml + .felhom.yml for all apps) - - `controller.yaml` (controller config) + - Per-app stack config dir (`<StacksDir>/<stack-name>/` — only for stacks on this drive) + - `controller.yaml` — only on the system drive (not duplicated across all drives) - Auto-detects and unlocks stale locks (restic repo lock) - Weekly prune on Sundays with configurable retention (keep-daily, keep-weekly, keep-monthly) - Weekly integrity check (`restic check`) on Sunday 04:00 — checks **all** primary repos @@ -377,7 +378,7 @@ data back up config + DB + user data + Docker volumes; apps without HDD back up - **restic** — Versioned, deduplicated, encrypted (shared repo across apps, not browsable) - Per-app configuration in settings.json: destination path, method, schedule (daily/weekly/manual) - **Pre-backup DB dump:** `DumpStackDB()` runs fresh pg_dump/mariadb-dump before each cross-drive backup; non-fatal on failure (wired via `DBDumper` interface to avoid circular imports) -- **Pre-backup volume dump (v0.33.0):** `DumpAppVolumes()` exports Docker named volumes to tar before each 
cross-drive backup (wired via `VolumeDumper` interface) +- **Pre-backup volume dump (v0.33.0, safe stop/start v0.34.0):** `DumpAppVolumesSafe()` stops the stack, exports Docker named volumes to tar, restarts — wired via `VolumeDumper` interface - **Empty mounts allowed:** `RunAppBackup` accepts apps with no HDD mounts — the rsync mount loop simply doesn't execute, but DB + config copy still runs - **Drive-type-aware validation** (`ValidateDestination`): @@ -440,16 +441,17 @@ appear in the restore dropdown with per-app snapshot filtering. - Config only: "Csak konfiguracio visszaallitasa" **Tier 1 restore** (`RestoreApp`): -- Stop app → resolve app's home drive → `restic restore --target / --include ...` → populate Docker volumes from restored tars → restart app +- Stop app → resolve app's home drive → `restic restore --target / --include ...` → populate Docker volumes from restored tars → restart app → health check - Restore paths: config dir, DB dump dir, volume dump dir, HDD mounts - Docker volumes restored via `restoreDockerVolumes()`: `docker volume rm -f` → `docker volume create` → `docker run alpine tar xf` **Tier 2 restore** (`RestoreAppFromTier2`): -- Stop app → rsync config from `_config/` → rsync HDD data (single/multi-mount) → copy DB dumps from `_db/` → restore Docker volumes from `_volumes/` tars → restart app +- Stop app → rsync config from `_config/` → rsync HDD data (single/multi-mount) → copy DB dumps from `_db/` (streaming `copyFile`) → restore Docker volumes from `_volumes/` tars → restart app → health check - Uses rsync `--delete` for config and HDD data to ensure exact mirror state - Single-mount apps: data directly in rsync dir (excluding `_*`); multi-mount: per-leaf subdirectories **Common:** +- **v0.34.0:** Post-restore health check (`waitForHealthy`) polls container state with `docker ps` refresh every 5s for up to 90s. Warning logged if app doesn't reach running state; restore still returns success (data is restored regardless). 
- Running flag prevents concurrent backup/restore operations - Snapshot ID validated (8-64 lowercase hex, or special `tier2-rsync`) - Import from `.fab` bundle link shown in restore section for cross-system migration @@ -970,7 +972,7 @@ After each backup cycle (including manual Tier 2 triggers via `OnCrossDriveCompl - `controller.yaml` (base64-encoded, full config including secrets) - `settings.json` (base64-encoded, backup prefs, storage paths, cross-drive configs) - Disk layout (UUIDs, labels, mount points, fstab options, bind-mount topology) -- Deployed stacks manifest (app names, HDD paths) +- Deployed stacks manifest (app names, HDD paths) with actual config files: `docker-compose.yml`, `app.yaml`, `.felhom.yml` (base64-encoded per stack, v0.34.0) - Restic passwords (primary + cross-drive, base64-encoded) This enables fully automated recovery when the system drive is replaced — the new controller pulls the snapshot from the Hub, auto-mounts surviving drives by UUID, and restores all applications. diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index ea73549..6432c14 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -955,6 +955,15 @@ func (a *stackAdapter) GetStackHDDPath(name string) string { return "" } +// RefreshAndIsRunning forces a docker ps scan before checking state. +// Called during post-restore health check (~every 5s for up to 90s). +// Full refresh is acceptable here since restores are rare operations. +func (a *stackAdapter) RefreshAndIsRunning(name string) bool { + a.mgr.RefreshStatus() + s, ok := a.mgr.GetStack(name) + return ok && s.State == stacks.StateRunning +} + // integrationStackAdapter implements integrations.StackProvider using stacks.Manager. 
type integrationStackAdapter struct { mgr *stacks.Manager diff --git a/controller/internal/backup/appdata.go b/controller/internal/backup/appdata.go index aee95a7..24bb590 100644 --- a/controller/internal/backup/appdata.go +++ b/controller/internal/backup/appdata.go @@ -22,6 +22,7 @@ type StackDataProvider interface { GetDockerVolumes(name string) []string // full Docker volume names (project-prefixed) StopStack(name string) error StartStack(name string) error + RefreshAndIsRunning(name string) bool } // StackSummary holds minimal stack info needed for app data discovery. diff --git a/controller/internal/backup/backup.go b/controller/internal/backup/backup.go index 9b38891..b7040ee 100644 --- a/controller/internal/backup/backup.go +++ b/controller/internal/backup/backup.go @@ -366,12 +366,6 @@ func (m *Manager) runBackupInternal(ctx context.Context) error { return nil } - // Infrastructure paths included in every drive's primary repo - infraPaths := []string{ - m.cfg.Paths.StacksDir, - "/opt/docker/felhom-controller/controller.yaml", - } - var lastResult *SnapshotResult var anyErr error driveCount := 0 @@ -380,7 +374,7 @@ func (m *Manager) runBackupInternal(ctx context.Context) error { if m.isDebug() { m.logger.Printf("[DEBUG] runBackupInternal: processing drive %s (%d stacks)", drivePath, len(stacks)) } - result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths) + result, err := m.backupDrive(ctx, drivePath, stacks) if err != nil { anyErr = err continue @@ -452,7 +446,7 @@ func (m *Manager) runBackupInternal(ctx context.Context) error { // backupDrive runs restic backup for a single drive. Returns nil result if skipped. // Caller must hold the running flag. 
-func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary, infraPaths []string) (*SnapshotResult, error) { +func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary) (*SnapshotResult, error) { // Skip disconnected or decommissioned drives if m.settings != nil && m.settings.IsDisconnected(drivePath) { m.logger.Printf("[WARN] [backup] Skipping backup for drive %s — disconnected", drivePath) @@ -473,7 +467,11 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St // Build paths for this drive var paths []string - paths = append(paths, infraPaths...) + + // Include controller.yaml only on the system drive + if drivePath == m.systemDataPath { + paths = append(paths, "/opt/docker/felhom-controller/controller.yaml") + } for _, stack := range stacks { // App data (appdata//) @@ -499,6 +497,11 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St if _, err := os.Stat(volDumpDir); err == nil { paths = append(paths, volDumpDir) } + // Stack config dir (docker-compose.yml, app.yaml, .felhom.yml) + stackDir := filepath.Join(m.cfg.Paths.StacksDir, stack.Name) + if _, err := os.Stat(stackDir); err == nil { + paths = append(paths, stackDir) + } } // Deduplicate paths @@ -558,12 +561,7 @@ func (m *Manager) TryRunDriveBackup(ctx context.Context, drivePath string) error return nil } - infraPaths := []string{ - m.cfg.Paths.StacksDir, - "/opt/docker/felhom-controller/controller.yaml", - } - - result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths) + result, err := m.backupDrive(ctx, drivePath, stacks) if err != nil { return err } @@ -702,7 +700,39 @@ func (m *Manager) DumpAppVolumes(stackName string) error { return nil } +// DumpAppVolumesSafe stops the stack before dumping volumes and restarts after. +// Prevents inconsistent tars of live database volumes (e.g. PostgreSQL). 
+// Protected stacks that reject StopStack will return an error — callers handle as warning. +func (m *Manager) DumpAppVolumesSafe(stackName string) error { + if m.stackProvider == nil { + return fmt.Errorf("no stack provider") + } + + m.logger.Printf("[INFO] [backup] Stopping %s for safe volume dump", stackName) + if err := m.stackProvider.StopStack(stackName); err != nil { + return fmt.Errorf("could not stop %s for volume dump: %w", stackName, err) + } + + dumpErr := m.DumpAppVolumes(stackName) + + m.logger.Printf("[INFO] [backup] Restarting %s after volume dump", stackName) + startErr := m.stackProvider.StartStack(stackName) + if startErr != nil { + m.logger.Printf("[ERROR] [backup] Failed to restart %s after volume dump: %v", stackName, startErr) + } + + // Surface both errors — callers must know if the app is left stopped + if dumpErr != nil && startErr != nil { + return fmt.Errorf("volume dump failed for %s: %v; restart also failed: %v", stackName, dumpErr, startErr) + } + if startErr != nil { + return fmt.Errorf("volume dump OK but restart failed for %s: %w", stackName, startErr) + } + return dumpErr +} + // runVolumeDumpsInternal dumps Docker named volumes for all deployed apps. +// Stops each stack before dumping for data consistency, restarts after. 
// VolumeDumper exports the Docker named volumes of a single stack. The
// cross-drive runner uses it to trigger a pre-backup volume dump.
type VolumeDumper interface {
	// DumpAppVolumes dumps the named volumes of stackName.
	DumpAppVolumes(stackName string) error

	// DumpAppVolumesSafe stops the stack before the dump and restarts it
	// afterwards.
	DumpAppVolumesSafe(stackName string) error
}
@@ -144,7 +145,7 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e if r.debug { r.logger.Printf("[DEBUG] RunAppBackup: triggering pre-backup volume dump for %s", stackName) } - if err := r.volDumper.DumpAppVolumes(stackName); err != nil { + if err := r.volDumper.DumpAppVolumesSafe(stackName); err != nil { r.logger.Printf("[WARN] [backup] Pre-backup volume dump failed for %s: %v — proceeding with backup", stackName, err) } } diff --git a/controller/internal/backup/restore.go b/controller/internal/backup/restore.go index 244a51d..78b3238 100644 --- a/controller/internal/backup/restore.go +++ b/controller/internal/backup/restore.go @@ -138,6 +138,11 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error { m.logger.Printf("[WARN] RESTORE could not restart %s after restore: %v", stackName, err) } + // Verify app started successfully + if err := m.waitForHealthy(stackName, 90*time.Second); err != nil { + m.logger.Printf("[WARN] [backup] Restore completed but app health check failed: %v", err) + } + hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0 restoreType := "config+DB" if hasHDD || hasVolumes { @@ -267,8 +272,8 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error { if !e.IsDir() { src := filepath.Join(dbSrc, e.Name()) dst := filepath.Join(dbDst, e.Name()) - if data, err := os.ReadFile(src); err == nil { - os.WriteFile(dst, data, 0644) + if err := copyFile(src, dst); err != nil { + m.logger.Printf("[WARN] [backup] Failed to copy DB dump %s: %v", e.Name(), err) } } } @@ -291,6 +296,11 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error { m.logger.Printf("[WARN] RESTORE could not restart %s after Tier 2 restore: %v", stackName, err) } + // Verify app started successfully + if err := m.waitForHealthy(stackName, 90*time.Second); err != nil { + m.logger.Printf("[WARN] [backup] Tier 2 restore completed but app health check failed: %v", err) + } + hasVolumes := 
len(m.stackProvider.GetDockerVolumes(stackName)) > 0 restoreType := "config+DB" if hasHDD || hasVolumes { @@ -403,3 +413,30 @@ func (m *Manager) restoreDockerVolumes(stackName, drivePath string) error { } return nil } + +// waitForHealthy waits for a stack to reach running state after restore. +// Forces a docker ps refresh on each poll to avoid stale state. +// Acceptable overhead for a rare operation (restore). +func (m *Manager) waitForHealthy(stackName string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + interval := 5 * time.Second + + time.Sleep(3 * time.Second) // initial settling time + + for time.Now().Before(deadline) { + if m.stackProvider == nil { + return fmt.Errorf("no stack provider") + } + if m.stackProvider.RefreshAndIsRunning(stackName) { + if m.isDebug() { + m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s is running", stackName) + } + return nil + } + if m.isDebug() { + m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s not yet running, waiting...", stackName) + } + time.Sleep(interval) + } + return fmt.Errorf("stack %s did not reach running state within %s after restore", stackName, timeout) +} diff --git a/controller/internal/report/infra_backup.go b/controller/internal/report/infra_backup.go index 4066d52..dfd350c 100644 --- a/controller/internal/report/infra_backup.go +++ b/controller/internal/report/infra_backup.go @@ -5,6 +5,7 @@ import ( "fmt" "log" "os" + "path/filepath" "time" "gitea.dooplex.hu/admin/felhom-controller/internal/backup" @@ -30,11 +31,18 @@ type InfraBackup struct { } // InfraStack identifies a deployed app for disaster recovery. +// Note: AppYamlB64 contains encrypted secrets (ENC:... values). +// The encryption key is also in this backup (EncryptionKeyB64). +// This is intentional — the infra backup must be self-contained for DR. +// Physical security of the backup media protects both. 
// InfraStack describes one deployed app inside the infrastructure backup and
// carries the stack's config files so the app can be recreated during
// disaster recovery.
//
// Security note: AppYamlB64 contains encrypted secrets (ENC:... values), and
// the matching encryption key travels in the same backup (EncryptionKeyB64).
// This is intentional — the infra backup must be self-contained for DR —
// so physical security of the backup media protects both.
type InfraStack struct {
	// Name is the stack's name.
	Name string `json:"name"`

	// DisplayName is the stack's display name.
	DisplayName string `json:"display_name"`

	// HDDPath is the stack's HDD path as reported by the stack provider;
	// omitted from JSON when empty.
	HDDPath string `json:"hdd_path,omitempty"`

	// NeedsHDD mirrors the stack's NeedsHDD flag.
	NeedsHDD bool `json:"needs_hdd"`

	// DockerComposeB64 is the base64-encoded docker-compose.yml; omitted from
	// JSON when empty (e.g. the file could not be read).
	DockerComposeB64 string `json:"docker_compose_b64,omitempty"`

	// AppYamlB64 is the base64-encoded app.yaml; omitted from JSON when empty.
	AppYamlB64 string `json:"app_yaml_b64,omitempty"`

	// FelhomYamlB64 is the base64-encoded .felhom.yml; omitted from JSON when
	// empty.
	FelhomYamlB64 string `json:"felhom_yaml_b64,omitempty"`
}