From eefeeabea394f1b0c1abe34c183484c6ab3f27dc Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Sat, 13 Jun 2026 10:27:35 +0200 Subject: [PATCH] v0.53.1: refresh recovery units on periodic cache cycle (idempotent) CaptureRecoveryUnit now builds content in memory and skips writes when the unit is already current (checksum + dump-set + version), so it can run from RefreshCache (startup + every 5m) without thrashing the USB drive. Units now exist shortly after startup and track config changes without waiting for the daily DB dump. +idempotency test. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 9 ++ controller/internal/backup/backup.go | 4 + controller/internal/backup/recovery_unit.go | 135 ++++++++++++------ .../internal/backup/recovery_unit_test.go | 46 ++++++ 4 files changed, 151 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c63101..50bf84c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ ## Changelog +### v0.53.1 — Phase 2: recovery units refresh on the periodic cache cycle (idempotent) (2026-06-13) + +The recovery-unit capture now also runs from `RefreshCache` (controller startup + every 5m), not only +the daily DB dump — so a unit exists shortly after startup and stays current with config changes +(redeploy / optional-config) without a 24h wait. `CaptureRecoveryUnit` builds the captured content in +memory and **skips all writes when the unit is already current** (same config checksums + dump set + +controller version), so the periodic refresh does not thrash a spinning USB drive. Added an idempotency +test (unchanged → skip; config change → rewrite). + ### v0.53.0 — Phase 2: per-app self-contained recovery unit (capture side, SECRET-FREE) (2026-06-13) Each app's on-drive backup becomes a complete, recreatable **recovery unit** — not just DB dumps + diff --git a/controller/internal/backup/backup.go b/controller/internal/backup/backup.go index 96dd707..6f58c03 100644 --- a/controller/internal/backup/backup.go +++ b/controller/internal/backup/backup.go @@ -487,6 +487,10 @@ func (m *Manager) RefreshCache(nextDBDump time.Time) { // Discover app data — all deployed stacks, backup is mandatory if m.stackProvider != nil { status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs) + // Phase 2: keep each app's recovery unit current with its definition. Idempotent + // (checksum-skip), so this periodic refresh only writes when the config actually changed, + // and ensures units exist shortly after startup without waiting for the daily DB dump. + m.captureAllRecoveryUnits() } // Fill in dynamic fields under lock. diff --git a/controller/internal/backup/recovery_unit.go b/controller/internal/backup/recovery_unit.go index 0fd772c..572ee0f 100644 --- a/controller/internal/backup/recovery_unit.go +++ b/controller/internal/backup/recovery_unit.go @@ -56,6 +56,10 @@ func (m *Manager) SetVersion(v string) { // CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the // compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps // already present, and writes manifest.json. It NEVER writes a secret value or the Docker image. +// +// Idempotent: it builds the captured content in memory first and SKIPS all writes when the unit is +// already current (same config checksums, same dump set, same controller version) — so it can run on +// the periodic status refresh without thrashing a spinning USB drive. func (m *Manager) CaptureRecoveryUnit(stackName string) error { if m.stackProvider == nil { return fmt.Errorf("no stack provider") @@ -69,41 +73,60 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error { return fmt.Errorf("cannot determine absolute drive path for %s", stackName) } nsRoot := m.namespaceRoot(drivePath) + + // Build the captured config CONTENT in memory (no writes yet) so we can checksum-compare. + type capFile struct { + name string + data []byte + perm os.FileMode + } + var files []capFile + checksums := make(map[string]string) + var configFiles []string + for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} { + data, err := os.ReadFile(filepath.Join(info.StackDir, fname)) + if err != nil { + continue // optional — capture whichever exist + } + files = append(files, capFile{fname, data, 0644}) + checksums[fname] = sha256Hex(data) + configFiles = append(configFiles, fname) + } + appYaml := buildStrippedAppYaml(info) + files = append(files, capFile{"app.yaml", appYaml, 0600}) + checksums["app.yaml"] = sha256Hex(appYaml) + configFiles = append(configFiles, "app.yaml") + + dbDumps := listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql") + volDumps := listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar") + version := m.versionLocked() + + manifestPath := RecoveryUnitManifestPath(nsRoot, stackName) + + // Skip if the unit is already current — avoids needless drive writes on the periodic refresh. + if cur := readManifest(manifestPath); cur != nil && + cur.ControllerVer == version && + stringMapEqual(cur.Checksums, checksums) && + stringSliceEqual(cur.DBDumps, dbDumps) && + stringSliceEqual(cur.VolumeDumps, volDumps) { + return nil + } + composeDir := RecoveryUnitComposePath(nsRoot, stackName) if err := os.MkdirAll(composeDir, 0755); err != nil { return fmt.Errorf("creating recovery-unit compose dir: %w", err) } - - checksums := make(map[string]string) - var configFiles []string - - // Capture docker-compose.yml + .felhom.yml verbatim (whichever exist). - for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} { - src := filepath.Join(info.StackDir, fname) - if _, err := os.Stat(src); err != nil { - continue + for _, f := range files { + if err := atomicWrite(filepath.Join(composeDir, f.name), f.data, f.perm); err != nil { + return fmt.Errorf("capturing %s: %w", f.name, err) } - sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname)) - if err != nil { - return fmt.Errorf("capturing %s: %w", fname, err) - } - checksums[fname] = sum - configFiles = append(configFiles, fname) } - // Write the SECRET-STRIPPED app.yaml (non-secret env only). - sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info) - if err != nil { - return fmt.Errorf("writing stripped app.yaml: %w", err) - } - checksums["app.yaml"] = sum - configFiles = append(configFiles, "app.yaml") - manifest := &RecoveryManifest{ SchemaVersion: 1, AppName: stackName, DisplayName: info.DisplayName, - ControllerVer: m.versionLocked(), + ControllerVer: version, CreatedAt: time.Now().UTC().Format(time.RFC3339), Drive: drivePath, NamespaceRoot: nsRoot, @@ -112,11 +135,11 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error { DataKeyEnvVars: info.DataKeyEnvVars, SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit", ConfigFiles: configFiles, - DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"), - VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"), + DBDumps: dbDumps, + VolumeDumps: volDumps, Checksums: checksums, } - if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil { + if err := writeManifest(manifestPath, manifest); err != nil { return fmt.Errorf("writing manifest: %w", err) } @@ -154,11 +177,13 @@ type strippedAppYaml struct { Env map[string]string `yaml:"env"` } -// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256. -func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) { +// buildStrippedAppYaml renders a secret-free app.yaml (non-secret env only) as bytes. Deterministic: +// yaml.v3 sorts map keys and the secret-name list comes in stable metadata order, so identical input +// yields identical bytes (needed for the checksum-skip guard). +func buildStrippedAppYaml(info RecoveryInfo) []byte { body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv}) if err != nil { - return "", err + body = []byte("deployed: true\nenv: {}\n") } header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" + "# Secret/data-key values are intentionally omitted; recover them at restore from the\n" + @@ -166,12 +191,7 @@ func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) { if len(info.SecretEnvVars) > 0 { header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n" } - content := []byte(header + string(body)) - if err := atomicWrite(dst, content, 0600); err != nil { - return "", err - } - sum := sha256.Sum256(content) - return hex.EncodeToString(sum[:]), nil + return []byte(header + string(body)) } // writeManifest writes the manifest JSON atomically. @@ -183,17 +203,46 @@ func writeManifest(dst string, manifest *RecoveryManifest) error { return atomicWrite(dst, append(data, '\n'), 0644) } -// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes. -func copyFileChecksum(src, dst string) (string, error) { - data, err := os.ReadFile(src) +// readManifest reads an existing recovery-unit manifest (nil if absent or unparseable). +func readManifest(path string) *RecoveryManifest { + data, err := os.ReadFile(path) if err != nil { - return "", err + return nil } - if err := atomicWrite(dst, data, 0644); err != nil { - return "", err + var m RecoveryManifest + if json.Unmarshal(data, &m) != nil { + return nil } + return &m +} + +func sha256Hex(data []byte) string { sum := sha256.Sum256(data) - return hex.EncodeToString(sum[:]), nil + return hex.EncodeToString(sum[:]) +} + +func stringMapEqual(a, b map[string]string) bool { + if len(a) != len(b) { + return false + } + for k, v := range a { + if b[k] != v { + return false + } + } + return true +} + +func stringSliceEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true } // listFileNames returns the names of files with the given suffix in dir (sorted, none if absent). diff --git a/controller/internal/backup/recovery_unit_test.go b/controller/internal/backup/recovery_unit_test.go index dfc1aa6..f4a30a3 100644 --- a/controller/internal/backup/recovery_unit_test.go +++ b/controller/internal/backup/recovery_unit_test.go @@ -126,6 +126,52 @@ func TestCaptureRecoveryUnitIsSecretFree(t *testing.T) { }) } +// TestCaptureRecoveryUnitIdempotent proves the checksum-skip guard: a second capture with unchanged +// config does NOT rewrite the manifest (CreatedAt stable), but a config change DOES. +func TestCaptureRecoveryUnitIdempotent(t *testing.T) { + tmp := t.TempDir() + stackDir := filepath.Join(tmp, "stack") + drive := filepath.Join(tmp, "drive") + mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:1\n") + mustWrite(t, filepath.Join(AppDBDumpPath(drive, "ex"), "ex.sql"), "d") + + info := RecoveryInfo{StackDir: stackDir, DisplayName: "Ex", ImagePins: []string{"ex/app:1"}, + NonSecretEnv: map[string]string{"SUBDOMAIN": "ex"}} + m := &Manager{logger: log.New(io.Discard, "", 0), systemDataPath: filepath.Join(tmp, "sys"), + stackProvider: &fakeRecoveryProvider{info: info, hdd: drive}, version: "v1"} + + manifestPath := RecoveryUnitManifestPath(drive, "ex") + if err := m.CaptureRecoveryUnit("ex"); err != nil { + t.Fatal(err) + } + first := readManifest(manifestPath) + if first == nil { + t.Fatal("manifest not written") + } + + // Second capture, unchanged → skipped (manifest byte-identical incl. CreatedAt). + if err := m.CaptureRecoveryUnit("ex"); err != nil { + t.Fatal(err) + } + if again := readManifest(manifestPath); again.CreatedAt != first.CreatedAt { + t.Errorf("idempotent capture rewrote manifest: %q -> %q", first.CreatedAt, again.CreatedAt) + } + + // Change the compose → must rewrite (config checksum differs). + mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:2\n") + m.stackProvider.(*fakeRecoveryProvider).info.ImagePins = []string{"ex/app:2"} + if err := m.CaptureRecoveryUnit("ex"); err != nil { + t.Fatal(err) + } + changed := readManifest(manifestPath) + if len(changed.ImagePins) != 1 || changed.ImagePins[0] != "ex/app:2" { + t.Errorf("config change not captured: %v", changed.ImagePins) + } + if changed.Checksums["docker-compose.yml"] == first.Checksums["docker-compose.yml"] { + t.Errorf("compose checksum should change after edit") + } +} + func mustWrite(t *testing.T, path, content string) { t.Helper() if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {