v0.53.1: refresh recovery units on periodic cache cycle (idempotent)

CaptureRecoveryUnit now builds content in memory and skips writes when the unit
is already current (checksum + dump-set + version), so it can run from RefreshCache
(startup + every 5m) without thrashing the USB drive. Units now exist shortly after
startup and track config changes without waiting for the daily DB dump. Also adds an idempotency test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-13 10:27:35 +02:00
parent 70eb521cd0
commit eefeeabea3
4 changed files with 151 additions and 43 deletions
+9
View File
@@ -1,5 +1,14 @@
## Changelog
### v0.53.1 — Phase 2: recovery units refresh on the periodic cache cycle (idempotent) (2026-06-13)
The recovery-unit capture now also runs from `RefreshCache` (controller startup + every 5m), not only
the daily DB dump — so a unit exists shortly after startup and stays current with config changes
(redeploy / optional-config) without a 24h wait. `CaptureRecoveryUnit` builds the captured content in
memory and **skips all writes when the unit is already current** (same config checksums + dump set +
controller version), so the periodic refresh does not thrash a spinning USB drive. Added an idempotency
test (unchanged → skip; config change → rewrite).
### v0.53.0 — Phase 2: per-app self-contained recovery unit (capture side, SECRET-FREE) (2026-06-13)
Each app's on-drive backup becomes a complete, recreatable **recovery unit** — not just DB dumps +
+4
View File
@@ -487,6 +487,10 @@ func (m *Manager) RefreshCache(nextDBDump time.Time) {
// Discover app data — all deployed stacks, backup is mandatory
if m.stackProvider != nil {
status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs)
// Phase 2: keep each app's recovery unit current with its definition. Idempotent
// (checksum-skip), so this periodic refresh only writes when the config actually changed,
// and ensures units exist shortly after startup without waiting for the daily DB dump.
m.captureAllRecoveryUnits()
}
// Fill in dynamic fields under lock.
+92 -43
View File
@@ -56,6 +56,10 @@ func (m *Manager) SetVersion(v string) {
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
//
// Idempotent: it builds the captured content in memory first and SKIPS all writes when the unit is
// already current (same config checksums, same dump set, same controller version) — so it can run on
// the periodic status refresh without thrashing a spinning USB drive.
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
if m.stackProvider == nil {
return fmt.Errorf("no stack provider")
@@ -69,41 +73,60 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
}
nsRoot := m.namespaceRoot(drivePath)
// Build the captured config CONTENT in memory (no writes yet) so we can checksum-compare.
type capFile struct {
name string
data []byte
perm os.FileMode
}
var files []capFile
checksums := make(map[string]string)
var configFiles []string
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
data, err := os.ReadFile(filepath.Join(info.StackDir, fname))
if err != nil {
continue // optional — capture whichever exist
}
files = append(files, capFile{fname, data, 0644})
checksums[fname] = sha256Hex(data)
configFiles = append(configFiles, fname)
}
appYaml := buildStrippedAppYaml(info)
files = append(files, capFile{"app.yaml", appYaml, 0600})
checksums["app.yaml"] = sha256Hex(appYaml)
configFiles = append(configFiles, "app.yaml")
dbDumps := listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql")
volDumps := listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar")
version := m.versionLocked()
manifestPath := RecoveryUnitManifestPath(nsRoot, stackName)
// Skip if the unit is already current — avoids needless drive writes on the periodic refresh.
if cur := readManifest(manifestPath); cur != nil &&
cur.ControllerVer == version &&
stringMapEqual(cur.Checksums, checksums) &&
stringSliceEqual(cur.DBDumps, dbDumps) &&
stringSliceEqual(cur.VolumeDumps, volDumps) {
return nil
}
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
if err := os.MkdirAll(composeDir, 0755); err != nil {
return fmt.Errorf("creating recovery-unit compose dir: %w", err)
}
checksums := make(map[string]string)
var configFiles []string
// Capture docker-compose.yml + .felhom.yml verbatim (whichever exist).
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
src := filepath.Join(info.StackDir, fname)
if _, err := os.Stat(src); err != nil {
continue
for _, f := range files {
if err := atomicWrite(filepath.Join(composeDir, f.name), f.data, f.perm); err != nil {
return fmt.Errorf("capturing %s: %w", f.name, err)
}
sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname))
if err != nil {
return fmt.Errorf("capturing %s: %w", fname, err)
}
checksums[fname] = sum
configFiles = append(configFiles, fname)
}
// Write the SECRET-STRIPPED app.yaml (non-secret env only).
sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info)
if err != nil {
return fmt.Errorf("writing stripped app.yaml: %w", err)
}
checksums["app.yaml"] = sum
configFiles = append(configFiles, "app.yaml")
manifest := &RecoveryManifest{
SchemaVersion: 1,
AppName: stackName,
DisplayName: info.DisplayName,
ControllerVer: m.versionLocked(),
ControllerVer: version,
CreatedAt: time.Now().UTC().Format(time.RFC3339),
Drive: drivePath,
NamespaceRoot: nsRoot,
@@ -112,11 +135,11 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
DataKeyEnvVars: info.DataKeyEnvVars,
SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
ConfigFiles: configFiles,
DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"),
VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"),
DBDumps: dbDumps,
VolumeDumps: volDumps,
Checksums: checksums,
}
if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil {
if err := writeManifest(manifestPath, manifest); err != nil {
return fmt.Errorf("writing manifest: %w", err)
}
@@ -154,11 +177,13 @@ type strippedAppYaml struct {
Env map[string]string `yaml:"env"`
}
// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256.
func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
// buildStrippedAppYaml renders a secret-free app.yaml (non-secret env only) as bytes. Deterministic:
// yaml.v3 sorts map keys and the secret-name list comes in stable metadata order, so identical input
// yields identical bytes (needed for the checksum-skip guard).
func buildStrippedAppYaml(info RecoveryInfo) []byte {
body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv})
if err != nil {
return "", err
body = []byte("deployed: true\nenv: {}\n")
}
header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" +
"# Secret/data-key values are intentionally omitted; recover them at restore from the\n" +
@@ -166,12 +191,7 @@ func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
if len(info.SecretEnvVars) > 0 {
header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n"
}
content := []byte(header + string(body))
if err := atomicWrite(dst, content, 0600); err != nil {
return "", err
}
sum := sha256.Sum256(content)
return hex.EncodeToString(sum[:]), nil
return []byte(header + string(body))
}
// writeManifest writes the manifest JSON atomically.
@@ -183,17 +203,46 @@ func writeManifest(dst string, manifest *RecoveryManifest) error {
return atomicWrite(dst, append(data, '\n'), 0644)
}
// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes.
func copyFileChecksum(src, dst string) (string, error) {
data, err := os.ReadFile(src)
// readManifest reads an existing recovery-unit manifest (nil if absent or unparseable).
func readManifest(path string) *RecoveryManifest {
data, err := os.ReadFile(path)
if err != nil {
return "", err
return nil
}
if err := atomicWrite(dst, data, 0644); err != nil {
return "", err
var m RecoveryManifest
if json.Unmarshal(data, &m) != nil {
return nil
}
return &m
}
func sha256Hex(data []byte) string {
sum := sha256.Sum256(data)
return hex.EncodeToString(sum[:]), nil
return hex.EncodeToString(sum[:])
}
// stringMapEqual reports whether a and b hold exactly the same key/value pairs.
// A nil map and an empty map compare equal (only length and entries are compared).
func stringMapEqual(a, b map[string]string) bool {
	if len(a) != len(b) {
		return false
	}
	for k, v := range a {
		// Comma-ok lookup: a key that is absent from b must not be treated as a match
		// for an empty-string value in a (a plain b[k] would yield "" in both cases).
		if other, ok := b[k]; !ok || other != v {
			return false
		}
	}
	return true
}
// stringSliceEqual reports whether a and b contain the same strings in the same order.
// Only length and element values are compared, so a nil slice equals an empty one.
func stringSliceEqual(a, b []string) bool {
	if len(b) != len(a) {
		return false
	}
	for idx, want := range b {
		if got := a[idx]; got != want {
			return false
		}
	}
	return true
}
// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent).
@@ -126,6 +126,52 @@ func TestCaptureRecoveryUnitIsSecretFree(t *testing.T) {
})
}
// TestCaptureRecoveryUnitIdempotent proves the checksum-skip guard: a second capture with unchanged
// config does NOT rewrite the manifest, but a config change DOES.
//
// The skip is detected with a sentinel CreatedAt planted in the on-disk manifest instead of comparing
// two real timestamps: CreatedAt is formatted with one-second (RFC3339) resolution, so two captures
// inside the same second would look identical even if the second one had rewritten the file.
func TestCaptureRecoveryUnitIdempotent(t *testing.T) {
	tmp := t.TempDir()
	stackDir := filepath.Join(tmp, "stack")
	drive := filepath.Join(tmp, "drive")
	mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:1\n")
	mustWrite(t, filepath.Join(AppDBDumpPath(drive, "ex"), "ex.sql"), "d")
	info := RecoveryInfo{StackDir: stackDir, DisplayName: "Ex", ImagePins: []string{"ex/app:1"},
		NonSecretEnv: map[string]string{"SUBDOMAIN": "ex"}}
	m := &Manager{logger: log.New(io.Discard, "", 0), systemDataPath: filepath.Join(tmp, "sys"),
		stackProvider: &fakeRecoveryProvider{info: info, hdd: drive}, version: "v1"}
	manifestPath := RecoveryUnitManifestPath(drive, "ex")

	if err := m.CaptureRecoveryUnit("ex"); err != nil {
		t.Fatal(err)
	}
	first := readManifest(manifestPath)
	if first == nil {
		t.Fatal("manifest not written")
	}

	// Plant a sentinel CreatedAt in the stored manifest. The fields the skip guard compares
	// (controller version, checksums, dump lists) are left untouched, so an unchanged capture
	// must leave the sentinel in place; any rewrite replaces it with the current time.
	const sentinel = "1970-01-01T00:00:00Z"
	pinned := readManifest(manifestPath)
	if pinned == nil {
		t.Fatal("manifest unreadable before pinning CreatedAt")
	}
	pinned.CreatedAt = sentinel
	if err := writeManifest(manifestPath, pinned); err != nil {
		t.Fatal(err)
	}

	// Second capture, unchanged → skipped (sentinel CreatedAt survives).
	if err := m.CaptureRecoveryUnit("ex"); err != nil {
		t.Fatal(err)
	}
	again := readManifest(manifestPath)
	if again == nil {
		t.Fatal("manifest missing or unparseable after idempotent capture")
	}
	if again.CreatedAt != sentinel {
		t.Errorf("idempotent capture rewrote manifest: %q -> %q", sentinel, again.CreatedAt)
	}

	// Change the compose → must rewrite (config checksum differs).
	mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:2\n")
	m.stackProvider.(*fakeRecoveryProvider).info.ImagePins = []string{"ex/app:2"}
	if err := m.CaptureRecoveryUnit("ex"); err != nil {
		t.Fatal(err)
	}
	changed := readManifest(manifestPath)
	if changed == nil {
		t.Fatal("manifest missing or unparseable after config change")
	}
	if changed.CreatedAt == sentinel {
		t.Errorf("config change did not rewrite manifest (CreatedAt still %q)", sentinel)
	}
	if len(changed.ImagePins) != 1 || changed.ImagePins[0] != "ex/app:2" {
		t.Errorf("config change not captured: %v", changed.ImagePins)
	}
	if changed.Checksums["docker-compose.yml"] == first.Checksums["docker-compose.yml"] {
		t.Errorf("compose checksum should change after edit")
	}
}
func mustWrite(t *testing.T, path, content string) {
t.Helper()
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {