v0.53.1: refresh recovery units on periodic cache cycle (idempotent)
CaptureRecoveryUnit now builds content in memory and skips writes when the unit is already current (checksum + dump-set + version), so it can run from RefreshCache (startup + every 5m) without thrashing the USB drive. Units now exist shortly after startup and track config changes without waiting for the daily DB dump. Also adds an idempotency test. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,14 @@
|
|||||||
## Changelog
|
## Changelog
|
||||||
|
|
||||||
|
### v0.53.1 — Phase 2: recovery units refresh on the periodic cache cycle (idempotent) (2026-06-13)
|
||||||
|
|
||||||
|
The recovery-unit capture now also runs from `RefreshCache` (controller startup + every 5m), not only
|
||||||
|
the daily DB dump — so a unit exists shortly after startup and stays current with config changes
|
||||||
|
(redeploy / optional-config) without a 24h wait. `CaptureRecoveryUnit` builds the captured content in
|
||||||
|
memory and **skips all writes when the unit is already current** (same config checksums + dump set +
|
||||||
|
controller version), so the periodic refresh does not thrash a spinning USB drive. Added an idempotency
|
||||||
|
test (unchanged → skip; config change → rewrite).
|
||||||
|
|
||||||
### v0.53.0 — Phase 2: per-app self-contained recovery unit (capture side, SECRET-FREE) (2026-06-13)
|
### v0.53.0 — Phase 2: per-app self-contained recovery unit (capture side, SECRET-FREE) (2026-06-13)
|
||||||
|
|
||||||
Each app's on-drive backup becomes a complete, recreatable **recovery unit** — not just DB dumps +
|
Each app's on-drive backup becomes a complete, recreatable **recovery unit** — not just DB dumps +
|
||||||
|
|||||||
@@ -487,6 +487,10 @@ func (m *Manager) RefreshCache(nextDBDump time.Time) {
|
|||||||
// Discover app data — all deployed stacks, backup is mandatory
|
// Discover app data — all deployed stacks, backup is mandatory
|
||||||
if m.stackProvider != nil {
|
if m.stackProvider != nil {
|
||||||
status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs)
|
status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs)
|
||||||
|
// Phase 2: keep each app's recovery unit current with its definition. Idempotent
|
||||||
|
// (checksum-skip), so this periodic refresh only writes when the config actually changed,
|
||||||
|
// and ensures units exist shortly after startup without waiting for the daily DB dump.
|
||||||
|
m.captureAllRecoveryUnits()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fill in dynamic fields under lock.
|
// Fill in dynamic fields under lock.
|
||||||
|
|||||||
@@ -56,6 +56,10 @@ func (m *Manager) SetVersion(v string) {
|
|||||||
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
|
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
|
||||||
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
|
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
|
||||||
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
|
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
|
||||||
|
//
|
||||||
|
// Idempotent: it builds the captured content in memory first and SKIPS all writes when the unit is
|
||||||
|
// already current (same config checksums, same dump set, same controller version) — so it can run on
|
||||||
|
// the periodic status refresh without thrashing a spinning USB drive.
|
||||||
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
||||||
if m.stackProvider == nil {
|
if m.stackProvider == nil {
|
||||||
return fmt.Errorf("no stack provider")
|
return fmt.Errorf("no stack provider")
|
||||||
@@ -69,41 +73,60 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
|||||||
return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
|
return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
|
||||||
}
|
}
|
||||||
nsRoot := m.namespaceRoot(drivePath)
|
nsRoot := m.namespaceRoot(drivePath)
|
||||||
|
|
||||||
|
// Build the captured config CONTENT in memory (no writes yet) so we can checksum-compare.
|
||||||
|
type capFile struct {
|
||||||
|
name string
|
||||||
|
data []byte
|
||||||
|
perm os.FileMode
|
||||||
|
}
|
||||||
|
var files []capFile
|
||||||
|
checksums := make(map[string]string)
|
||||||
|
var configFiles []string
|
||||||
|
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
|
||||||
|
data, err := os.ReadFile(filepath.Join(info.StackDir, fname))
|
||||||
|
if err != nil {
|
||||||
|
continue // optional — capture whichever exist
|
||||||
|
}
|
||||||
|
files = append(files, capFile{fname, data, 0644})
|
||||||
|
checksums[fname] = sha256Hex(data)
|
||||||
|
configFiles = append(configFiles, fname)
|
||||||
|
}
|
||||||
|
appYaml := buildStrippedAppYaml(info)
|
||||||
|
files = append(files, capFile{"app.yaml", appYaml, 0600})
|
||||||
|
checksums["app.yaml"] = sha256Hex(appYaml)
|
||||||
|
configFiles = append(configFiles, "app.yaml")
|
||||||
|
|
||||||
|
dbDumps := listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql")
|
||||||
|
volDumps := listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar")
|
||||||
|
version := m.versionLocked()
|
||||||
|
|
||||||
|
manifestPath := RecoveryUnitManifestPath(nsRoot, stackName)
|
||||||
|
|
||||||
|
// Skip if the unit is already current — avoids needless drive writes on the periodic refresh.
|
||||||
|
if cur := readManifest(manifestPath); cur != nil &&
|
||||||
|
cur.ControllerVer == version &&
|
||||||
|
stringMapEqual(cur.Checksums, checksums) &&
|
||||||
|
stringSliceEqual(cur.DBDumps, dbDumps) &&
|
||||||
|
stringSliceEqual(cur.VolumeDumps, volDumps) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
|
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
|
||||||
if err := os.MkdirAll(composeDir, 0755); err != nil {
|
if err := os.MkdirAll(composeDir, 0755); err != nil {
|
||||||
return fmt.Errorf("creating recovery-unit compose dir: %w", err)
|
return fmt.Errorf("creating recovery-unit compose dir: %w", err)
|
||||||
}
|
}
|
||||||
|
for _, f := range files {
|
||||||
checksums := make(map[string]string)
|
if err := atomicWrite(filepath.Join(composeDir, f.name), f.data, f.perm); err != nil {
|
||||||
var configFiles []string
|
return fmt.Errorf("capturing %s: %w", f.name, err)
|
||||||
|
|
||||||
// Capture docker-compose.yml + .felhom.yml verbatim (whichever exist).
|
|
||||||
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
|
|
||||||
src := filepath.Join(info.StackDir, fname)
|
|
||||||
if _, err := os.Stat(src); err != nil {
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("capturing %s: %w", fname, err)
|
|
||||||
}
|
}
|
||||||
checksums[fname] = sum
|
|
||||||
configFiles = append(configFiles, fname)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write the SECRET-STRIPPED app.yaml (non-secret env only).
|
|
||||||
sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("writing stripped app.yaml: %w", err)
|
|
||||||
}
|
|
||||||
checksums["app.yaml"] = sum
|
|
||||||
configFiles = append(configFiles, "app.yaml")
|
|
||||||
|
|
||||||
manifest := &RecoveryManifest{
|
manifest := &RecoveryManifest{
|
||||||
SchemaVersion: 1,
|
SchemaVersion: 1,
|
||||||
AppName: stackName,
|
AppName: stackName,
|
||||||
DisplayName: info.DisplayName,
|
DisplayName: info.DisplayName,
|
||||||
ControllerVer: m.versionLocked(),
|
ControllerVer: version,
|
||||||
CreatedAt: time.Now().UTC().Format(time.RFC3339),
|
CreatedAt: time.Now().UTC().Format(time.RFC3339),
|
||||||
Drive: drivePath,
|
Drive: drivePath,
|
||||||
NamespaceRoot: nsRoot,
|
NamespaceRoot: nsRoot,
|
||||||
@@ -112,11 +135,11 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
|||||||
DataKeyEnvVars: info.DataKeyEnvVars,
|
DataKeyEnvVars: info.DataKeyEnvVars,
|
||||||
SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
|
SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
|
||||||
ConfigFiles: configFiles,
|
ConfigFiles: configFiles,
|
||||||
DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"),
|
DBDumps: dbDumps,
|
||||||
VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"),
|
VolumeDumps: volDumps,
|
||||||
Checksums: checksums,
|
Checksums: checksums,
|
||||||
}
|
}
|
||||||
if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil {
|
if err := writeManifest(manifestPath, manifest); err != nil {
|
||||||
return fmt.Errorf("writing manifest: %w", err)
|
return fmt.Errorf("writing manifest: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -154,11 +177,13 @@ type strippedAppYaml struct {
|
|||||||
Env map[string]string `yaml:"env"`
|
Env map[string]string `yaml:"env"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256.
|
// buildStrippedAppYaml renders a secret-free app.yaml (non-secret env only) as bytes. Deterministic:
|
||||||
func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
|
// yaml.v3 sorts map keys and the secret-name list comes in stable metadata order, so identical input
|
||||||
|
// yields identical bytes (needed for the checksum-skip guard).
|
||||||
|
func buildStrippedAppYaml(info RecoveryInfo) []byte {
|
||||||
body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv})
|
body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
body = []byte("deployed: true\nenv: {}\n")
|
||||||
}
|
}
|
||||||
header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" +
|
header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" +
|
||||||
"# Secret/data-key values are intentionally omitted; recover them at restore from the\n" +
|
"# Secret/data-key values are intentionally omitted; recover them at restore from the\n" +
|
||||||
@@ -166,12 +191,7 @@ func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
|
|||||||
if len(info.SecretEnvVars) > 0 {
|
if len(info.SecretEnvVars) > 0 {
|
||||||
header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n"
|
header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n"
|
||||||
}
|
}
|
||||||
content := []byte(header + string(body))
|
return []byte(header + string(body))
|
||||||
if err := atomicWrite(dst, content, 0600); err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
sum := sha256.Sum256(content)
|
|
||||||
return hex.EncodeToString(sum[:]), nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// writeManifest writes the manifest JSON atomically.
|
// writeManifest writes the manifest JSON atomically.
|
||||||
@@ -183,17 +203,46 @@ func writeManifest(dst string, manifest *RecoveryManifest) error {
|
|||||||
return atomicWrite(dst, append(data, '\n'), 0644)
|
return atomicWrite(dst, append(data, '\n'), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes.
|
// readManifest reads an existing recovery-unit manifest (nil if absent or unparseable).
|
||||||
func copyFileChecksum(src, dst string) (string, error) {
|
func readManifest(path string) *RecoveryManifest {
|
||||||
data, err := os.ReadFile(src)
|
data, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return nil
|
||||||
}
|
}
|
||||||
if err := atomicWrite(dst, data, 0644); err != nil {
|
var m RecoveryManifest
|
||||||
return "", err
|
if json.Unmarshal(data, &m) != nil {
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
return &m
|
||||||
|
}
|
||||||
|
|
||||||
|
func sha256Hex(data []byte) string {
|
||||||
sum := sha256.Sum256(data)
|
sum := sha256.Sum256(data)
|
||||||
return hex.EncodeToString(sum[:]), nil
|
return hex.EncodeToString(sum[:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// stringMapEqual reports whether a and b hold exactly the same key/value pairs.
// A nil map and an empty map compare equal.
//
// Keys are matched with a comma-ok lookup: a plain b[k] read yields "" for a missing key,
// which would wrongly equate maps such as {"x": ""} and {"y": ""}.
func stringMapEqual(a, b map[string]string) bool {
	if len(a) != len(b) {
		return false
	}
	for k, av := range a {
		if bv, ok := b[k]; !ok || bv != av {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
|
func stringSliceEqual(a, b []string) bool {
|
||||||
|
if len(a) != len(b) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for i := range a {
|
||||||
|
if a[i] != b[i] {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent).
|
// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent).
|
||||||
|
|||||||
@@ -126,6 +126,52 @@ func TestCaptureRecoveryUnitIsSecretFree(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestCaptureRecoveryUnitIdempotent proves the checksum-skip guard: a second capture with unchanged
|
||||||
|
// config does NOT rewrite the manifest (CreatedAt stable), but a config change DOES.
|
||||||
|
func TestCaptureRecoveryUnitIdempotent(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
stackDir := filepath.Join(tmp, "stack")
|
||||||
|
drive := filepath.Join(tmp, "drive")
|
||||||
|
mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:1\n")
|
||||||
|
mustWrite(t, filepath.Join(AppDBDumpPath(drive, "ex"), "ex.sql"), "d")
|
||||||
|
|
||||||
|
info := RecoveryInfo{StackDir: stackDir, DisplayName: "Ex", ImagePins: []string{"ex/app:1"},
|
||||||
|
NonSecretEnv: map[string]string{"SUBDOMAIN": "ex"}}
|
||||||
|
m := &Manager{logger: log.New(io.Discard, "", 0), systemDataPath: filepath.Join(tmp, "sys"),
|
||||||
|
stackProvider: &fakeRecoveryProvider{info: info, hdd: drive}, version: "v1"}
|
||||||
|
|
||||||
|
manifestPath := RecoveryUnitManifestPath(drive, "ex")
|
||||||
|
if err := m.CaptureRecoveryUnit("ex"); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
first := readManifest(manifestPath)
|
||||||
|
if first == nil {
|
||||||
|
t.Fatal("manifest not written")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second capture, unchanged → skipped (manifest byte-identical incl. CreatedAt).
|
||||||
|
if err := m.CaptureRecoveryUnit("ex"); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if again := readManifest(manifestPath); again.CreatedAt != first.CreatedAt {
|
||||||
|
t.Errorf("idempotent capture rewrote manifest: %q -> %q", first.CreatedAt, again.CreatedAt)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Change the compose → must rewrite (config checksum differs).
|
||||||
|
mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:2\n")
|
||||||
|
m.stackProvider.(*fakeRecoveryProvider).info.ImagePins = []string{"ex/app:2"}
|
||||||
|
if err := m.CaptureRecoveryUnit("ex"); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
changed := readManifest(manifestPath)
|
||||||
|
if len(changed.ImagePins) != 1 || changed.ImagePins[0] != "ex/app:2" {
|
||||||
|
t.Errorf("config change not captured: %v", changed.ImagePins)
|
||||||
|
}
|
||||||
|
if changed.Checksums["docker-compose.yml"] == first.Checksums["docker-compose.yml"] {
|
||||||
|
t.Errorf("compose checksum should change after edit")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func mustWrite(t *testing.T, path, content string) {
|
func mustWrite(t *testing.T, path, content string) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user