v0.53.1: refresh recovery units on periodic cache cycle (idempotent)

CaptureRecoveryUnit now builds content in memory and skips writes when the unit
is already current (checksum + dump-set + version), so it can run from RefreshCache
(startup + every 5m) without thrashing the USB drive. Units now exist shortly after
startup and track config changes without waiting for the daily DB dump. Also adds an idempotency test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-13 10:27:35 +02:00
parent 70eb521cd0
commit eefeeabea3
4 changed files with 151 additions and 43 deletions
+92 -43
View File
@@ -56,6 +56,10 @@ func (m *Manager) SetVersion(v string) {
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
//
// Idempotent: it builds the captured content in memory first and SKIPS all writes when the unit is
// already current (same config checksums, same dump set, same controller version) — so it can run on
// the periodic status refresh without thrashing a spinning USB drive.
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
if m.stackProvider == nil {
return fmt.Errorf("no stack provider")
@@ -69,41 +73,60 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
}
nsRoot := m.namespaceRoot(drivePath)
// Build the captured config CONTENT in memory (no writes yet) so we can checksum-compare.
type capFile struct {
name string
data []byte
perm os.FileMode
}
var files []capFile
checksums := make(map[string]string)
var configFiles []string
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
data, err := os.ReadFile(filepath.Join(info.StackDir, fname))
if err != nil {
continue // optional — capture whichever exist
}
files = append(files, capFile{fname, data, 0644})
checksums[fname] = sha256Hex(data)
configFiles = append(configFiles, fname)
}
appYaml := buildStrippedAppYaml(info)
files = append(files, capFile{"app.yaml", appYaml, 0600})
checksums["app.yaml"] = sha256Hex(appYaml)
configFiles = append(configFiles, "app.yaml")
dbDumps := listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql")
volDumps := listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar")
version := m.versionLocked()
manifestPath := RecoveryUnitManifestPath(nsRoot, stackName)
// Skip if the unit is already current — avoids needless drive writes on the periodic refresh.
if cur := readManifest(manifestPath); cur != nil &&
cur.ControllerVer == version &&
stringMapEqual(cur.Checksums, checksums) &&
stringSliceEqual(cur.DBDumps, dbDumps) &&
stringSliceEqual(cur.VolumeDumps, volDumps) {
return nil
}
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
if err := os.MkdirAll(composeDir, 0755); err != nil {
return fmt.Errorf("creating recovery-unit compose dir: %w", err)
}
checksums := make(map[string]string)
var configFiles []string
// Capture docker-compose.yml + .felhom.yml verbatim (whichever exist).
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
src := filepath.Join(info.StackDir, fname)
if _, err := os.Stat(src); err != nil {
continue
for _, f := range files {
if err := atomicWrite(filepath.Join(composeDir, f.name), f.data, f.perm); err != nil {
return fmt.Errorf("capturing %s: %w", f.name, err)
}
sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname))
if err != nil {
return fmt.Errorf("capturing %s: %w", fname, err)
}
checksums[fname] = sum
configFiles = append(configFiles, fname)
}
// Write the SECRET-STRIPPED app.yaml (non-secret env only).
sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info)
if err != nil {
return fmt.Errorf("writing stripped app.yaml: %w", err)
}
checksums["app.yaml"] = sum
configFiles = append(configFiles, "app.yaml")
manifest := &RecoveryManifest{
SchemaVersion: 1,
AppName: stackName,
DisplayName: info.DisplayName,
ControllerVer: m.versionLocked(),
ControllerVer: version,
CreatedAt: time.Now().UTC().Format(time.RFC3339),
Drive: drivePath,
NamespaceRoot: nsRoot,
@@ -112,11 +135,11 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
DataKeyEnvVars: info.DataKeyEnvVars,
SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
ConfigFiles: configFiles,
DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"),
VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"),
DBDumps: dbDumps,
VolumeDumps: volDumps,
Checksums: checksums,
}
if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil {
if err := writeManifest(manifestPath, manifest); err != nil {
return fmt.Errorf("writing manifest: %w", err)
}
@@ -154,11 +177,13 @@ type strippedAppYaml struct {
Env map[string]string `yaml:"env"`
}
// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256.
func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
// buildStrippedAppYaml renders a secret-free app.yaml (non-secret env only) as bytes. Deterministic:
// yaml.v3 sorts map keys and the secret-name list comes in stable metadata order, so identical input
// yields identical bytes (needed for the checksum-skip guard).
func buildStrippedAppYaml(info RecoveryInfo) []byte {
body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv})
if err != nil {
return "", err
body = []byte("deployed: true\nenv: {}\n")
}
header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" +
"# Secret/data-key values are intentionally omitted; recover them at restore from the\n" +
@@ -166,12 +191,7 @@ func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
if len(info.SecretEnvVars) > 0 {
header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n"
}
content := []byte(header + string(body))
if err := atomicWrite(dst, content, 0600); err != nil {
return "", err
}
sum := sha256.Sum256(content)
return hex.EncodeToString(sum[:]), nil
return []byte(header + string(body))
}
// writeManifest writes the manifest JSON atomically.
@@ -183,17 +203,46 @@ func writeManifest(dst string, manifest *RecoveryManifest) error {
return atomicWrite(dst, append(data, '\n'), 0644)
}
// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes.
func copyFileChecksum(src, dst string) (string, error) {
data, err := os.ReadFile(src)
// readManifest reads an existing recovery-unit manifest (nil if absent or unparseable).
func readManifest(path string) *RecoveryManifest {
data, err := os.ReadFile(path)
if err != nil {
return "", err
return nil
}
if err := atomicWrite(dst, data, 0644); err != nil {
return "", err
var m RecoveryManifest
if json.Unmarshal(data, &m) != nil {
return nil
}
return &m
}
func sha256Hex(data []byte) string {
sum := sha256.Sum256(data)
return hex.EncodeToString(sum[:]), nil
return hex.EncodeToString(sum[:])
}
// stringMapEqual reports whether a and b contain exactly the same key/value pairs.
// A nil map and an empty map are considered equal.
func stringMapEqual(a, b map[string]string) bool {
	if len(a) != len(b) {
		return false
	}
	for k, v := range a {
		// Use the comma-ok form: a key missing from b reads as "", which would
		// otherwise be indistinguishable from a key present with an empty value.
		bv, ok := b[k]
		if !ok || bv != v {
			return false
		}
	}
	return true
}
// stringSliceEqual reports whether a and b hold the same strings in the same order.
// Only length and element values are compared, so a nil slice equals an empty one.
func stringSliceEqual(a, b []string) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for idx := 0; idx < n; idx++ {
		if a[idx] == b[idx] {
			continue
		}
		return false
	}
	return true
}
// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent).