v0.53.1: refresh recovery units on periodic cache cycle (idempotent)
CaptureRecoveryUnit now builds content in memory and skips writes when the unit is already current (checksum + dump-set + version), so it can run from RefreshCache (startup + every 5m) without thrashing the USB drive. Units now exist shortly after startup and track config changes without waiting for the daily DB dump. +idempotency test. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -487,6 +487,10 @@ func (m *Manager) RefreshCache(nextDBDump time.Time) {
|
||||
// Discover app data — all deployed stacks, backup is mandatory
|
||||
if m.stackProvider != nil {
|
||||
status.AppDataInfo = DiscoverAppData(m.stackProvider, status.DiscoveredDBs)
|
||||
// Phase 2: keep each app's recovery unit current with its definition. Idempotent
|
||||
// (checksum-skip), so this periodic refresh only writes when the config actually changed,
|
||||
// and ensures units exist shortly after startup without waiting for the daily DB dump.
|
||||
m.captureAllRecoveryUnits()
|
||||
}
|
||||
|
||||
// Fill in dynamic fields under lock.
|
||||
|
||||
@@ -56,6 +56,10 @@ func (m *Manager) SetVersion(v string) {
|
||||
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
|
||||
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
|
||||
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
|
||||
//
|
||||
// Idempotent: it builds the captured content in memory first and SKIPS all writes when the unit is
|
||||
// already current (same config checksums, same dump set, same controller version) — so it can run on
|
||||
// the periodic status refresh without thrashing a spinning USB drive.
|
||||
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
||||
if m.stackProvider == nil {
|
||||
return fmt.Errorf("no stack provider")
|
||||
@@ -69,41 +73,60 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
||||
return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
|
||||
}
|
||||
nsRoot := m.namespaceRoot(drivePath)
|
||||
|
||||
// Build the captured config CONTENT in memory (no writes yet) so we can checksum-compare.
|
||||
type capFile struct {
|
||||
name string
|
||||
data []byte
|
||||
perm os.FileMode
|
||||
}
|
||||
var files []capFile
|
||||
checksums := make(map[string]string)
|
||||
var configFiles []string
|
||||
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
|
||||
data, err := os.ReadFile(filepath.Join(info.StackDir, fname))
|
||||
if err != nil {
|
||||
continue // optional — capture whichever exist
|
||||
}
|
||||
files = append(files, capFile{fname, data, 0644})
|
||||
checksums[fname] = sha256Hex(data)
|
||||
configFiles = append(configFiles, fname)
|
||||
}
|
||||
appYaml := buildStrippedAppYaml(info)
|
||||
files = append(files, capFile{"app.yaml", appYaml, 0600})
|
||||
checksums["app.yaml"] = sha256Hex(appYaml)
|
||||
configFiles = append(configFiles, "app.yaml")
|
||||
|
||||
dbDumps := listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql")
|
||||
volDumps := listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar")
|
||||
version := m.versionLocked()
|
||||
|
||||
manifestPath := RecoveryUnitManifestPath(nsRoot, stackName)
|
||||
|
||||
// Skip if the unit is already current — avoids needless drive writes on the periodic refresh.
|
||||
if cur := readManifest(manifestPath); cur != nil &&
|
||||
cur.ControllerVer == version &&
|
||||
stringMapEqual(cur.Checksums, checksums) &&
|
||||
stringSliceEqual(cur.DBDumps, dbDumps) &&
|
||||
stringSliceEqual(cur.VolumeDumps, volDumps) {
|
||||
return nil
|
||||
}
|
||||
|
||||
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
|
||||
if err := os.MkdirAll(composeDir, 0755); err != nil {
|
||||
return fmt.Errorf("creating recovery-unit compose dir: %w", err)
|
||||
}
|
||||
|
||||
checksums := make(map[string]string)
|
||||
var configFiles []string
|
||||
|
||||
// Capture docker-compose.yml + .felhom.yml verbatim (whichever exist).
|
||||
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
|
||||
src := filepath.Join(info.StackDir, fname)
|
||||
if _, err := os.Stat(src); err != nil {
|
||||
continue
|
||||
for _, f := range files {
|
||||
if err := atomicWrite(filepath.Join(composeDir, f.name), f.data, f.perm); err != nil {
|
||||
return fmt.Errorf("capturing %s: %w", f.name, err)
|
||||
}
|
||||
sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname))
|
||||
if err != nil {
|
||||
return fmt.Errorf("capturing %s: %w", fname, err)
|
||||
}
|
||||
checksums[fname] = sum
|
||||
configFiles = append(configFiles, fname)
|
||||
}
|
||||
|
||||
// Write the SECRET-STRIPPED app.yaml (non-secret env only).
|
||||
sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info)
|
||||
if err != nil {
|
||||
return fmt.Errorf("writing stripped app.yaml: %w", err)
|
||||
}
|
||||
checksums["app.yaml"] = sum
|
||||
configFiles = append(configFiles, "app.yaml")
|
||||
|
||||
manifest := &RecoveryManifest{
|
||||
SchemaVersion: 1,
|
||||
AppName: stackName,
|
||||
DisplayName: info.DisplayName,
|
||||
ControllerVer: m.versionLocked(),
|
||||
ControllerVer: version,
|
||||
CreatedAt: time.Now().UTC().Format(time.RFC3339),
|
||||
Drive: drivePath,
|
||||
NamespaceRoot: nsRoot,
|
||||
@@ -112,11 +135,11 @@ func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
||||
DataKeyEnvVars: info.DataKeyEnvVars,
|
||||
SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
|
||||
ConfigFiles: configFiles,
|
||||
DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"),
|
||||
VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"),
|
||||
DBDumps: dbDumps,
|
||||
VolumeDumps: volDumps,
|
||||
Checksums: checksums,
|
||||
}
|
||||
if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil {
|
||||
if err := writeManifest(manifestPath, manifest); err != nil {
|
||||
return fmt.Errorf("writing manifest: %w", err)
|
||||
}
|
||||
|
||||
@@ -154,11 +177,13 @@ type strippedAppYaml struct {
|
||||
Env map[string]string `yaml:"env"`
|
||||
}
|
||||
|
||||
// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256.
|
||||
func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
|
||||
// buildStrippedAppYaml renders a secret-free app.yaml (non-secret env only) as bytes. Deterministic:
|
||||
// yaml.v3 sorts map keys and the secret-name list comes in stable metadata order, so identical input
|
||||
// yields identical bytes (needed for the checksum-skip guard).
|
||||
func buildStrippedAppYaml(info RecoveryInfo) []byte {
|
||||
body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv})
|
||||
if err != nil {
|
||||
return "", err
|
||||
body = []byte("deployed: true\nenv: {}\n")
|
||||
}
|
||||
header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" +
|
||||
"# Secret/data-key values are intentionally omitted; recover them at restore from the\n" +
|
||||
@@ -166,12 +191,7 @@ func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
|
||||
if len(info.SecretEnvVars) > 0 {
|
||||
header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n"
|
||||
}
|
||||
content := []byte(header + string(body))
|
||||
if err := atomicWrite(dst, content, 0600); err != nil {
|
||||
return "", err
|
||||
}
|
||||
sum := sha256.Sum256(content)
|
||||
return hex.EncodeToString(sum[:]), nil
|
||||
return []byte(header + string(body))
|
||||
}
|
||||
|
||||
// writeManifest writes the manifest JSON atomically.
|
||||
@@ -183,17 +203,46 @@ func writeManifest(dst string, manifest *RecoveryManifest) error {
|
||||
return atomicWrite(dst, append(data, '\n'), 0644)
|
||||
}
|
||||
|
||||
// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes.
|
||||
func copyFileChecksum(src, dst string) (string, error) {
|
||||
data, err := os.ReadFile(src)
|
||||
// readManifest reads an existing recovery-unit manifest (nil if absent or unparseable).
|
||||
func readManifest(path string) *RecoveryManifest {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
return nil
|
||||
}
|
||||
if err := atomicWrite(dst, data, 0644); err != nil {
|
||||
return "", err
|
||||
var m RecoveryManifest
|
||||
if json.Unmarshal(data, &m) != nil {
|
||||
return nil
|
||||
}
|
||||
return &m
|
||||
}
|
||||
|
||||
func sha256Hex(data []byte) string {
|
||||
sum := sha256.Sum256(data)
|
||||
return hex.EncodeToString(sum[:]), nil
|
||||
return hex.EncodeToString(sum[:])
|
||||
}
|
||||
|
||||
// stringMapEqual reports whether a and b hold exactly the same key/value pairs.
// A nil map and an empty map compare equal.
func stringMapEqual(a, b map[string]string) bool {
	if len(a) != len(b) {
		return false
	}
	for k, v := range a {
		// Comma-ok lookup: a key missing from b must not be mistaken for a key that is
		// present with the empty string (b[k] alone yields "" in both cases).
		if bv, ok := b[k]; !ok || bv != v {
			return false
		}
	}
	return true
}
|
||||
|
||||
// stringSliceEqual reports whether a and b contain the same strings in the same order.
// Only length and element values are compared, so a nil slice equals an empty one.
func stringSliceEqual(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	// Lengths match, so every index of a is also valid for b.
	for idx, want := range a {
		if b[idx] != want {
			return false
		}
	}
	return true
}
|
||||
|
||||
// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent).
|
||||
|
||||
@@ -126,6 +126,52 @@ func TestCaptureRecoveryUnitIsSecretFree(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
// TestCaptureRecoveryUnitIdempotent proves the checksum-skip guard: a second capture with unchanged
|
||||
// config does NOT rewrite the manifest (CreatedAt stable), but a config change DOES.
|
||||
func TestCaptureRecoveryUnitIdempotent(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
stackDir := filepath.Join(tmp, "stack")
|
||||
drive := filepath.Join(tmp, "drive")
|
||||
mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:1\n")
|
||||
mustWrite(t, filepath.Join(AppDBDumpPath(drive, "ex"), "ex.sql"), "d")
|
||||
|
||||
info := RecoveryInfo{StackDir: stackDir, DisplayName: "Ex", ImagePins: []string{"ex/app:1"},
|
||||
NonSecretEnv: map[string]string{"SUBDOMAIN": "ex"}}
|
||||
m := &Manager{logger: log.New(io.Discard, "", 0), systemDataPath: filepath.Join(tmp, "sys"),
|
||||
stackProvider: &fakeRecoveryProvider{info: info, hdd: drive}, version: "v1"}
|
||||
|
||||
manifestPath := RecoveryUnitManifestPath(drive, "ex")
|
||||
if err := m.CaptureRecoveryUnit("ex"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
first := readManifest(manifestPath)
|
||||
if first == nil {
|
||||
t.Fatal("manifest not written")
|
||||
}
|
||||
|
||||
// Second capture, unchanged → skipped (manifest byte-identical incl. CreatedAt).
|
||||
if err := m.CaptureRecoveryUnit("ex"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if again := readManifest(manifestPath); again.CreatedAt != first.CreatedAt {
|
||||
t.Errorf("idempotent capture rewrote manifest: %q -> %q", first.CreatedAt, again.CreatedAt)
|
||||
}
|
||||
|
||||
// Change the compose → must rewrite (config checksum differs).
|
||||
mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), "services:\n app:\n image: ex/app:2\n")
|
||||
m.stackProvider.(*fakeRecoveryProvider).info.ImagePins = []string{"ex/app:2"}
|
||||
if err := m.CaptureRecoveryUnit("ex"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
changed := readManifest(manifestPath)
|
||||
if len(changed.ImagePins) != 1 || changed.ImagePins[0] != "ex/app:2" {
|
||||
t.Errorf("config change not captured: %v", changed.ImagePins)
|
||||
}
|
||||
if changed.Checksums["docker-compose.yml"] == first.Checksums["docker-compose.yml"] {
|
||||
t.Errorf("compose checksum should change after edit")
|
||||
}
|
||||
}
|
||||
|
||||
func mustWrite(t *testing.T, path, content string) {
|
||||
t.Helper()
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
|
||||
Reference in New Issue
Block a user