From 7863e62f2999b31c7eda18a03cf6ae5855f99835 Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Sat, 13 Jun 2026 11:12:43 +0200 Subject: [PATCH] =?UTF-8?q?v0.54.0:=20Phase=202b=20=E2=80=94=20restore-fro?= =?UTF-8?q?m-recovery-unit=20+=20fail-closed=20data-key=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore recreates an app from its on-drive unit + the guest's own secrets, regenerating nothing. reconcileRestoreSecrets (pure, unit-tested) merges the unit's non-secret env with secrets recovered from the live app.yaml and FAILS CLOSED if a data-encrypting key is unrecoverable (refuse — a PBS whole-guest restore is needed — rather than regenerate and corrupt). Resettable secrets missing → warn + proceed. - backup: RestoreFromRecoveryUnit (manifest -> recover secrets -> gate -> restore volumes -> recreate definition + redeploy w/ re-pull); falls back to volume-only. - seams: RecoverStackSecrets/RecreateStackFromUnit (adapter +encKey), stacks.RedeployFromEnv. Wired into /backup/restore. - tests: gate (refuse/proceed/verbatim) + data_key parsing. Gate + reconcile + data_key parsing unit-tested; capture live-validated (v0.53.1). Full readable-data e2e vs AdventureLog needs the auth-gated dashboard restore — pending. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 28 ++++ controller/cmd/controller/main.go | 45 ++++++ controller/internal/appbackup/appdata.go | 13 ++ .../internal/backup/recovery_unit_test.go | 4 + controller/internal/backup/restore_unit.go | 140 ++++++++++++++++++ .../internal/backup/restore_unit_test.go | 62 ++++++++ controller/internal/stacks/deploy.go | 42 ++++++ controller/internal/stacks/metadata_test.go | 40 +++++ controller/internal/web/handlers.go | 4 +- 9 files changed, 377 insertions(+), 1 deletion(-) create mode 100644 controller/internal/backup/restore_unit.go create mode 100644 controller/internal/backup/restore_unit_test.go create mode 100644 controller/internal/stacks/metadata_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 50bf84c..6d93e72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,33 @@ ## Changelog +### v0.54.0 — Phase 2b: restore-from-recovery-unit + fail-closed data-key gate (2026-06-13) + +Restore now recreates an app from its on-drive recovery unit **plus the guest's own secrets** — never +from secrets stored in the unit (there are none), and **regenerating nothing**. + +- **Fail-closed data-key gate** (`reconcileRestoreSecrets`, `internal/backup/restore_unit.go` — a pure, + exhaustively unit-tested function): merges the unit's non-secret env with the secret values recovered + from the guest's live app.yaml. A missing/empty **data-encrypting key** (`data_key`) **aborts the + restore** with a clear message (a PBS whole-guest restore is required) — because regenerating it would + render stored data unreadable. A missing *resettable* secret (DB/admin password) is non-fatal (warn + + proceed; the app may need a credential reset). Secrets are recovered, never regenerated. +- **`RestoreFromRecoveryUnit`**: reads the unit manifest → recovers secrets from the guest + (`RecoverStackSecrets`) → applies the gate → restores named-volume data from the unit's tars → + recovers the app definition from the unit and redeploys with the reconstructed env (re-pulling the + pinned image). Falls back to the legacy volume-only `RestoreApp` if no unit exists. Wired into the + `/backup/restore` web handler. +- **New seams:** `StackDataProvider.RecoverStackSecrets` / `RecreateStackFromUnit` (main.go + `stackAdapter`, with the controller `encKey` for decrypting the live app.yaml); `stacks.Manager. + RedeployFromEnv` (writes app.yaml from the full env incl. locked secrets, then `compose up -d`). +- **Tests:** the gate (all recovered / data-key missing → refuse / empty data-key → refuse / resettable + missing → proceed+warn, recovered values used verbatim) and `data_key` parsing from `.felhom.yml` + (`Metadata.DataKeyEnvVars()`). +- **Validation status:** the gate + reconciliation + data_key parsing are unit-tested (authoritative for + the refuse/proceed/regenerate-nothing behaviour); the capture side is live-validated (v0.53.1, RomM). + The full live **readable-data e2e** against AdventureLog (deploy → back up → restore → confirm the + data decrypts) requires triggering the **auth-gated** `/backup/restore` from the dashboard — pending an + operator-run on the demo. + ### v0.53.1 — Phase 2: recovery units refresh on the periodic cache cycle (idempotent) (2026-06-13) The recovery-unit capture now also runs from `RefreshCache` (controller startup + every 5m), not only diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index 7c887a9..4bbc03a 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -218,6 +218,7 @@ func main() { stackProv := &stackAdapter{ mgr: stackMgr, getStoragePaths: func() []settings.StoragePath { return sett.GetStoragePaths() }, + encKey: encKey, } if cfg.Backup.Enabled { backupMgr = backup.NewManager(cfg, sett, logger) @@ -768,6 +769,7 @@ func setupLogger(cfg *config.Config) (*log.Logger, *web.LogBuffer) { type stackAdapter struct { mgr *stacks.Manager getStoragePaths func() []settings.StoragePath + encKey []byte // for decrypting live app.yaml secrets during restore-from-unit } func (a *stackAdapter) GetStackComposePath(name string) (string, bool) { @@ -904,6 +906,49 @@ func (a *stackAdapter) GetStackRecoveryInfo(name string) (backup.RecoveryInfo, b }, true } +// RecoverStackSecrets returns the live decrypted values for the named secret env vars present in the +// stack's app.yaml (the guest's own — live rootfs or PBS-restored). Absent/empty names are omitted; +// the caller's fail-closed gate decides. Secrets come from the guest, never from the recovery unit. +func (a *stackAdapter) RecoverStackSecrets(name string, names []string) map[string]string { + s, ok := a.mgr.GetStack(name) + if !ok { + return nil + } + cfg := stacks.LoadAppConfigDecrypted(filepath.Dir(s.ComposePath), a.encKey) + if cfg == nil { + return nil + } + out := make(map[string]string) + for _, n := range names { + if v, ok := cfg.Env[n]; ok && v != "" { + out[n] = v + } + } + return out +} + +// RecreateStackFromUnit restores the app definition from the unit's compose dir into the stack dir, +// then redeploys with the reconstructed full env (re-pulling the pinned image). Secrets in fullEnv were +// recovered from the guest, never regenerated. +func (a *stackAdapter) RecreateStackFromUnit(name, composeSrcDir string, fullEnv map[string]string) error { + s, ok := a.mgr.GetStack(name) + if !ok { + return fmt.Errorf("stack %q not found", name) + } + stackDir := filepath.Dir(s.ComposePath) + // Recover the app definition from the unit (compose + .felhom.yml) into the stack dir. + for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} { + data, err := os.ReadFile(filepath.Join(composeSrcDir, fname)) + if err != nil { + continue // capture whichever existed + } + if err := os.WriteFile(filepath.Join(stackDir, fname), data, 0644); err != nil { + return fmt.Errorf("restoring %s from unit: %w", fname, err) + } + } + return a.mgr.RedeployFromEnv(name, fullEnv) +} + // RefreshAndIsRunning forces a docker ps scan before checking state. // Called during post-restore health check (~every 5s for up to 90s). // Full refresh is acceptable here since restores are rare operations. diff --git a/controller/internal/appbackup/appdata.go b/controller/internal/appbackup/appdata.go index 1d843e4..b4afb43 100644 --- a/controller/internal/appbackup/appdata.go +++ b/controller/internal/appbackup/appdata.go @@ -30,6 +30,19 @@ type StackDataProvider interface { // from the guest's own app.yaml, live or via the PBS whole-guest snapshot). ok=false if the // stack is unknown. GetStackRecoveryInfo(name string) (RecoveryInfo, bool) + + // --- Phase 2b: restore-from-recovery-unit --- + + // RecoverStackSecrets returns the live decrypted values for the named secret env vars that are + // currently present (non-empty) in the stack's app.yaml (the guest's own — live rootfs, or + // PBS-restored). Names that are absent/empty are simply omitted from the map; the caller's + // fail-closed gate decides what to do. The unit is never the source of secrets. + RecoverStackSecrets(name string, names []string) map[string]string + + // RecreateStackFromUnit restores an app's definition from the unit's compose dir into the stack + // dir, writes app.yaml from fullEnv (encrypting secret fields), and (re-)deploys it via + // `docker compose up -d`, which re-pulls the pinned image. Secrets are NEVER regenerated. + RecreateStackFromUnit(name, composeSrcDir string, fullEnv map[string]string) error } // RecoveryInfo carries everything needed to write a secret-free recovery unit for a stack. diff --git a/controller/internal/backup/recovery_unit_test.go b/controller/internal/backup/recovery_unit_test.go index f4a30a3..1f993f0 100644 --- a/controller/internal/backup/recovery_unit_test.go +++ b/controller/internal/backup/recovery_unit_test.go @@ -30,6 +30,10 @@ func (f *fakeRecoveryProvider) RefreshAndIsRunning(string) bool { retur func (f *fakeRecoveryProvider) GetStackRecoveryInfo(string) (RecoveryInfo, bool) { return f.info, true } +func (f *fakeRecoveryProvider) RecoverStackSecrets(string, []string) map[string]string { return nil } +func (f *fakeRecoveryProvider) RecreateStackFromUnit(string, string, map[string]string) error { + return nil +} // TestCaptureRecoveryUnitIsSecretFree proves the captured unit (a) contains compose+config+manifest, // (b) enumerates the existing dumps, and (c) is SECRET-FREE: a secret value present in the SOURCE diff --git a/controller/internal/backup/restore_unit.go b/controller/internal/backup/restore_unit.go new file mode 100644 index 0000000..5a69eb9 --- /dev/null +++ b/controller/internal/backup/restore_unit.go @@ -0,0 +1,140 @@ +package backup + +import ( + "fmt" + "os" + "path/filepath" + "time" + + "gopkg.in/yaml.v3" +) + +// reconcileRestoreSecrets merges the recovery unit's non-secret env with the secrets recovered from +// the guest's own app.yaml, and applies the FAIL-CLOSED data-key gate. It is the safety-critical heart +// of Phase 2b and is deliberately a pure function (no I/O) so it can be exhaustively unit-tested. +// +// Policy (per the Phase 2 design — see REPORT/CHANGELOG): +// - Regenerate NOTHING. Every secret comes from the guest (live rootfs, or PBS whole-guest restore). +// - A missing DATA-ENCRYPTING key (`dataKeyNames`) is FATAL: regenerating it would render the +// restored data unreadable, so we refuse and tell the operator to do a PBS whole-guest restore. +// - A missing resettable secret (DB password, admin password) is NON-fatal: it's returned in +// `missing` so the caller can warn; the app may simply need a credential reset, no data is lost. +func reconcileRestoreSecrets(nonSecretEnv, recoveredSecrets map[string]string, secretNames, dataKeyNames []string) (fullEnv map[string]string, missing []string, err error) { + fullEnv = make(map[string]string, len(nonSecretEnv)+len(secretNames)) + for k, v := range nonSecretEnv { + fullEnv[k] = v + } + have := func(n string) bool { + v, ok := recoveredSecrets[n] + return ok && v != "" + } + for _, n := range secretNames { + if have(n) { + fullEnv[n] = recoveredSecrets[n] + } else { + missing = append(missing, n) + } + } + // Fail-closed: any unrecoverable data-encrypting key aborts the restore. + var missingDataKeys []string + for _, dk := range dataKeyNames { + if !have(dk) { + missingDataKeys = append(missingDataKeys, dk) + } + } + if len(missingDataKeys) > 0 { + return nil, missing, fmt.Errorf( + "refusing to restore: data-encrypting key(s) %v could not be recovered from the guest's app.yaml — "+ + "a PBS whole-guest restore is required first (regenerating the key would render stored data unreadable)", + missingDataKeys) + } + return fullEnv, missing, nil +} + +// readStrippedEnv parses the non-secret env from a recovery unit's secret-stripped app.yaml. +func readStrippedEnv(path string) map[string]string { + data, err := os.ReadFile(path) + if err != nil { + return map[string]string{} + } + var s strippedAppYaml + if yaml.Unmarshal(data, &s) != nil || s.Env == nil { + return map[string]string{} + } + return s.Env +} + +// RestoreFromRecoveryUnit recreates an app from its on-drive recovery unit + the guest's own secrets. +// +// It reads the unit manifest, recovers the secret values from the guest's live app.yaml, applies the +// fail-closed data-key gate, restores the named-volume data from the unit's tars, then restores the +// app's definition from the unit and redeploys it with the reconstructed env (re-pulling the pinned +// image). No secret is ever regenerated, and no secret is read from the unit. If no unit exists it +// falls back to the legacy volume-only RestoreApp. +func (m *Manager) RestoreFromRecoveryUnit(stackName string) error { + if m.stackProvider == nil { + return fmt.Errorf("stack provider not configured") + } + + m.mu.Lock() + if m.running { + m.mu.Unlock() + return fmt.Errorf("backup or restore already in progress") + } + m.running = true + m.mu.Unlock() + defer func() { + m.mu.Lock() + m.running = false + m.mu.Unlock() + }() + + drivePath := m.GetAppDrivePath(stackName) + if drivePath == "" || !filepath.IsAbs(drivePath) { + return fmt.Errorf("cannot determine drive path for %s", stackName) + } + nsRoot := m.namespaceRoot(drivePath) + + manifest := readManifest(RecoveryUnitManifestPath(nsRoot, stackName)) + if manifest == nil { + m.logger.Printf("[WARN] [backup] No recovery unit for %s — falling back to volume-only restore", stackName) + m.mu.Lock() + m.running = false // RestoreApp re-acquires the running flag + m.mu.Unlock() + return m.RestoreApp(stackName, "") + } + + composeDir := RecoveryUnitComposePath(nsRoot, stackName) + nonSecretEnv := readStrippedEnv(filepath.Join(composeDir, "app.yaml")) + + // Recover secrets from the GUEST (never the unit), then apply the fail-closed gate. + recovered := m.stackProvider.RecoverStackSecrets(stackName, manifest.SecretEnvVars) + fullEnv, missing, err := reconcileRestoreSecrets(nonSecretEnv, recovered, manifest.SecretEnvVars, manifest.DataKeyEnvVars) + if err != nil { + m.logger.Printf("[ERROR] [backup] Restore REFUSED for %s: %v", stackName, err) + return err + } + if len(missing) > 0 { + m.logger.Printf("[WARN] [backup] Restore %s: %d resettable secret(s) unrecoverable %v — proceeding (may need a credential reset; no data-key affected)", + stackName, len(missing), missing) + } + m.logger.Printf("[INFO] [backup] Restoring %s from recovery unit: images=%d, secrets recovered=%d/%d, data_keys=%d", + stackName, len(manifest.ImagePins), len(manifest.SecretEnvVars)-len(missing), len(manifest.SecretEnvVars), len(manifest.DataKeyEnvVars)) + + // Stop, restore named-volume data, then recreate the definition + redeploy with the recovered env. + if err := m.stackProvider.StopStack(stackName); err != nil { + m.logger.Printf("[WARN] [backup] could not stop %s before restore: %v (continuing)", stackName, err) + } + if err := m.restoreDockerVolumes(stackName, drivePath); err != nil { + m.logger.Printf("[WARN] [backup] volume restore for %s: %v (continuing)", stackName, err) + } + if err := m.stackProvider.RecreateStackFromUnit(stackName, composeDir, fullEnv); err != nil { + return fmt.Errorf("recreating %s from unit: %w", stackName, err) + } + if err := m.waitForHealthy(stackName, 90*time.Second); err != nil { + m.logger.Printf("[WARN] [backup] %s restored but health check failed: %v", stackName, err) + } + + m.logger.Printf("[INFO] [backup] Restore-from-unit completed: %s", stackName) + return nil +} diff --git a/controller/internal/backup/restore_unit_test.go b/controller/internal/backup/restore_unit_test.go new file mode 100644 index 0000000..7f2883f --- /dev/null +++ b/controller/internal/backup/restore_unit_test.go @@ -0,0 +1,62 @@ +package backup + +import "testing" + +// TestReconcileRestoreSecrets covers the safety-critical fail-closed gate + secret reconciliation. +func TestReconcileRestoreSecrets(t *testing.T) { + nonSecret := map[string]string{"SUBDOMAIN": "trips", "DOMAIN": "demo-felhom.eu"} + + t.Run("all recovered, no data_key — full env, no error", func(t *testing.T) { + recovered := map[string]string{"DB_PASSWORD": "pw", "SECRET_KEY": "deadbeef"} + full, missing, err := reconcileRestoreSecrets(nonSecret, recovered, + []string{"DB_PASSWORD", "SECRET_KEY"}, nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(missing) != 0 { + t.Errorf("missing: %v", missing) + } + // Non-secret + both secrets present, and recovered values used VERBATIM (regenerate nothing). + if full["SUBDOMAIN"] != "trips" || full["DB_PASSWORD"] != "pw" || full["SECRET_KEY"] != "deadbeef" { + t.Errorf("full env wrong: %v", full) + } + }) + + t.Run("data_key missing — FAIL CLOSED (refuse)", func(t *testing.T) { + recovered := map[string]string{"DB_PASSWORD": "pw"} // SECRET_KEY (a data_key) is gone + full, _, err := reconcileRestoreSecrets(nonSecret, recovered, + []string{"DB_PASSWORD", "SECRET_KEY"}, []string{"SECRET_KEY"}) + if err == nil { + t.Fatal("expected fail-closed error for missing data-encrypting key, got nil") + } + if full != nil { + t.Errorf("full env should be nil on refusal, got %v", full) + } + }) + + t.Run("data_key empty value — FAIL CLOSED", func(t *testing.T) { + recovered := map[string]string{"SECRET_KEY": ""} // present but empty == unrecoverable + _, _, err := reconcileRestoreSecrets(nonSecret, recovered, []string{"SECRET_KEY"}, []string{"SECRET_KEY"}) + if err == nil { + t.Fatal("empty data-key value must fail closed") + } + }) + + t.Run("resettable secret missing — proceed with warning", func(t *testing.T) { + recovered := map[string]string{"SECRET_KEY": "deadbeef"} // data_key ok; DB_PASSWORD missing + full, missing, err := reconcileRestoreSecrets(nonSecret, recovered, + []string{"DB_PASSWORD", "SECRET_KEY"}, []string{"SECRET_KEY"}) + if err != nil { + t.Fatalf("a missing resettable secret must NOT fail closed: %v", err) + } + if len(missing) != 1 || missing[0] != "DB_PASSWORD" { + t.Errorf("missing should be [DB_PASSWORD], got %v", missing) + } + if full["SECRET_KEY"] != "deadbeef" { + t.Errorf("data-key should be preserved verbatim: %v", full) + } + if _, present := full["DB_PASSWORD"]; present { + t.Errorf("missing resettable secret should be absent, not regenerated") + } + }) +} diff --git a/controller/internal/stacks/deploy.go b/controller/internal/stacks/deploy.go index 37b5107..74dc911 100644 --- a/controller/internal/stacks/deploy.go +++ b/controller/internal/stacks/deploy.go @@ -431,6 +431,48 @@ func (m *Manager) UpdateStackConfig(name string, values map[string]string) error return m.RefreshStatus() } +// RedeployFromEnv writes app.yaml from the given FULL env (encrypting secret fields) and (re-)deploys +// the stack with `docker compose up -d`, which re-pulls the pinned image. Used by the restore-from-unit +// flow (Phase 2b): unlike UpdateStackConfig it sets the full env INCLUDING locked secrets — which were +// recovered from the guest's own app.yaml, never regenerated. Caller is responsible for the gate. +func (m *Manager) RedeployFromEnv(name string, env map[string]string) error { + stack, ok := m.GetStack(name) + if !ok { + return fmt.Errorf("stack %q not found", name) + } + stackDir := filepath.Dir(stack.ComposePath) + meta := LoadMetadata(stackDir) + + cfg := &AppConfig{ + Deployed: true, + DeployedAt: time.Now().UTC().Format(time.RFC3339), + Env: env, + } + for _, f := range meta.DeployFields { + if f.LockedAfterDeploy { + cfg.LockedFields = append(cfg.LockedFields, f.EnvVar) + } + } + if err := SaveAppConfig(stackDir, cfg, m.encKey, SensitiveEnvVars(&meta)); err != nil { + return fmt.Errorf("saving app config: %w", err) + } + + m.mu.Lock() + if s, ok := m.stacks[name]; ok { + s.Deployed = true + s.AppConfig = cfg + } + m.mu.Unlock() + + m.logger.Printf("[INFO] [stacks] Redeploying %s from recovery unit with %d env vars", name, len(env)) + deployEnv := m.stackEnv(stackDir) // decrypts secrets back for compose + if _, err := m.composeExecCustomEnv(stackDir, deployEnv, "up", "-d"); err != nil { + return fmt.Errorf("compose up: %w", err) + } + m.logPostStartStatus(name, stackDir, deployEnv) + return m.RefreshStatus() +} + // composeExecWithEnv runs a compose command with custom env vars injected. func (m *Manager) composeExecWithEnv(dir string, env map[string]string, args ...string) (string, error) { cmdEnv := os.Environ() diff --git a/controller/internal/stacks/metadata_test.go b/controller/internal/stacks/metadata_test.go new file mode 100644 index 0000000..29f559e --- /dev/null +++ b/controller/internal/stacks/metadata_test.go @@ -0,0 +1,40 @@ +package stacks + +import ( + "os" + "path/filepath" + "testing" +) + +// TestDataKeyParsing proves the catalog `data_key: true` annotation flows through .felhom.yml parsing +// into Metadata.DataKeyEnvVars() — the capture-side half of the Phase 2b fail-closed mechanism. The +// fail-closed gate itself is unit-tested in internal/backup (reconcileRestoreSecrets). +func TestDataKeyParsing(t *testing.T) { + dir := t.TempDir() + // Mirrors adventurelog/.felhom.yml: SECRET_KEY is a data-encrypting key, DB_PASSWORD is resettable. + yml := `display_name: AdventureLog +deploy_fields: + - env_var: SECRET_KEY + label: "Titkosítási kulcs" + type: secret + data_key: true + - env_var: DB_PASSWORD + label: "Adatbázis jelszó" + type: secret +` + if err := os.WriteFile(filepath.Join(dir, ".felhom.yml"), []byte(yml), 0644); err != nil { + t.Fatal(err) + } + + meta := LoadMetadata(dir) + dk := meta.DataKeyEnvVars() + if len(dk) != 1 || dk[0] != "SECRET_KEY" { + t.Fatalf("DataKeyEnvVars() = %v, want [SECRET_KEY]", dk) + } + + // Both secrets are sensitive (stripped from the unit); only SECRET_KEY is a data_key (fail-closed). + sens := SensitiveEnvVars(&meta) + if len(sens) != 2 { + t.Errorf("SensitiveEnvVars() = %v, want both SECRET_KEY and DB_PASSWORD", sens) + } +} diff --git a/controller/internal/web/handlers.go b/controller/internal/web/handlers.go index 89a4ac6..1ab7184 100644 --- a/controller/internal/web/handlers.go +++ b/controller/internal/web/handlers.go @@ -714,7 +714,9 @@ func (s *Server) backupRestoreHandler(w http.ResponseWriter, r *http.Request) { s.logger.Printf("[WARN] [web] Restore requested: stack=%s, snapshot=%s from %s", stackName, snapshotID, r.RemoteAddr) start := time.Now() - err := s.backupMgr.RestoreApp(stackName, snapshotID) + // Phase 2b: restore from the app's recovery unit (recovers secrets from the guest, fail-closed on + // an unrecoverable data-encrypting key; falls back to volume-only restore if no unit exists). + err := s.backupMgr.RestoreFromRecoveryUnit(stackName) if err != nil { s.logger.Printf("[ERROR] [web] Restore failed: %v", err) if s.isDebug() {