v0.54.0: Phase 2b — restore-from-recovery-unit + fail-closed data-key gate
Restore recreates an app from its on-drive unit + the guest's own secrets, regenerating nothing. reconcileRestoreSecrets (pure, unit-tested) merges the unit's non-secret env with secrets recovered from the live app.yaml and FAILS CLOSED if a data-encrypting key is unrecoverable (refuse — a PBS whole-guest restore is needed — rather than regenerate and corrupt). Resettable secrets missing → warn + proceed.

- backup: RestoreFromRecoveryUnit (manifest -> recover secrets -> gate -> restore volumes -> recreate definition + redeploy w/ re-pull); falls back to volume-only.
- seams: RecoverStackSecrets/RecreateStackFromUnit (adapter +encKey), stacks.RedeployFromEnv. Wired into /backup/restore.
- tests: gate (refuse/proceed/verbatim) + data_key parsing.

Gate + reconcile + data_key parsing unit-tested; capture live-validated (v0.53.1). Full readable-data e2e vs AdventureLog needs the auth-gated dashboard restore — pending.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,10 @@ func (f *fakeRecoveryProvider) RefreshAndIsRunning(string) bool { retur
|
||||
func (f *fakeRecoveryProvider) GetStackRecoveryInfo(string) (RecoveryInfo, bool) {
|
||||
return f.info, true
|
||||
}
|
||||
// RecoverStackSecrets is a stub: it ignores both arguments and always returns a nil map, i.e. the
// fake never recovers any secret.
func (f *fakeRecoveryProvider) RecoverStackSecrets(_ string, _ []string) map[string]string {
	return nil
}
|
||||
func (f *fakeRecoveryProvider) RecreateStackFromUnit(string, string, map[string]string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// TestCaptureRecoveryUnitIsSecretFree proves the captured unit (a) contains compose+config+manifest,
|
||||
// (b) enumerates the existing dumps, and (c) is SECRET-FREE: a secret value present in the SOURCE
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// reconcileRestoreSecrets merges the recovery unit's non-secret env with the secrets recovered from
// the guest's own app.yaml, and applies the FAIL-CLOSED data-key gate. It is the safety-critical heart
// of Phase 2b and is deliberately a pure function (no I/O) so it can be exhaustively unit-tested.
//
// Policy (per the Phase 2 design — see REPORT/CHANGELOG):
//   - Regenerate NOTHING. Every secret comes from the guest (live rootfs, or PBS whole-guest restore).
//   - A missing DATA-ENCRYPTING key (`dataKeyNames`) is FATAL: regenerating it would render the
//     restored data unreadable, so we refuse and tell the operator to do a PBS whole-guest restore.
//   - A missing resettable secret (DB password, admin password) is NON-fatal: it's returned in
//     `missing` so the caller can warn; the app may simply need a credential reset, no data is lost.
//
// Parameters:
//   - nonSecretEnv: env entries copied into the result as-is; a recovered secret with the same
//     name overwrites the copied value.
//   - recoveredSecrets: name -> value of the secrets that could be recovered; may be nil. An empty
//     value counts as "not recovered".
//   - secretNames: every secret the app expects.
//   - dataKeyNames: the names whose absence must abort the restore.
//
// Returns:
//   - fullEnv: nonSecretEnv plus every recovered secret and every recovered data key, values
//     verbatim. nil when the gate refuses.
//   - missing: the entries of secretNames that were not recovered, in secretNames order. It is
//     returned on refusal too.
//   - err: non-nil only when at least one data key is unrecoverable.
func reconcileRestoreSecrets(nonSecretEnv, recoveredSecrets map[string]string, secretNames, dataKeyNames []string) (fullEnv map[string]string, missing []string, err error) {
	fullEnv = make(map[string]string, len(nonSecretEnv)+len(secretNames)+len(dataKeyNames))
	for k, v := range nonSecretEnv {
		fullEnv[k] = v
	}

	// Reading a nil map yields "", so absent and empty values are both treated as unrecovered.
	for _, n := range secretNames {
		v := recoveredSecrets[n]
		if v == "" {
			missing = append(missing, n)
			continue
		}
		fullEnv[n] = v
	}

	// Fail-closed: any unrecoverable data-encrypting key aborts the restore.
	var missingDataKeys []string
	for _, dk := range dataKeyNames {
		v := recoveredSecrets[dk]
		if v == "" {
			missingDataKeys = append(missingDataKeys, dk)
			continue
		}
		// A data key that passed the gate must also reach the redeployed env, even when it was
		// not listed among secretNames; otherwise the gate would pass while the key is silently
		// dropped. For keys already merged above this is a no-op rewrite of the same value.
		fullEnv[dk] = v
	}
	if len(missingDataKeys) > 0 {
		return nil, missing, fmt.Errorf(
			"refusing to restore: data-encrypting key(s) %v could not be recovered from the guest's app.yaml — "+
				"a PBS whole-guest restore is required first (regenerating the key would render stored data unreadable)",
			missingDataKeys)
	}
	return fullEnv, missing, nil
}
|
||||
|
||||
// readStrippedEnv parses the non-secret env from a recovery unit's secret-stripped app.yaml.
|
||||
func readStrippedEnv(path string) map[string]string {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return map[string]string{}
|
||||
}
|
||||
var s strippedAppYaml
|
||||
if yaml.Unmarshal(data, &s) != nil || s.Env == nil {
|
||||
return map[string]string{}
|
||||
}
|
||||
return s.Env
|
||||
}
|
||||
|
||||
// RestoreFromRecoveryUnit recreates an app from its on-drive recovery unit + the guest's own secrets.
|
||||
//
|
||||
// It reads the unit manifest, recovers the secret values from the guest's live app.yaml, applies the
|
||||
// fail-closed data-key gate, restores the named-volume data from the unit's tars, then restores the
|
||||
// app's definition from the unit and redeploys it with the reconstructed env (re-pulling the pinned
|
||||
// image). No secret is ever regenerated, and no secret is read from the unit. If no unit exists it
|
||||
// falls back to the legacy volume-only RestoreApp.
|
||||
func (m *Manager) RestoreFromRecoveryUnit(stackName string) error {
|
||||
if m.stackProvider == nil {
|
||||
return fmt.Errorf("stack provider not configured")
|
||||
}
|
||||
|
||||
m.mu.Lock()
|
||||
if m.running {
|
||||
m.mu.Unlock()
|
||||
return fmt.Errorf("backup or restore already in progress")
|
||||
}
|
||||
m.running = true
|
||||
m.mu.Unlock()
|
||||
defer func() {
|
||||
m.mu.Lock()
|
||||
m.running = false
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
|
||||
drivePath := m.GetAppDrivePath(stackName)
|
||||
if drivePath == "" || !filepath.IsAbs(drivePath) {
|
||||
return fmt.Errorf("cannot determine drive path for %s", stackName)
|
||||
}
|
||||
nsRoot := m.namespaceRoot(drivePath)
|
||||
|
||||
manifest := readManifest(RecoveryUnitManifestPath(nsRoot, stackName))
|
||||
if manifest == nil {
|
||||
m.logger.Printf("[WARN] [backup] No recovery unit for %s — falling back to volume-only restore", stackName)
|
||||
m.mu.Lock()
|
||||
m.running = false // RestoreApp re-acquires the running flag
|
||||
m.mu.Unlock()
|
||||
return m.RestoreApp(stackName, "")
|
||||
}
|
||||
|
||||
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
|
||||
nonSecretEnv := readStrippedEnv(filepath.Join(composeDir, "app.yaml"))
|
||||
|
||||
// Recover secrets from the GUEST (never the unit), then apply the fail-closed gate.
|
||||
recovered := m.stackProvider.RecoverStackSecrets(stackName, manifest.SecretEnvVars)
|
||||
fullEnv, missing, err := reconcileRestoreSecrets(nonSecretEnv, recovered, manifest.SecretEnvVars, manifest.DataKeyEnvVars)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] [backup] Restore REFUSED for %s: %v", stackName, err)
|
||||
return err
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
m.logger.Printf("[WARN] [backup] Restore %s: %d resettable secret(s) unrecoverable %v — proceeding (may need a credential reset; no data-key affected)",
|
||||
stackName, len(missing), missing)
|
||||
}
|
||||
m.logger.Printf("[INFO] [backup] Restoring %s from recovery unit: images=%d, secrets recovered=%d/%d, data_keys=%d",
|
||||
stackName, len(manifest.ImagePins), len(manifest.SecretEnvVars)-len(missing), len(manifest.SecretEnvVars), len(manifest.DataKeyEnvVars))
|
||||
|
||||
// Stop, restore named-volume data, then recreate the definition + redeploy with the recovered env.
|
||||
if err := m.stackProvider.StopStack(stackName); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] could not stop %s before restore: %v (continuing)", stackName, err)
|
||||
}
|
||||
if err := m.restoreDockerVolumes(stackName, drivePath); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] volume restore for %s: %v (continuing)", stackName, err)
|
||||
}
|
||||
if err := m.stackProvider.RecreateStackFromUnit(stackName, composeDir, fullEnv); err != nil {
|
||||
return fmt.Errorf("recreating %s from unit: %w", stackName, err)
|
||||
}
|
||||
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] %s restored but health check failed: %v", stackName, err)
|
||||
}
|
||||
|
||||
m.logger.Printf("[INFO] [backup] Restore-from-unit completed: %s", stackName)
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package backup
|
||||
|
||||
import "testing"
|
||||
|
||||
// TestReconcileRestoreSecrets covers the safety-critical fail-closed gate + secret reconciliation.
|
||||
func TestReconcileRestoreSecrets(t *testing.T) {
|
||||
nonSecret := map[string]string{"SUBDOMAIN": "trips", "DOMAIN": "demo-felhom.eu"}
|
||||
|
||||
t.Run("all recovered, no data_key — full env, no error", func(t *testing.T) {
|
||||
recovered := map[string]string{"DB_PASSWORD": "pw", "SECRET_KEY": "deadbeef"}
|
||||
full, missing, err := reconcileRestoreSecrets(nonSecret, recovered,
|
||||
[]string{"DB_PASSWORD", "SECRET_KEY"}, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(missing) != 0 {
|
||||
t.Errorf("missing: %v", missing)
|
||||
}
|
||||
// Non-secret + both secrets present, and recovered values used VERBATIM (regenerate nothing).
|
||||
if full["SUBDOMAIN"] != "trips" || full["DB_PASSWORD"] != "pw" || full["SECRET_KEY"] != "deadbeef" {
|
||||
t.Errorf("full env wrong: %v", full)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("data_key missing — FAIL CLOSED (refuse)", func(t *testing.T) {
|
||||
recovered := map[string]string{"DB_PASSWORD": "pw"} // SECRET_KEY (a data_key) is gone
|
||||
full, _, err := reconcileRestoreSecrets(nonSecret, recovered,
|
||||
[]string{"DB_PASSWORD", "SECRET_KEY"}, []string{"SECRET_KEY"})
|
||||
if err == nil {
|
||||
t.Fatal("expected fail-closed error for missing data-encrypting key, got nil")
|
||||
}
|
||||
if full != nil {
|
||||
t.Errorf("full env should be nil on refusal, got %v", full)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("data_key empty value — FAIL CLOSED", func(t *testing.T) {
|
||||
recovered := map[string]string{"SECRET_KEY": ""} // present but empty == unrecoverable
|
||||
_, _, err := reconcileRestoreSecrets(nonSecret, recovered, []string{"SECRET_KEY"}, []string{"SECRET_KEY"})
|
||||
if err == nil {
|
||||
t.Fatal("empty data-key value must fail closed")
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("resettable secret missing — proceed with warning", func(t *testing.T) {
|
||||
recovered := map[string]string{"SECRET_KEY": "deadbeef"} // data_key ok; DB_PASSWORD missing
|
||||
full, missing, err := reconcileRestoreSecrets(nonSecret, recovered,
|
||||
[]string{"DB_PASSWORD", "SECRET_KEY"}, []string{"SECRET_KEY"})
|
||||
if err != nil {
|
||||
t.Fatalf("a missing resettable secret must NOT fail closed: %v", err)
|
||||
}
|
||||
if len(missing) != 1 || missing[0] != "DB_PASSWORD" {
|
||||
t.Errorf("missing should be [DB_PASSWORD], got %v", missing)
|
||||
}
|
||||
if full["SECRET_KEY"] != "deadbeef" {
|
||||
t.Errorf("data-key should be preserved verbatim: %v", full)
|
||||
}
|
||||
if _, present := full["DB_PASSWORD"]; present {
|
||||
t.Errorf("missing resettable secret should be absent, not regenerated")
|
||||
}
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user