package backup

import (
	"fmt"
	"os"
	"path/filepath"
	"time"

	"gopkg.in/yaml.v3"
)

// reconcileRestoreSecrets builds the environment for a restored app by layering the secrets
// recovered from the guest's own app.yaml on top of the recovery unit's non-secret env, and
// enforces the FAIL-CLOSED data-key gate. It performs no I/O (only map/slice work), which keeps
// this safety-critical step of Phase 2b easy to unit-test exhaustively.
//
// A secret counts as recovered only when it is present in recoveredSecrets with a non-empty
// value. Recovered secrets overwrite same-named entries copied from nonSecretEnv.
//
// Policy (per the Phase 2 design — see REPORT/CHANGELOG):
//   - Nothing is regenerated here: every secret value placed in the result comes from
//     recoveredSecrets (the guest's live rootfs, or a PBS whole-guest restore).
//   - Any name in dataKeyNames that was not recovered is FATAL. Regenerating a data-encrypting
//     key would render the restored data unreadable, so the function returns a nil env and an
//     error telling the operator that a PBS whole-guest restore is required first.
//   - Any name in secretNames that was not recovered (e.g. a DB or admin password) is NON-fatal:
//     it is appended to missing, in secretNames order, so the caller can warn. The app may only
//     need a credential reset; no data is lost.
//
// missing is returned on both the success and the error path.
func reconcileRestoreSecrets(nonSecretEnv, recoveredSecrets map[string]string, secretNames, dataKeyNames []string) (fullEnv map[string]string, missing []string, err error) {
	// lookup yields the recovered value and whether it is usable; an absent key reads as "",
	// so a single emptiness check covers both "not present" and "present but empty".
	lookup := func(name string) (string, bool) {
		val := recoveredSecrets[name]
		return val, val != ""
	}

	merged := make(map[string]string, len(nonSecretEnv)+len(secretNames))
	for key, val := range nonSecretEnv {
		merged[key] = val
	}

	for _, name := range secretNames {
		val, ok := lookup(name)
		if !ok {
			missing = append(missing, name)
			continue
		}
		merged[name] = val
	}

	// Fail-closed: collect every data-encrypting key that could not be recovered.
	var lostDataKeys []string
	for _, name := range dataKeyNames {
		if _, ok := lookup(name); !ok {
			lostDataKeys = append(lostDataKeys, name)
		}
	}
	if len(lostDataKeys) == 0 {
		return merged, missing, nil
	}

	// At least one data key is gone: abort and hand back no env at all.
	return nil, missing, fmt.Errorf(
		"refusing to restore: data-encrypting key(s) %v could not be recovered from the guest's app.yaml — "+
			"a PBS whole-guest restore is required first (regenerating the key would render stored data unreadable)",
		lostDataKeys)
}

// readStrippedEnv loads the non-secret env from a recovery unit's secret-stripped app.yaml at
// path. It never fails: if the file cannot be read, cannot be parsed as YAML, or carries no env
// section, an empty (non-nil) map is returned instead.
func readStrippedEnv(path string) map[string]string {
	raw, err := os.ReadFile(path)
	if err != nil {
		return map[string]string{}
	}

	var parsed strippedAppYaml
	if err := yaml.Unmarshal(raw, &parsed); err != nil {
		return map[string]string{}
	}
	if parsed.Env == nil {
		return map[string]string{}
	}
	return parsed.Env
}

// RestoreFromRecoveryUnit recreates an app from its on-drive recovery unit + the guest's own secrets.
//
// It reads the unit manifest, recovers the secret values from the guest's live app.yaml, applies the
// fail-closed data-key gate, restores the named-volume data from the unit's tars, then restores the
// app's definition from the unit and redeploys it with the reconstructed env (re-pulling the pinned
// image). No secret is ever regenerated, and no secret is read from the unit. If no unit exists it
// falls back to the legacy volume-only RestoreApp.
func (m *Manager) RestoreFromRecoveryUnit(stackName string) error { if m.stackProvider == nil { return fmt.Errorf("stack provider not configured") } m.mu.Lock() if m.running { m.mu.Unlock() return fmt.Errorf("backup or restore already in progress") } m.running = true m.mu.Unlock() defer func() { m.mu.Lock() m.running = false m.mu.Unlock() }() drivePath := m.GetAppDrivePath(stackName) if drivePath == "" || !filepath.IsAbs(drivePath) { return fmt.Errorf("cannot determine drive path for %s", stackName) } nsRoot := m.namespaceRoot(drivePath) manifest := readManifest(RecoveryUnitManifestPath(nsRoot, stackName)) if manifest == nil { m.logger.Printf("[WARN] [backup] No recovery unit for %s — falling back to volume-only restore", stackName) m.mu.Lock() m.running = false // RestoreApp re-acquires the running flag m.mu.Unlock() return m.RestoreApp(stackName, "") } composeDir := RecoveryUnitComposePath(nsRoot, stackName) nonSecretEnv := readStrippedEnv(filepath.Join(composeDir, "app.yaml")) // Recover secrets from the GUEST (never the unit), then apply the fail-closed gate. recovered := m.stackProvider.RecoverStackSecrets(stackName, manifest.SecretEnvVars) fullEnv, missing, err := reconcileRestoreSecrets(nonSecretEnv, recovered, manifest.SecretEnvVars, manifest.DataKeyEnvVars) if err != nil { m.logger.Printf("[ERROR] [backup] Restore REFUSED for %s: %v", stackName, err) return err } if len(missing) > 0 { m.logger.Printf("[WARN] [backup] Restore %s: %d resettable secret(s) unrecoverable %v — proceeding (may need a credential reset; no data-key affected)", stackName, len(missing), missing) } m.logger.Printf("[INFO] [backup] Restoring %s from recovery unit: images=%d, secrets recovered=%d/%d, data_keys=%d", stackName, len(manifest.ImagePins), len(manifest.SecretEnvVars)-len(missing), len(manifest.SecretEnvVars), len(manifest.DataKeyEnvVars)) // Stop, restore named-volume data, then recreate the definition + redeploy with the recovered env. 
if err := m.stackProvider.StopStack(stackName); err != nil { m.logger.Printf("[WARN] [backup] could not stop %s before restore: %v (continuing)", stackName, err) } if err := m.restoreDockerVolumes(stackName, drivePath); err != nil { m.logger.Printf("[WARN] [backup] volume restore for %s: %v (continuing)", stackName, err) } if err := m.stackProvider.RecreateStackFromUnit(stackName, composeDir, fullEnv); err != nil { return fmt.Errorf("recreating %s from unit: %w", stackName, err) } if err := m.waitForHealthy(stackName, 90*time.Second); err != nil { m.logger.Printf("[WARN] [backup] %s restored but health check failed: %v", stackName, err) } m.logger.Printf("[INFO] [backup] Restore-from-unit completed: %s", stackName) return nil }