diff --git a/CHANGELOG.md b/CHANGELOG.md index aac2e0c..5c63101 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,40 @@ ## Changelog +### v0.53.0 — Phase 2: per-app self-contained recovery unit (capture side, SECRET-FREE) (2026-06-13) + +Each app's on-drive backup becomes a complete, recreatable **recovery unit** — not just DB dumps + +volume tars, but the app's *definition* too, so it can be recreated. The unit is **secret-free by +design** (decided after reading the actual hub code: the hub is deliberately zero-knowledge and holds +no app secrets; app.yaml + the encryption key live on the guest rootfs → already inside the PBS +whole-guest snapshot). Secrets/data-keys are recovered at restore from the guest's own app.yaml (live, +or via PBS) — **never stored in the unit, never regenerated**. + +- **Unit layout** (rooted at the existing `backups/primary/<app>/` — no risky dump-dir migration): + `compose/` (docker-compose.yml + .felhom.yml + a **secret-stripped** app.yaml) + the existing + `db-dumps/` + `volume-dumps/` + `manifest.json`. New path helpers `RecoveryUnitPath` / + `RecoveryUnitComposePath` / `RecoveryUnitManifestPath` in `internal/appbackup/paths.go` + (`AppDBDumpPath`/`AppVolumeDumpPath` refactored onto `RecoveryUnitPath` — identical resolved paths). +- **Secret-free manifest** (`internal/backup/recovery_unit.go`): app id, display name, controller + version, timestamp, drive, namespace root, pinned **image tags** (image NOT stored — re-pulled on + restore), the **NAMES** of secret env vars (values never stored), the `data_key` env-var names, the + explicit `secret_source` note ("guest app.yaml (live) or PBS — never stored in this unit"), captured + config-file list, enumerated dumps, and sha256 checksums of the captured config. +- **Capture has no secret access:** non-secret env is plaintext in app.yaml; the capture simply excludes + the secret-named keys (plus a defensive `crypto.IsEncrypted` guard), so it reads no secret value. 
New + `StackDataProvider.GetStackRecoveryInfo` + `RecoveryInfo` (in `appbackup`), implemented by the main.go + `stackAdapter`; `ParseComposeImages` extracts the image pins. +- **`data_key` annotation** (`DeployField.DataKey`, `Metadata.DataKeyEnvVars()`): marks a + data-encrypting key (e.g. AdventureLog's "Titkosítási kulcs", `SECRET_KEY`) — a **fail-closed** safety + annotation for restore (refuse + warn rather than regenerate-and-corrupt), NOT a per-secret + preserve/regenerate decision. Catalog: `adventurelog/.felhom.yml` `SECRET_KEY` marked `data_key: true`. +- **Wired into the dump flow:** `RunDBDumps` refreshes every deployed app's recovery unit after the DB + dumps (best-effort per app; skips disconnected/decommissioned drives). Capture test + (`recovery_unit_test.go`) proves the unit is secret-free (a secret in the source app.yaml never + appears in the unit) and the manifest structure. +- **NOT in this increment (next):** the restore-from-unit *recreate* (re-pull + compose-up + secret + recovery from guest/PBS) and its fail-closed `data_key` gate, with live AdventureLog readable-data + validation. The README backup-paths section (stale restic/secondary) is rewritten when Tier 2 lands. + ### v0.52.0 — Phase 1 GATE: deploy-side double-nest fix + path-agreement lock (2026-06-13) Completes the Model-A double-nest reconciliation deferred in v0.48.0. 
v0.51.0 fixed the **backup diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index 834e10b..7c887a9 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -222,6 +222,7 @@ func main() { if cfg.Backup.Enabled { backupMgr = backup.NewManager(cfg, sett, logger) backupMgr.SetStackProvider(stackProv) + backupMgr.SetVersion(Version) } // --- Initialize alert manager --- @@ -851,6 +852,58 @@ func (a *stackAdapter) GetStackHDDPath(name string) string { return "" } +// GetStackRecoveryInfo gathers the SECRET-FREE inputs for an app's recovery unit (Phase 2): the +// stack dir, pinned image tags, the non-secret env, and the NAMES of secret/data-key env vars. +// It deliberately does NOT decrypt or return any secret value — secret/password fields are stored +// encrypted in app.yaml, so excluding them (plus a defensive crypto.IsEncrypted guard) yields a +// plaintext, secret-free env. The actual secret values are recovered at restore time from the +// guest's own app.yaml (live, or via the PBS whole-guest snapshot), never from the unit. +func (a *stackAdapter) GetStackRecoveryInfo(name string) (backup.RecoveryInfo, bool) { + s, ok := a.mgr.GetStack(name) + if !ok { + return backup.RecoveryInfo{}, false + } + stackDir := filepath.Dir(s.ComposePath) + meta := stacks.LoadMetadata(stackDir) + + // Secret set = all secret/password fields ∪ any data_key fields (in deterministic metadata order). + secretSet := make(map[string]bool) + var secretNames []string + add := func(v string) { + if !secretSet[v] { + secretSet[v] = true + secretNames = append(secretNames, v) + } + } + for _, v := range stacks.SensitiveEnvVars(&meta) { + add(v) + } + dataKeys := meta.DataKeyEnvVars() + for _, v := range dataKeys { + add(v) + } + + // Non-secret env: raw app.yaml values that are neither named-secret nor (defensively) encrypted. 
+ nonSecret := make(map[string]string) + if appCfg := stacks.LoadAppConfig(stackDir); appCfg != nil { + for k, v := range appCfg.Env { + if secretSet[k] || crypto.IsEncrypted(v) { + continue + } + nonSecret[k] = v + } + } + + return backup.RecoveryInfo{ + StackDir: stackDir, + DisplayName: s.Meta.DisplayName, + ImagePins: backup.ParseComposeImages(s.ComposePath), + NonSecretEnv: nonSecret, + SecretEnvVars: secretNames, + DataKeyEnvVars: dataKeys, + }, true +} + // RefreshAndIsRunning forces a docker ps scan before checking state. // Called during post-restore health check (~every 5s for up to 90s). // Full refresh is acceptable here since restores are rare operations. diff --git a/controller/internal/appbackup/appdata.go b/controller/internal/appbackup/appdata.go index f446fd4..1d843e4 100644 --- a/controller/internal/appbackup/appdata.go +++ b/controller/internal/appbackup/appdata.go @@ -1,6 +1,7 @@ package appbackup import ( + "bufio" "context" "fmt" "log" @@ -23,6 +24,55 @@ type StackDataProvider interface { StopStack(name string) error StartStack(name string) error RefreshAndIsRunning(name string) bool + // GetStackRecoveryInfo returns the data needed to capture a SECRET-FREE recovery unit + // (Phase 2): the stack dir, pinned image tags, the non-secret env, and the NAMES of the + // secret/data-key env vars (values are NEVER returned — they are recovered at restore time + // from the guest's own app.yaml, live or via the PBS whole-guest snapshot). ok=false if the + // stack is unknown. + GetStackRecoveryInfo(name string) (RecoveryInfo, bool) +} + +// RecoveryInfo carries everything needed to write a secret-free recovery unit for a stack. +// It deliberately holds NO secret values — only the names of secret/data-key env vars, so the +// manifest can record what must be recovered from elsewhere (guest app.yaml / PBS) without the +// unit ever storing a secret or a data-encrypting key. 
+type RecoveryInfo struct { + StackDir string // dir holding docker-compose.yml + .felhom.yml + app.yaml + DisplayName string // app display name + ImagePins []string // pinned image tags from compose `image:` lines (re-pulled on restore) + NonSecretEnv map[string]string // env with all secret/password/data-key values removed (plaintext only) + SecretEnvVars []string // NAMES of stripped secret/password fields (recovered from guest/PBS) + DataKeyEnvVars []string // NAMES of data-encrypting-key fields (fail-closed gate on restore) +} + +// ParseComposeImages extracts the pinned image references (`image: repo:tag`) from a +// docker-compose.yml, in file order, de-duplicated. The image bytes are never stored in the +// recovery unit — only these pins, so restore re-pulls from the registry. +func ParseComposeImages(composePath string) []string { + data, err := os.ReadFile(composePath) + if err != nil { + return nil + } + var images []string + seen := make(map[string]bool) + scanner := bufio.NewScanner(strings.NewReader(string(data))) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if !strings.HasPrefix(line, "image:") { + continue + } + img := strings.TrimSpace(strings.TrimPrefix(line, "image:")) + img = strings.Trim(img, "\"'") + // Skip variable-only images we can't pin (e.g. image: ${SOME_IMAGE}) + if img == "" || strings.HasPrefix(img, "${") { + continue + } + if !seen[img] { + seen[img] = true + images = append(images, img) + } + } + return images } // StackSummary holds minimal stack info needed for app data discovery. diff --git a/controller/internal/appbackup/paths.go b/controller/internal/appbackup/paths.go index cda8a34..aa0ecfe 100644 --- a/controller/internal/appbackup/paths.go +++ b/controller/internal/appbackup/paths.go @@ -33,14 +33,35 @@ func PrimaryBackupPath(nsRoot string) string { return filepath.Join(nsRoot, "backups", "primary") } +// RecoveryUnitPath returns the per-app self-contained recovery-unit ROOT under a namespace root. 
// It is the existing per-app backup dir (`backups/primary/<stackName>/`) — the legacy name is kept so the
// db-dumps/ and volume-dumps/ already written there need no migration; the unit gains compose/ and
// manifest.json as siblings, making the whole dir a complete, recreatable unit (Phase 2). The unit is
// secret-free: secrets/data-keys are recovered from the guest's own app.yaml (live or via PBS), never
// stored here. See backup.recoveryUnit / restore for the capture + restore flow.
func RecoveryUnitPath(nsRoot, stackName string) string {
	segments := []string{nsRoot, "backups", "primary", stackName}
	return filepath.Join(segments...)
}

// RecoveryUnitComposePath returns the directory inside an app's recovery unit that holds the
// captured configuration (docker-compose.yml, .felhom.yml and the secret-stripped app.yaml).
func RecoveryUnitComposePath(nsRoot, stackName string) string {
	unitRoot := RecoveryUnitPath(nsRoot, stackName)
	return filepath.Join(unitRoot, "compose")
}

// RecoveryUnitManifestPath returns the location of manifest.json inside an app's recovery unit.
func RecoveryUnitManifestPath(nsRoot, stackName string) string {
	unitRoot := RecoveryUnitPath(nsRoot, stackName)
	return filepath.Join(unitRoot, "manifest.json")
}

// AppDBDumpPath returns the DB dump directory for an app under a felhom-data namespace root.
// It is the db-dumps/ child of the app's recovery unit.
func AppDBDumpPath(nsRoot, stackName string) string {
	unitRoot := RecoveryUnitPath(nsRoot, stackName)
	return filepath.Join(unitRoot, "db-dumps")
}

// AppVolumeDumpPath returns the Docker-volume dump-tar directory for an app under a namespace root.
// It is the volume-dumps/ child of the app's recovery unit.
func AppVolumeDumpPath(nsRoot, stackName string) string {
	unitRoot := RecoveryUnitPath(nsRoot, stackName)
	return filepath.Join(unitRoot, "volume-dumps")
}

// AppDataDir returns the app data directory under a felhom-data namespace root.
diff --git a/controller/internal/backup/appbackup_bridge.go b/controller/internal/backup/appbackup_bridge.go index 48281ba..c5e3243 100644 --- a/controller/internal/backup/appbackup_bridge.go +++ b/controller/internal/backup/appbackup_bridge.go @@ -25,6 +25,7 @@ type StackSummary = appbackup.StackSummary type AppBackupInfo = appbackup.AppBackupInfo type AppDataPath = appbackup.AppDataPath type AppDockerVolume = appbackup.AppDockerVolume +type RecoveryInfo = appbackup.RecoveryInfo // --- type aliases (dbdump) --- @@ -80,6 +81,10 @@ func ResolveDockerVolumeNames(composePath string) []string { return appbackup.ResolveDockerVolumeNames(composePath) } +func ParseComposeImages(composePath string) []string { + return appbackup.ParseComposeImages(composePath) +} + // humanizeBytes forwards to appbackup.HumanizeBytes; kept unexported so the // many in-package call sites (backup.go, crossdrive.go, restore code) need no edit. func humanizeBytes(b int64) string { @@ -107,6 +112,18 @@ func AppVolumeDumpPath(nsRoot, stackName string) string { return appbackup.AppVolumeDumpPath(nsRoot, stackName) } +func RecoveryUnitPath(nsRoot, stackName string) string { + return appbackup.RecoveryUnitPath(nsRoot, stackName) +} + +func RecoveryUnitComposePath(nsRoot, stackName string) string { + return appbackup.RecoveryUnitComposePath(nsRoot, stackName) +} + +func RecoveryUnitManifestPath(nsRoot, stackName string) string { + return appbackup.RecoveryUnitManifestPath(nsRoot, stackName) +} + func AppDataDir(nsRoot, stackName string) string { return appbackup.AppDataDir(nsRoot, stackName) } diff --git a/controller/internal/backup/backup.go b/controller/internal/backup/backup.go index 5e76c25..96dd707 100644 --- a/controller/internal/backup/backup.go +++ b/controller/internal/backup/backup.go @@ -26,6 +26,7 @@ type Manager struct { settings *settings.Settings stackProvider StackDataProvider systemDataPath string // fallback drive for SSD-only apps + version string // controller version, stamped 
into recovery-unit manifests mu sync.Mutex lastDBDump *DBDumpStatus @@ -235,9 +236,16 @@ func (m *Manager) runDBDumpsInternal(ctx context.Context) error { m.logger.Printf("[INFO] [backup] DB dump completed: %d databases, %s total (%s)", len(results), humanizeBytes(totalSize), duration.Round(time.Millisecond)) } else { - return fmt.Errorf("some database dumps failed") + // Still refresh recovery units below — a partial DB failure shouldn't leave units stale. + m.logger.Printf("[WARN] [backup] some database dumps failed; refreshing recovery units anyway") } + // Phase 2: refresh each deployed app's self-contained recovery unit (compose + manifest). + m.captureAllRecoveryUnits() + + if !allOK { + return fmt.Errorf("some database dumps failed") + } return nil } diff --git a/controller/internal/backup/recovery_unit.go b/controller/internal/backup/recovery_unit.go new file mode 100644 index 0000000..0fd772c --- /dev/null +++ b/controller/internal/backup/recovery_unit.go @@ -0,0 +1,236 @@ +package backup + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "gopkg.in/yaml.v3" +) + +// RecoveryManifest describes an app's self-contained, SECRET-FREE recovery unit (Phase 2). +// +// The unit on a drive is `/backups/primary//` and contains: +// compose/ docker-compose.yml + .felhom.yml + a SECRET-STRIPPED app.yaml +// db-dumps/ app-consistent DB dump(s) (written by the dump flow) +// volume-dumps/ named-volume tars (written by the dump flow) +// manifest.json this file +// +// The unit holds NO secret values, NO data-encrypting keys, and NOT the Docker image — only the +// pinned image tag(s) (re-pulled on restore) and the NAMES of the secret/data-key env vars. The +// secret values are recovered at restore time from the guest's own app.yaml (live on the rootfs, +// or via the PBS whole-guest snapshot) — see Restore. 
"Restore from the unit alone" is therefore +// honestly "unit + the guest's app.yaml"; SecretSource records that dependency explicitly. +type RecoveryManifest struct { + SchemaVersion int `json:"schema_version"` + AppName string `json:"app_name"` + DisplayName string `json:"display_name"` + ControllerVer string `json:"controller_version"` + CreatedAt string `json:"created_at"` + Drive string `json:"drive"` // HDD_PATH (in-guest mount) + NamespaceRoot string `json:"namespace_root"` // resolved felhom-data namespace root + ImagePins []string `json:"image_pins"` // image NOT stored — re-pulled on restore + SecretEnvVars []string `json:"secret_env_vars"` // NAMES only — recovered from guest/PBS + DataKeyEnvVars []string `json:"data_key_env_vars"` // fail-closed gate on restore + SecretSource string `json:"secret_source"` // human note: where secrets come from + ConfigFiles []string `json:"config_files"` // captured into compose/ + DBDumps []string `json:"db_dumps"` + VolumeDumps []string `json:"volume_dumps"` + Checksums map[string]string `json:"checksums"` // sha256 of captured compose/ files +} + +// SetVersion records the controller version stamped into recovery-unit manifests. +func (m *Manager) SetVersion(v string) { + m.mu.Lock() + m.version = v + m.mu.Unlock() +} + +// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the +// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps +// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image. 
+func (m *Manager) CaptureRecoveryUnit(stackName string) error { + if m.stackProvider == nil { + return fmt.Errorf("no stack provider") + } + info, ok := m.stackProvider.GetStackRecoveryInfo(stackName) + if !ok { + return fmt.Errorf("stack %q not found", stackName) + } + drivePath := m.GetAppDrivePath(stackName) + if drivePath == "" || !filepath.IsAbs(drivePath) { + return fmt.Errorf("cannot determine absolute drive path for %s", stackName) + } + nsRoot := m.namespaceRoot(drivePath) + composeDir := RecoveryUnitComposePath(nsRoot, stackName) + if err := os.MkdirAll(composeDir, 0755); err != nil { + return fmt.Errorf("creating recovery-unit compose dir: %w", err) + } + + checksums := make(map[string]string) + var configFiles []string + + // Capture docker-compose.yml + .felhom.yml verbatim (whichever exist). + for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} { + src := filepath.Join(info.StackDir, fname) + if _, err := os.Stat(src); err != nil { + continue + } + sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname)) + if err != nil { + return fmt.Errorf("capturing %s: %w", fname, err) + } + checksums[fname] = sum + configFiles = append(configFiles, fname) + } + + // Write the SECRET-STRIPPED app.yaml (non-secret env only). 
+ sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info) + if err != nil { + return fmt.Errorf("writing stripped app.yaml: %w", err) + } + checksums["app.yaml"] = sum + configFiles = append(configFiles, "app.yaml") + + manifest := &RecoveryManifest{ + SchemaVersion: 1, + AppName: stackName, + DisplayName: info.DisplayName, + ControllerVer: m.versionLocked(), + CreatedAt: time.Now().UTC().Format(time.RFC3339), + Drive: drivePath, + NamespaceRoot: nsRoot, + ImagePins: info.ImagePins, + SecretEnvVars: info.SecretEnvVars, + DataKeyEnvVars: info.DataKeyEnvVars, + SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit", + ConfigFiles: configFiles, + DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"), + VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"), + Checksums: checksums, + } + if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil { + return fmt.Errorf("writing manifest: %w", err) + } + + m.logger.Printf("[INFO] [backup] Recovery unit captured for %s → %s (images=%d, secrets-referenced=%d, data_keys=%d)", + stackName, RecoveryUnitPath(nsRoot, stackName), len(info.ImagePins), len(info.SecretEnvVars), len(info.DataKeyEnvVars)) + return nil +} + +// captureAllRecoveryUnits refreshes the recovery unit for every deployed stack. Best-effort: +// a per-app failure is logged and does not abort the others. 
+func (m *Manager) captureAllRecoveryUnits() { + if m.stackProvider == nil { + return + } + for _, stack := range m.stackProvider.ListDeployedStacks() { + drivePath := m.GetAppDrivePath(stack.Name) + if m.settings != nil && (m.settings.IsDisconnected(drivePath) || m.settings.IsDecommissioned(drivePath)) { + continue // drive not writable — skip, the existing unit stays as-is + } + if err := m.CaptureRecoveryUnit(stack.Name); err != nil { + m.logger.Printf("[WARN] [backup] Recovery unit capture failed for %s: %v", stack.Name, err) + } + } +} + +func (m *Manager) versionLocked() string { + m.mu.Lock() + defer m.mu.Unlock() + return m.version +} + +// strippedAppYaml is the on-disk shape of the secret-free app.yaml captured into the unit. +type strippedAppYaml struct { + Deployed bool `yaml:"deployed"` + Env map[string]string `yaml:"env"` +} + +// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256. +func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) { + body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv}) + if err != nil { + return "", err + } + header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" + + "# Secret/data-key values are intentionally omitted; recover them at restore from the\n" + + "# guest's own app.yaml (live rootfs, or the PBS whole-guest snapshot). Stripped names:\n" + if len(info.SecretEnvVars) > 0 { + header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n" + } + content := []byte(header + string(body)) + if err := atomicWrite(dst, content, 0600); err != nil { + return "", err + } + sum := sha256.Sum256(content) + return hex.EncodeToString(sum[:]), nil +} + +// writeManifest writes the manifest JSON atomically. 
+func writeManifest(dst string, manifest *RecoveryManifest) error { + data, err := json.MarshalIndent(manifest, "", " ") + if err != nil { + return err + } + return atomicWrite(dst, append(data, '\n'), 0644) +} + +// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes. +func copyFileChecksum(src, dst string) (string, error) { + data, err := os.ReadFile(src) + if err != nil { + return "", err + } + if err := atomicWrite(dst, data, 0644); err != nil { + return "", err + } + sum := sha256.Sum256(data) + return hex.EncodeToString(sum[:]), nil +} + +// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent). +func listFileNames(dir, suffix string) []string { + entries, err := os.ReadDir(dir) + if err != nil { + return nil + } + var names []string + for _, e := range entries { + if !e.IsDir() && strings.HasSuffix(e.Name(), suffix) { + names = append(names, e.Name()) + } + } + sort.Strings(names) + return names +} + +// atomicWrite writes data to path via a .tmp file + rename. +func atomicWrite(path string, data []byte, perm os.FileMode) error { + tmp := path + ".tmp" + f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm) + if err != nil { + return err + } + if _, err := io.Copy(f, strings.NewReader(string(data))); err != nil { + f.Close() + os.Remove(tmp) + return err + } + if err := f.Close(); err != nil { + os.Remove(tmp) + return err + } + if err := os.Rename(tmp, path); err != nil { + os.Remove(tmp) + return err + } + return nil +} diff --git a/controller/internal/backup/recovery_unit_test.go b/controller/internal/backup/recovery_unit_test.go new file mode 100644 index 0000000..dfc1aa6 --- /dev/null +++ b/controller/internal/backup/recovery_unit_test.go @@ -0,0 +1,146 @@ +package backup + +import ( + "encoding/json" + "io" + "io/fs" + "log" + "os" + "path/filepath" + "strings" + "testing" +) + +// fakeRecoveryProvider is a minimal StackDataProvider for the capture test. 
+type fakeRecoveryProvider struct { + info RecoveryInfo + hdd string +} + +func (f *fakeRecoveryProvider) GetStackComposePath(string) (string, bool) { + return filepath.Join(f.info.StackDir, "docker-compose.yml"), true +} +func (f *fakeRecoveryProvider) ListDeployedStacks() []StackSummary { return nil } +func (f *fakeRecoveryProvider) GetStackHDDMounts(string) []string { return nil } +func (f *fakeRecoveryProvider) GetStackHDDPath(string) string { return f.hdd } +func (f *fakeRecoveryProvider) GetDockerVolumes(string) []string { return nil } +func (f *fakeRecoveryProvider) StopStack(string) error { return nil } +func (f *fakeRecoveryProvider) StartStack(string) error { return nil } +func (f *fakeRecoveryProvider) RefreshAndIsRunning(string) bool { return false } +func (f *fakeRecoveryProvider) GetStackRecoveryInfo(string) (RecoveryInfo, bool) { + return f.info, true +} + +// TestCaptureRecoveryUnitIsSecretFree proves the captured unit (a) contains compose+config+manifest, +// (b) enumerates the existing dumps, and (c) is SECRET-FREE: a secret value present in the SOURCE +// app.yaml does NOT appear anywhere in the unit, because the capture writes the stripped NonSecretEnv +// (not the raw app.yaml). The manifest records the secret NAMES + data_key flag for recovery-from-guest. +func TestCaptureRecoveryUnitIsSecretFree(t *testing.T) { + const secretVal = "SUPERSECRETVALUE-do-not-leak" + tmp := t.TempDir() + stackDir := filepath.Join(tmp, "stack") + drive := filepath.Join(tmp, "drive") // in-guest namespace root (basename need not be felhom-data) + if err := os.MkdirAll(stackDir, 0755); err != nil { + t.Fatal(err) + } + + // Source stack files — the raw app.yaml DELIBERATELY holds a secret to prove it's not copied. 
+ mustWrite(t, filepath.Join(stackDir, "docker-compose.yml"), + "services:\n app:\n image: example/app:1.2.3\n") + mustWrite(t, filepath.Join(stackDir, ".felhom.yml"), "display_name: Example\n") + mustWrite(t, filepath.Join(stackDir, "app.yaml"), + "deployed: true\nenv:\n DB_PASSWORD: "+secretVal+"\n SUBDOMAIN: example\n") + + // Pre-existing dumps (written by the dump flow before capture). + mustWrite(t, filepath.Join(AppDBDumpPath(drive, "example"), "example-postgres.sql"), "dump") + mustWrite(t, filepath.Join(AppVolumeDumpPath(drive, "example"), "example_data.tar"), "tar") + + // RecoveryInfo as the adapter would build it: secret values already stripped from NonSecretEnv. + info := RecoveryInfo{ + StackDir: stackDir, + DisplayName: "Example", + ImagePins: []string{"example/app:1.2.3"}, + NonSecretEnv: map[string]string{"SUBDOMAIN": "example", "HDD_PATH": drive}, + SecretEnvVars: []string{"DB_PASSWORD", "SECRET_KEY"}, + DataKeyEnvVars: []string{"SECRET_KEY"}, + } + m := &Manager{ + logger: log.New(io.Discard, "", 0), + systemDataPath: filepath.Join(tmp, "system"), // != drive ⇒ drive treated as in-guest, nsRoot = drive + stackProvider: &fakeRecoveryProvider{info: info, hdd: drive}, + version: "vtest", + } + + if err := m.CaptureRecoveryUnit("example"); err != nil { + t.Fatalf("capture: %v", err) + } + + composeDir := RecoveryUnitComposePath(drive, "example") + for _, f := range []string{"docker-compose.yml", ".felhom.yml", "app.yaml"} { + if _, err := os.Stat(filepath.Join(composeDir, f)); err != nil { + t.Errorf("missing captured config %s: %v", f, err) + } + } + + // Manifest structure. 
+ mfData, err := os.ReadFile(RecoveryUnitManifestPath(drive, "example")) + if err != nil { + t.Fatalf("manifest: %v", err) + } + var man RecoveryManifest + if err := json.Unmarshal(mfData, &man); err != nil { + t.Fatalf("manifest parse: %v", err) + } + if man.AppName != "example" || man.ControllerVer != "vtest" { + t.Errorf("manifest meta: app=%q ver=%q", man.AppName, man.ControllerVer) + } + if len(man.ImagePins) != 1 || man.ImagePins[0] != "example/app:1.2.3" { + t.Errorf("image pins: %v", man.ImagePins) + } + if len(man.SecretEnvVars) != 2 { + t.Errorf("secret env-var names: %v (want 2)", man.SecretEnvVars) + } + if len(man.DataKeyEnvVars) != 1 || man.DataKeyEnvVars[0] != "SECRET_KEY" { + t.Errorf("data-key env-vars: %v", man.DataKeyEnvVars) + } + if len(man.DBDumps) != 1 || len(man.VolumeDumps) != 1 { + t.Errorf("dumps enumerated: db=%v vol=%v", man.DBDumps, man.VolumeDumps) + } + + // app.yaml in the unit must carry the non-secret env but NOT the secret value. + appy := mustRead(t, filepath.Join(composeDir, "app.yaml")) + if !strings.Contains(appy, "SUBDOMAIN") { + t.Errorf("stripped app.yaml missing non-secret env: %s", appy) + } + + // SECRET-FREE invariant: the secret value must not appear ANYWHERE in the unit. 
+ unitRoot := RecoveryUnitPath(drive, "example") + _ = filepath.WalkDir(unitRoot, func(path string, d fs.DirEntry, err error) error { + if err != nil || d.IsDir() { + return nil + } + if strings.Contains(mustRead(t, path), secretVal) { + t.Errorf("SECRET LEAK: %q found in %s", secretVal, path) + } + return nil + }) +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0644); err != nil { + t.Fatal(err) + } +} + +func mustRead(t *testing.T, path string) string { + t.Helper() + b, err := os.ReadFile(path) + if err != nil { + t.Fatal(err) + } + return string(b) +} diff --git a/controller/internal/stacks/metadata.go b/controller/internal/stacks/metadata.go index 0f4bae8..c5055c4 100644 --- a/controller/internal/stacks/metadata.go +++ b/controller/internal/stacks/metadata.go @@ -71,6 +71,23 @@ type DeployField struct { Description string `yaml:"description" json:"description"` LockedAfterDeploy bool `yaml:"locked_after_deploy" json:"locked_after_deploy"` Options []SelectOption `yaml:"options" json:"options,omitempty"` + // DataKey marks a field as a DATA-ENCRYPTING key (e.g. AdventureLog's "Titkosítási kulcs"): + // the app encrypts stored data with it, so regenerating it would render restored data + // unreadable. It is a fail-closed annotation only — the recovery unit never stores secrets; + // at restore the controller refuses (rather than silently restoring garbage) if a data_key + // app's key cannot be recovered from the guest's app.yaml (live or via PBS). See Phase 2. + DataKey bool `yaml:"data_key,omitempty" json:"data_key,omitempty"` +} + +// DataKeyEnvVars returns the env-var names of fields marked data_key:true. 
+func (m *Metadata) DataKeyEnvVars() []string { + var out []string + for _, f := range m.DeployFields { + if f.DataKey { + out = append(out, f.EnvVar) + } + } + return out } // SelectOption is a choice for "select" type fields.