package backup import ( "crypto/sha256" "encoding/hex" "encoding/json" "fmt" "io" "os" "path/filepath" "sort" "strings" "time" "gopkg.in/yaml.v3" ) // RecoveryManifest describes an app's self-contained, SECRET-FREE recovery unit (Phase 2). // // The unit on a drive is `/backups/primary//` and contains: // compose/ docker-compose.yml + .felhom.yml + a SECRET-STRIPPED app.yaml // db-dumps/ app-consistent DB dump(s) (written by the dump flow) // volume-dumps/ named-volume tars (written by the dump flow) // manifest.json this file // // The unit holds NO secret values, NO data-encrypting keys, and NOT the Docker image — only the // pinned image tag(s) (re-pulled on restore) and the NAMES of the secret/data-key env vars. The // secret values are recovered at restore time from the guest's own app.yaml (live on the rootfs, // or via the PBS whole-guest snapshot) — see Restore. "Restore from the unit alone" is therefore // honestly "unit + the guest's app.yaml"; SecretSource records that dependency explicitly. 
type RecoveryManifest struct { SchemaVersion int `json:"schema_version"` AppName string `json:"app_name"` DisplayName string `json:"display_name"` ControllerVer string `json:"controller_version"` CreatedAt string `json:"created_at"` Drive string `json:"drive"` // HDD_PATH (in-guest mount) NamespaceRoot string `json:"namespace_root"` // resolved felhom-data namespace root ImagePins []string `json:"image_pins"` // image NOT stored — re-pulled on restore SecretEnvVars []string `json:"secret_env_vars"` // NAMES only — recovered from guest/PBS DataKeyEnvVars []string `json:"data_key_env_vars"` // fail-closed gate on restore SecretSource string `json:"secret_source"` // human note: where secrets come from ConfigFiles []string `json:"config_files"` // captured into compose/ DBDumps []string `json:"db_dumps"` VolumeDumps []string `json:"volume_dumps"` Checksums map[string]string `json:"checksums"` // sha256 of captured compose/ files } // SetVersion records the controller version stamped into recovery-unit manifests. func (m *Manager) SetVersion(v string) { m.mu.Lock() m.version = v m.mu.Unlock() } // CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the // compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps // already present, and writes manifest.json. It NEVER writes a secret value or the Docker image. 
func (m *Manager) CaptureRecoveryUnit(stackName string) error { if m.stackProvider == nil { return fmt.Errorf("no stack provider") } info, ok := m.stackProvider.GetStackRecoveryInfo(stackName) if !ok { return fmt.Errorf("stack %q not found", stackName) } drivePath := m.GetAppDrivePath(stackName) if drivePath == "" || !filepath.IsAbs(drivePath) { return fmt.Errorf("cannot determine absolute drive path for %s", stackName) } nsRoot := m.namespaceRoot(drivePath) composeDir := RecoveryUnitComposePath(nsRoot, stackName) if err := os.MkdirAll(composeDir, 0755); err != nil { return fmt.Errorf("creating recovery-unit compose dir: %w", err) } checksums := make(map[string]string) var configFiles []string // Capture docker-compose.yml + .felhom.yml verbatim (whichever exist). for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} { src := filepath.Join(info.StackDir, fname) if _, err := os.Stat(src); err != nil { continue } sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname)) if err != nil { return fmt.Errorf("capturing %s: %w", fname, err) } checksums[fname] = sum configFiles = append(configFiles, fname) } // Write the SECRET-STRIPPED app.yaml (non-secret env only). 
sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info) if err != nil { return fmt.Errorf("writing stripped app.yaml: %w", err) } checksums["app.yaml"] = sum configFiles = append(configFiles, "app.yaml") manifest := &RecoveryManifest{ SchemaVersion: 1, AppName: stackName, DisplayName: info.DisplayName, ControllerVer: m.versionLocked(), CreatedAt: time.Now().UTC().Format(time.RFC3339), Drive: drivePath, NamespaceRoot: nsRoot, ImagePins: info.ImagePins, SecretEnvVars: info.SecretEnvVars, DataKeyEnvVars: info.DataKeyEnvVars, SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit", ConfigFiles: configFiles, DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"), VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"), Checksums: checksums, } if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil { return fmt.Errorf("writing manifest: %w", err) } m.logger.Printf("[INFO] [backup] Recovery unit captured for %s → %s (images=%d, secrets-referenced=%d, data_keys=%d)", stackName, RecoveryUnitPath(nsRoot, stackName), len(info.ImagePins), len(info.SecretEnvVars), len(info.DataKeyEnvVars)) return nil } // captureAllRecoveryUnits refreshes the recovery unit for every deployed stack. Best-effort: // a per-app failure is logged and does not abort the others. 
func (m *Manager) captureAllRecoveryUnits() { if m.stackProvider == nil { return } for _, stack := range m.stackProvider.ListDeployedStacks() { drivePath := m.GetAppDrivePath(stack.Name) if m.settings != nil && (m.settings.IsDisconnected(drivePath) || m.settings.IsDecommissioned(drivePath)) { continue // drive not writable — skip, the existing unit stays as-is } if err := m.CaptureRecoveryUnit(stack.Name); err != nil { m.logger.Printf("[WARN] [backup] Recovery unit capture failed for %s: %v", stack.Name, err) } } } func (m *Manager) versionLocked() string { m.mu.Lock() defer m.mu.Unlock() return m.version } // strippedAppYaml is the on-disk shape of the secret-free app.yaml captured into the unit. type strippedAppYaml struct { Deployed bool `yaml:"deployed"` Env map[string]string `yaml:"env"` } // writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256. func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) { body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv}) if err != nil { return "", err } header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" + "# Secret/data-key values are intentionally omitted; recover them at restore from the\n" + "# guest's own app.yaml (live rootfs, or the PBS whole-guest snapshot). Stripped names:\n" if len(info.SecretEnvVars) > 0 { header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n" } content := []byte(header + string(body)) if err := atomicWrite(dst, content, 0600); err != nil { return "", err } sum := sha256.Sum256(content) return hex.EncodeToString(sum[:]), nil } // writeManifest writes the manifest JSON atomically. func writeManifest(dst string, manifest *RecoveryManifest) error { data, err := json.MarshalIndent(manifest, "", " ") if err != nil { return err } return atomicWrite(dst, append(data, '\n'), 0644) } // copyFileChecksum copies src→dst and returns the sha256 of the copied bytes. 
func copyFileChecksum(src, dst string) (string, error) {
	// The whole file is read into memory so the checksum is computed over
	// exactly the bytes that get written to dst.
	data, err := os.ReadFile(src)
	if err != nil {
		return "", err
	}
	if err := atomicWrite(dst, data, 0644); err != nil {
		return "", err
	}
	sum := sha256.Sum256(data)
	return hex.EncodeToString(sum[:]), nil
}

// listFileNames returns the names of files with the given suffix in dir (sorted, none if absent).
// Directories are skipped; any error reading dir yields a nil result.
func listFileNames(dir, suffix string) []string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil
	}
	var names []string
	for _, e := range entries {
		if !e.IsDir() && strings.HasSuffix(e.Name(), suffix) {
			names = append(names, e.Name())
		}
	}
	sort.Strings(names)
	return names
}

// atomicWrite writes data to path via a .tmp file + rename.
//
// The temp file (path + ".tmp") is created with perm, filled, flushed to stable
// storage with Sync, closed, and only then renamed over path. Syncing before the
// rename keeps a crash or power loss from leaving path pointing at an empty or
// truncated file. On any failure the temp file is removed and the error returned;
// an existing file at path is left untouched.
func atomicWrite(path string, data []byte, perm os.FileMode) error {
	tmp := path + ".tmp"
	f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
	if err != nil {
		return err
	}
	if _, err := io.Copy(f, strings.NewReader(string(data))); err != nil {
		f.Close()
		os.Remove(tmp)
		return err
	}
	// Flush file contents to disk before the rename makes them visible.
	if err := f.Sync(); err != nil {
		f.Close()
		os.Remove(tmp)
		return err
	}
	if err := f.Close(); err != nil {
		os.Remove(tmp)
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		os.Remove(tmp)
		return err
	}
	return nil
}