70eb521cd0
Each app's on-drive backup becomes a self-contained, recreatable recovery unit: compose/ (docker-compose.yml + .felhom.yml + secret-stripped app.yaml) alongside the existing db-dumps/ + volume-dumps/, plus a secret-free manifest.json (image pins, secret env-var NAMES, data_key names, checksums). The unit stores no secret value, no data-key, and not the image — secrets are recovered at restore from the guest's own app.yaml (live/PBS), never regenerated. - appbackup: RecoveryUnit* path helpers, RecoveryInfo + GetStackRecoveryInfo, ParseComposeImages; AppDBDump/Volume refactored onto RecoveryUnitPath. - backup: recovery_unit.go (manifest + CaptureRecoveryUnit), wired into RunDBDumps; capture test proves secret-free. - stacks: DeployField.DataKey + Metadata.DataKeyEnvVars(); main.go stackAdapter implements GetStackRecoveryInfo (excludes secret-named + encrypted values). - Restore-from-unit recreate + fail-closed gate + live AdventureLog validation: next. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
237 lines
8.5 KiB
Go
237 lines
8.5 KiB
Go
package backup
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"gopkg.in/yaml.v3"
|
|
)
|
|
|
|
// RecoveryManifest is the manifest.json schema of an app's self-contained, SECRET-FREE
// recovery unit (Phase 2).
//
// On a drive the unit lives at `<nsRoot>/backups/primary/<app>/` and holds:
//
//	compose/        docker-compose.yml + .felhom.yml + a SECRET-STRIPPED app.yaml
//	db-dumps/       app-consistent DB dump(s) (written by the dump flow)
//	volume-dumps/   named-volume tars (written by the dump flow)
//	manifest.json   this file
//
// The unit carries NO secret values, NO data-encrypting keys, and NOT the Docker image — only
// the pinned image tag(s) (re-pulled on restore) and the NAMES of the secret/data-key env vars.
// Secret values come back at restore time from the guest's own app.yaml (live on the rootfs,
// or via the PBS whole-guest snapshot) — see Restore. "Restore from the unit alone" therefore
// honestly means "unit + the guest's app.yaml"; SecretSource spells that dependency out.
type RecoveryManifest struct {
	// Identity and provenance of the unit.
	SchemaVersion int    `json:"schema_version"`
	AppName       string `json:"app_name"`
	DisplayName   string `json:"display_name"`
	ControllerVer string `json:"controller_version"`
	CreatedAt     string `json:"created_at"`

	// Drive is the HDD_PATH (in-guest mount) the unit was written to.
	Drive string `json:"drive"`
	// NamespaceRoot is the resolved felhom-data namespace root on that drive.
	NamespaceRoot string `json:"namespace_root"`

	// ImagePins lists the pinned image references; the image itself is NOT stored and is
	// re-pulled on restore.
	ImagePins []string `json:"image_pins"`
	// SecretEnvVars holds NAMES only — the values are recovered from the guest/PBS.
	SecretEnvVars []string `json:"secret_env_vars"`
	// DataKeyEnvVars feeds the fail-closed gate on restore.
	DataKeyEnvVars []string `json:"data_key_env_vars"`
	// SecretSource is a human-readable note describing where the secrets come from.
	SecretSource string `json:"secret_source"`

	// ConfigFiles names the files captured into compose/.
	ConfigFiles []string `json:"config_files"`
	DBDumps     []string `json:"db_dumps"`
	VolumeDumps []string `json:"volume_dumps"`
	// Checksums maps a captured compose/ file name to its sha256.
	Checksums map[string]string `json:"checksums"`
}
|
|
|
|
// SetVersion records the controller version stamped into recovery-unit manifests.
|
|
func (m *Manager) SetVersion(v string) {
|
|
m.mu.Lock()
|
|
m.version = v
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
|
|
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
|
|
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
|
|
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
|
|
if m.stackProvider == nil {
|
|
return fmt.Errorf("no stack provider")
|
|
}
|
|
info, ok := m.stackProvider.GetStackRecoveryInfo(stackName)
|
|
if !ok {
|
|
return fmt.Errorf("stack %q not found", stackName)
|
|
}
|
|
drivePath := m.GetAppDrivePath(stackName)
|
|
if drivePath == "" || !filepath.IsAbs(drivePath) {
|
|
return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
|
|
}
|
|
nsRoot := m.namespaceRoot(drivePath)
|
|
composeDir := RecoveryUnitComposePath(nsRoot, stackName)
|
|
if err := os.MkdirAll(composeDir, 0755); err != nil {
|
|
return fmt.Errorf("creating recovery-unit compose dir: %w", err)
|
|
}
|
|
|
|
checksums := make(map[string]string)
|
|
var configFiles []string
|
|
|
|
// Capture docker-compose.yml + .felhom.yml verbatim (whichever exist).
|
|
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
|
|
src := filepath.Join(info.StackDir, fname)
|
|
if _, err := os.Stat(src); err != nil {
|
|
continue
|
|
}
|
|
sum, err := copyFileChecksum(src, filepath.Join(composeDir, fname))
|
|
if err != nil {
|
|
return fmt.Errorf("capturing %s: %w", fname, err)
|
|
}
|
|
checksums[fname] = sum
|
|
configFiles = append(configFiles, fname)
|
|
}
|
|
|
|
// Write the SECRET-STRIPPED app.yaml (non-secret env only).
|
|
sum, err := writeStrippedAppYaml(filepath.Join(composeDir, "app.yaml"), info)
|
|
if err != nil {
|
|
return fmt.Errorf("writing stripped app.yaml: %w", err)
|
|
}
|
|
checksums["app.yaml"] = sum
|
|
configFiles = append(configFiles, "app.yaml")
|
|
|
|
manifest := &RecoveryManifest{
|
|
SchemaVersion: 1,
|
|
AppName: stackName,
|
|
DisplayName: info.DisplayName,
|
|
ControllerVer: m.versionLocked(),
|
|
CreatedAt: time.Now().UTC().Format(time.RFC3339),
|
|
Drive: drivePath,
|
|
NamespaceRoot: nsRoot,
|
|
ImagePins: info.ImagePins,
|
|
SecretEnvVars: info.SecretEnvVars,
|
|
DataKeyEnvVars: info.DataKeyEnvVars,
|
|
SecretSource: "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
|
|
ConfigFiles: configFiles,
|
|
DBDumps: listFileNames(AppDBDumpPath(nsRoot, stackName), ".sql"),
|
|
VolumeDumps: listFileNames(AppVolumeDumpPath(nsRoot, stackName), ".tar"),
|
|
Checksums: checksums,
|
|
}
|
|
if err := writeManifest(RecoveryUnitManifestPath(nsRoot, stackName), manifest); err != nil {
|
|
return fmt.Errorf("writing manifest: %w", err)
|
|
}
|
|
|
|
m.logger.Printf("[INFO] [backup] Recovery unit captured for %s → %s (images=%d, secrets-referenced=%d, data_keys=%d)",
|
|
stackName, RecoveryUnitPath(nsRoot, stackName), len(info.ImagePins), len(info.SecretEnvVars), len(info.DataKeyEnvVars))
|
|
return nil
|
|
}
|
|
|
|
// captureAllRecoveryUnits refreshes the recovery unit for every deployed stack. Best-effort:
|
|
// a per-app failure is logged and does not abort the others.
|
|
func (m *Manager) captureAllRecoveryUnits() {
|
|
if m.stackProvider == nil {
|
|
return
|
|
}
|
|
for _, stack := range m.stackProvider.ListDeployedStacks() {
|
|
drivePath := m.GetAppDrivePath(stack.Name)
|
|
if m.settings != nil && (m.settings.IsDisconnected(drivePath) || m.settings.IsDecommissioned(drivePath)) {
|
|
continue // drive not writable — skip, the existing unit stays as-is
|
|
}
|
|
if err := m.CaptureRecoveryUnit(stack.Name); err != nil {
|
|
m.logger.Printf("[WARN] [backup] Recovery unit capture failed for %s: %v", stack.Name, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Manager) versionLocked() string {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
return m.version
|
|
}
|
|
|
|
// strippedAppYaml mirrors the YAML layout of the secret-free app.yaml that is captured into
// the recovery unit.
type strippedAppYaml struct {
	// Deployed is serialized under the "deployed" key.
	Deployed bool `yaml:"deployed"`
	// Env is serialized under the "env" key and carries env-var name → value pairs.
	Env map[string]string `yaml:"env"`
}
|
|
|
|
// writeStrippedAppYaml writes a secret-free app.yaml (non-secret env only) and returns its sha256.
|
|
func writeStrippedAppYaml(dst string, info RecoveryInfo) (string, error) {
|
|
body, err := yaml.Marshal(strippedAppYaml{Deployed: true, Env: info.NonSecretEnv})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
header := "# Captured by felhom-controller recovery unit — SECRET-FREE.\n" +
|
|
"# Secret/data-key values are intentionally omitted; recover them at restore from the\n" +
|
|
"# guest's own app.yaml (live rootfs, or the PBS whole-guest snapshot). Stripped names:\n"
|
|
if len(info.SecretEnvVars) > 0 {
|
|
header += "# " + strings.Join(info.SecretEnvVars, ", ") + "\n"
|
|
}
|
|
content := []byte(header + string(body))
|
|
if err := atomicWrite(dst, content, 0600); err != nil {
|
|
return "", err
|
|
}
|
|
sum := sha256.Sum256(content)
|
|
return hex.EncodeToString(sum[:]), nil
|
|
}
|
|
|
|
// writeManifest writes the manifest JSON atomically.
|
|
func writeManifest(dst string, manifest *RecoveryManifest) error {
|
|
data, err := json.MarshalIndent(manifest, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return atomicWrite(dst, append(data, '\n'), 0644)
|
|
}
|
|
|
|
// copyFileChecksum copies src→dst and returns the sha256 of the copied bytes.
|
|
func copyFileChecksum(src, dst string) (string, error) {
|
|
data, err := os.ReadFile(src)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if err := atomicWrite(dst, data, 0644); err != nil {
|
|
return "", err
|
|
}
|
|
sum := sha256.Sum256(data)
|
|
return hex.EncodeToString(sum[:]), nil
|
|
}
|
|
|
|
// listFileNames collects the names of the non-directory entries in dir whose name ends with
// suffix and returns them sorted. If dir cannot be read (for example because it does not
// exist), or nothing matches, the result is nil.
func listFileNames(dir, suffix string) []string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil
	}
	var matched []string
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		name := entry.Name()
		if !strings.HasSuffix(name, suffix) {
			continue
		}
		matched = append(matched, name)
	}
	sort.Strings(matched)
	return matched
}
|
|
|
|
// atomicWrite replaces the file at path with data by writing a sibling "<path>.tmp" file
// and renaming it over path, so a reader of path sees either the old or the new content,
// never a half-written file.
//
// The temp file is created with perm (subject to the process umask), truncated if a stale
// one is left over from an earlier attempt, and flushed with Sync before the rename so the
// new name does not end up pointing at data that was never committed to the drive.
//
// On any failure the temp file is removed (best-effort) and the first error encountered is
// returned; path itself is left untouched in that case.
func atomicWrite(path string, data []byte, perm os.FileMode) error {
	tmp := path + ".tmp"
	f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
	if err != nil {
		return err
	}

	// Write the bytes directly; no intermediate string copy is needed.
	n, err := f.Write(data)
	if err == nil && n < len(data) {
		err = io.ErrShortWrite // defensive: never publish a truncated payload
	}
	if err == nil {
		err = f.Sync()
	}
	// Always close the handle, but keep the earliest error.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err == nil {
		err = os.Rename(tmp, path)
	}
	if err != nil {
		_ = os.Remove(tmp) // best-effort cleanup; the original error is what matters
		return err
	}
	return nil
}
|