Files
felhom-controller/controller/internal/backup/recovery_unit.go
T
admin d2071430ea v0.55.0: Phase 3 — auto off-drive Tier 2 (rootfs-headroom guard)
Tier 2 rsync-mirrors each HDD app's recovery unit + appdata to a DIFFERENT physical
disk (the only off-drive protection bind-mounted userdata can get; PBS can't reach it).
Auto-enabled, auto-target: prefer another registered drive (different physical disk via
system.SamePhysicalDevice), else the internal SSD for SMALL units only — with a
size-aware headroom guard that REFUSES rather than fill the ~8G guest rootfs, recording
an honest "needs 2nd HDD" status. Status persisted via the surviving CrossDriveBackup;
"2. mentés" UI card now populated. Daily tier2-backup job + POST /api/backup/tier2.

- backup/tier2.go (engine+selection+headroom), tier2_test.go (headroom arithmetic)
- system.SamePhysicalDevice (linux Stat_t.Dev + stub)
- handlers.go Tier2 UI population + tier2DestLabel; backups.html honest no-target reason
- fixed stale TestBackupCopiesOnPath (old felhom-data layout -> in-guest layout)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 13:24:49 +02:00

291 lines
10 KiB
Go

package backup
import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"time"
"gopkg.in/yaml.v3"
)
// RecoveryManifest is the JSON document (manifest.json) stored at the root of an app's
// self-contained, SECRET-FREE recovery unit (Phase 2).
//
// Layout of a unit on a drive, `<nsRoot>/backups/primary/<app>/`:
//
//	compose/       docker-compose.yml + .felhom.yml + a SECRET-STRIPPED app.yaml
//	db-dumps/      app-consistent DB dump(s) (written by the dump flow)
//	volume-dumps/  named-volume tars (written by the dump flow)
//	manifest.json  this document
//
// What a unit deliberately does NOT carry: secret values, data-encrypting keys and the Docker
// image itself. Only the pinned image tag(s) (re-pulled on restore) and the NAMES of the
// secret/data-key env vars are recorded. At restore time the secret values come from the
// guest's own app.yaml (live on the rootfs, or via the PBS whole-guest snapshot) — see Restore.
// "Restore from the unit alone" therefore honestly means "unit + the guest's app.yaml", and
// SecretSource spells that dependency out.
type RecoveryManifest struct {
	// SchemaVersion is the manifest format revision (CaptureRecoveryUnit writes 1).
	SchemaVersion int `json:"schema_version"`

	// AppName is the stack name the unit belongs to.
	AppName string `json:"app_name"`

	// DisplayName is the human-readable name taken from the stack's recovery info.
	DisplayName string `json:"display_name"`

	// ControllerVer is the controller version that wrote the manifest; CaptureRecoveryUnit
	// rewrites the unit when it differs from the running version.
	ControllerVer string `json:"controller_version"`

	// CreatedAt is the UTC write time in RFC 3339 form.
	CreatedAt string `json:"created_at"`

	// Drive is the HDD_PATH (in-guest mount) the unit was written to.
	Drive string `json:"drive"`

	// NamespaceRoot is the resolved felhom-data namespace root on that drive.
	NamespaceRoot string `json:"namespace_root"`

	// ImagePins lists the pinned image references; the image itself is NOT stored and is
	// re-pulled on restore.
	ImagePins []string `json:"image_pins"`

	// SecretEnvVars holds NAMES only — the values are recovered from the guest / PBS.
	SecretEnvVars []string `json:"secret_env_vars"`

	// DataKeyEnvVars holds the names of data-key variables; it is the fail-closed gate on restore.
	DataKeyEnvVars []string `json:"data_key_env_vars"`

	// SecretSource is a human-readable note saying where the secret values come from.
	SecretSource string `json:"secret_source"`

	// ConfigFiles names the files captured into compose/.
	ConfigFiles []string `json:"config_files"`

	// DBDumps names the database dump files found in the unit.
	DBDumps []string `json:"db_dumps"`

	// VolumeDumps names the volume tar files found in the unit.
	VolumeDumps []string `json:"volume_dumps"`

	// Checksums maps each captured compose/ file name to the hex SHA-256 of its content.
	Checksums map[string]string `json:"checksums"`
}
// SetVersion stores v as the controller version that gets stamped into recovery-unit
// manifests. The write happens while holding m.mu.
func (m *Manager) SetVersion(v string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.version = v
}
// SetTier2Notifier stores fn as the Manager's Tier 2 notification callback; it is meant to be
// invoked after each Tier 2 copy (the call site is not in this file). Going by the parameter
// names, fn is handed the stack name, a destination label, the copy duration and the copy error.
//
// NOTE(review): unlike SetVersion, this assignment does not take m.mu — confirm it is only
// called during setup, before anything can read the callback concurrently.
func (m *Manager) SetTier2Notifier(fn func(stackName, destLabel string, dur time.Duration, err error)) {
m.tier2Notify = fn
}
// CaptureRecoveryUnit creates or refreshes the recovery unit of the stack named stackName on
// the drive that stack lives on.
//
// Steps, in order:
//  1. Resolve the stack's recovery info, its drive path and the namespace root.
//  2. Stage, purely in memory, the files destined for compose/: whichever of
//     docker-compose.yml and .felhom.yml can be read from the stack directory (copied
//     verbatim), plus an app.yaml rendered by buildStrippedAppYaml. A SHA-256 is recorded
//     for every staged file.
//  3. List the *.sql and *.tar dumps already present in the unit's dump directories.
//  4. If the manifest on disk already records the same controller version, the same
//     checksums and the same dump lists, return nil without writing anything — this keeps
//     the periodic status refresh from thrashing a spinning USB drive.
//  5. Otherwise write the staged files and then manifest.json, and log a summary line.
//
// Secrets appear in the manifest and in the rendered app.yaml by NAME only; the Docker image
// is never stored, only its pins.
//
// An error is returned when no stack provider is configured, the stack is unknown, the drive
// path is empty or not absolute, or a directory creation / file write fails.
func (m *Manager) CaptureRecoveryUnit(stackName string) error {
	if m.stackProvider == nil {
		return fmt.Errorf("no stack provider")
	}
	info, found := m.stackProvider.GetStackRecoveryInfo(stackName)
	if !found {
		return fmt.Errorf("stack %q not found", stackName)
	}
	drive := m.GetAppDrivePath(stackName)
	if drive == "" || !filepath.IsAbs(drive) {
		return fmt.Errorf("cannot determine absolute drive path for %s", stackName)
	}
	root := m.namespaceRoot(drive)

	// Everything is staged in memory first so the checksums can be compared before any write.
	type pending struct {
		name    string
		content []byte
		mode    os.FileMode
	}
	var (
		staged   []pending
		captured []string
		sums     = make(map[string]string)
	)
	stage := func(name string, content []byte, mode os.FileMode) {
		staged = append(staged, pending{name: name, content: content, mode: mode})
		sums[name] = sha256Hex(content)
		captured = append(captured, name)
	}

	// Both stack files are optional: an unreadable one is simply left out of the unit.
	for _, name := range []string{"docker-compose.yml", ".felhom.yml"} {
		if content, err := os.ReadFile(filepath.Join(info.StackDir, name)); err == nil {
			stage(name, content, 0644)
		}
	}
	stage("app.yaml", buildStrippedAppYaml(info), 0600)

	sqlDumps := listFileNames(AppDBDumpPath(root, stackName), ".sql")
	tarDumps := listFileNames(AppVolumeDumpPath(root, stackName), ".tar")
	ver := m.versionLocked()
	manifestFile := RecoveryUnitManifestPath(root, stackName)

	// Nothing changed since the last capture → leave the drive alone.
	if prev := readManifest(manifestFile); prev != nil {
		upToDate := prev.ControllerVer == ver &&
			stringMapEqual(prev.Checksums, sums) &&
			stringSliceEqual(prev.DBDumps, sqlDumps) &&
			stringSliceEqual(prev.VolumeDumps, tarDumps)
		if upToDate {
			return nil
		}
	}

	dir := RecoveryUnitComposePath(root, stackName)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return fmt.Errorf("creating recovery-unit compose dir: %w", err)
	}
	for i := range staged {
		p := staged[i]
		if err := atomicWrite(filepath.Join(dir, p.name), p.content, p.mode); err != nil {
			return fmt.Errorf("capturing %s: %w", p.name, err)
		}
	}

	// The manifest goes last, after every file it describes has been written.
	unit := RecoveryManifest{
		SchemaVersion:  1,
		AppName:        stackName,
		DisplayName:    info.DisplayName,
		ControllerVer:  ver,
		CreatedAt:      time.Now().UTC().Format(time.RFC3339),
		Drive:          drive,
		NamespaceRoot:  root,
		ImagePins:      info.ImagePins,
		SecretEnvVars:  info.SecretEnvVars,
		DataKeyEnvVars: info.DataKeyEnvVars,
		SecretSource:   "guest app.yaml (live rootfs) or PBS whole-guest snapshot — never stored in this unit",
		ConfigFiles:    captured,
		DBDumps:        sqlDumps,
		VolumeDumps:    tarDumps,
		Checksums:      sums,
	}
	if err := writeManifest(manifestFile, &unit); err != nil {
		return fmt.Errorf("writing manifest: %w", err)
	}

	m.logger.Printf("[INFO] [backup] Recovery unit captured for %s → %s (images=%d, secrets-referenced=%d, data_keys=%d)",
		stackName, RecoveryUnitPath(root, stackName), len(info.ImagePins), len(info.SecretEnvVars), len(info.DataKeyEnvVars))
	return nil
}
// captureAllRecoveryUnits runs CaptureRecoveryUnit for every deployed stack reported by the
// stack provider. It is best-effort: a stack whose capture fails is logged at WARN level and
// the loop moves on to the next one. Without a stack provider it does nothing.
func (m *Manager) captureAllRecoveryUnits() {
	if m.stackProvider == nil {
		return
	}
	for _, st := range m.stackProvider.ListDeployedStacks() {
		name := st.Name
		drive := m.GetAppDrivePath(name)
		if m.settings != nil {
			// A disconnected or decommissioned drive is not writable: keep whatever unit
			// is already there and skip this stack.
			if m.settings.IsDisconnected(drive) || m.settings.IsDecommissioned(drive) {
				continue
			}
		}
		err := m.CaptureRecoveryUnit(name)
		if err == nil {
			continue
		}
		m.logger.Printf("[WARN] [backup] Recovery unit capture failed for %s: %v", name, err)
	}
}
// versionLocked returns the controller version recorded by SetVersion, reading it while
// holding m.mu. Despite the "Locked" suffix it acquires the mutex itself.
func (m *Manager) versionLocked() string {
	m.mu.Lock()
	current := m.version
	m.mu.Unlock()
	return current
}
// strippedAppYaml defines the YAML shape of the secret-free app.yaml that is captured into a
// recovery unit's compose/ directory.
type strippedAppYaml struct {
	// Deployed is emitted under the "deployed" key; buildStrippedAppYaml always sets it to true.
	Deployed bool `yaml:"deployed"`

	// Env is emitted under the "env" key and carries the non-secret environment variables.
	Env map[string]string `yaml:"env"`
}
// buildStrippedAppYaml returns the bytes of the secret-free app.yaml for info: a fixed comment
// header, an optional comment line naming the stripped secret variables, and the YAML body
// holding `deployed: true` plus the non-secret environment only.
//
// The output has to be deterministic because CaptureRecoveryUnit checksums it to decide
// whether the unit needs rewriting: yaml.v3 sorts map keys and the secret-name list arrives in
// stable metadata order, so identical input yields identical bytes.
//
// If marshalling fails, a minimal body with an empty env map is used instead.
func buildStrippedAppYaml(info RecoveryInfo) []byte {
	doc := strippedAppYaml{Deployed: true, Env: info.NonSecretEnv}
	rendered, err := yaml.Marshal(doc)
	if err != nil {
		rendered = []byte("deployed: true\nenv: {}\n")
	}

	var out strings.Builder
	out.WriteString("# Captured by felhom-controller recovery unit — SECRET-FREE.\n")
	out.WriteString("# Secret/data-key values are intentionally omitted; recover them at restore from the\n")
	out.WriteString("# guest's own app.yaml (live rootfs, or the PBS whole-guest snapshot). Stripped names:\n")
	if len(info.SecretEnvVars) > 0 {
		// Names only — never the values.
		out.WriteString("# ")
		out.WriteString(strings.Join(info.SecretEnvVars, ", "))
		out.WriteString("\n")
	}
	out.Write(rendered)
	return []byte(out.String())
}
// writeManifest serialises manifest as indented JSON terminated by a newline and stores it at
// dst through atomicWrite with mode 0644. Marshalling and write errors are returned unchanged.
func writeManifest(dst string, manifest *RecoveryManifest) error {
	encoded, err := json.MarshalIndent(manifest, "", " ")
	if err != nil {
		return err
	}
	encoded = append(encoded, '\n')
	return atomicWrite(dst, encoded, 0644)
}
// readManifest loads the recovery-unit manifest stored at path. It returns nil — never an
// error — when the file cannot be read or does not contain valid manifest JSON, so callers can
// treat "missing" and "corrupt" alike.
func readManifest(path string) *RecoveryManifest {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil
	}
	parsed := new(RecoveryManifest)
	if err := json.Unmarshal(raw, parsed); err != nil {
		return nil
	}
	return parsed
}
// sha256Hex returns the SHA-256 digest of data as a lowercase hexadecimal string.
func sha256Hex(data []byte) string {
	h := sha256.New()
	h.Write(data) // hash.Hash.Write never returns an error
	return hex.EncodeToString(h.Sum(nil))
}
// stringMapEqual reports whether a and b contain exactly the same key/value pairs. A nil map
// and an empty map are considered equal.
func stringMapEqual(a, b map[string]string) bool {
	if len(a) != len(b) {
		return false
	}
	for k, v := range a {
		// The comma-ok form matters: a key that is missing from b must not be mistaken for
		// a key that is present with the empty string as its value.
		other, ok := b[k]
		if !ok || other != v {
			return false
		}
	}
	return true
}
// stringSliceEqual reports whether a and b have the same length and the same element at every
// index. A nil slice and an empty slice are considered equal.
func stringSliceEqual(a, b []string) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for i := 0; i < n; i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
// listFileNames returns, in sorted order, the names of the non-directory entries of dir whose
// name ends in suffix. The result is nil when dir cannot be read or nothing matches.
func listFileNames(dir, suffix string) []string {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil
	}
	var matched []string
	for i := range entries {
		entry := entries[i]
		if entry.IsDir() {
			continue
		}
		if name := entry.Name(); strings.HasSuffix(name, suffix) {
			matched = append(matched, name)
		}
	}
	sort.Strings(matched)
	return matched
}
// atomicWrite replaces the file at path with data so that a reader never sees a partially
// written file under that name: the bytes go to "<path>.tmp" first, are flushed to stable
// storage, and only then is the temp file renamed over path.
//
// perm is the mode passed to os.OpenFile when the temp file is created (it is not re-applied
// if a stale temp file already exists). On any failure the temp file is removed and the first
// error encountered is returned; the file at path is left as it was.
func atomicWrite(path string, data []byte, perm os.FileMode) error {
	tmp := path + ".tmp"
	f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
	if err != nil {
		return err
	}

	// Callers in this file pass config/manifest-sized payloads, so the one-off copy made by
	// string(data) is negligible.
	_, err = io.Copy(f, strings.NewReader(string(data)))
	if err == nil {
		// Without this, the rename can reach the disk before the data does; a power loss or
		// an unplugged drive would then leave an empty or truncated file under the final name.
		err = f.Sync()
	}
	// Always close, but keep the earliest error.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err == nil {
		err = os.Rename(tmp, path)
	}
	if err != nil {
		os.Remove(tmp) // best-effort cleanup; the original error is the one worth reporting
		return err
	}
	return nil
}