feat: backup safety — stop-before-dump, streaming restore, health check, per-app restic, infra configs (v0.34.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 08:56:48 +01:00
parent 783830a9d4
commit fb11c3b75a
8 changed files with 147 additions and 33 deletions
+1
View File
@@ -22,6 +22,7 @@ type StackDataProvider interface {
GetDockerVolumes(name string) []string // full Docker volume names (project-prefixed)
StopStack(name string) error
StartStack(name string) error
RefreshAndIsRunning(name string) bool
}
// StackSummary holds minimal stack info needed for app data discovery.
+46 -16
View File
@@ -366,12 +366,6 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
return nil
}
// Infrastructure paths included in every drive's primary repo
infraPaths := []string{
m.cfg.Paths.StacksDir,
"/opt/docker/felhom-controller/controller.yaml",
}
var lastResult *SnapshotResult
var anyErr error
driveCount := 0
@@ -380,7 +374,7 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
if m.isDebug() {
m.logger.Printf("[DEBUG] runBackupInternal: processing drive %s (%d stacks)", drivePath, len(stacks))
}
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
result, err := m.backupDrive(ctx, drivePath, stacks)
if err != nil {
anyErr = err
continue
@@ -452,7 +446,7 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
// backupDrive runs restic backup for a single drive. Returns nil result if skipped.
// Caller must hold the running flag.
func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary, infraPaths []string) (*SnapshotResult, error) {
func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary) (*SnapshotResult, error) {
// Skip disconnected or decommissioned drives
if m.settings != nil && m.settings.IsDisconnected(drivePath) {
m.logger.Printf("[WARN] [backup] Skipping backup for drive %s — disconnected", drivePath)
@@ -473,7 +467,11 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St
// Build paths for this drive
var paths []string
paths = append(paths, infraPaths...)
// Include controller.yaml only on the system drive
if drivePath == m.systemDataPath {
paths = append(paths, "/opt/docker/felhom-controller/controller.yaml")
}
for _, stack := range stacks {
// App data (appdata/<stack>/)
@@ -499,6 +497,11 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St
if _, err := os.Stat(volDumpDir); err == nil {
paths = append(paths, volDumpDir)
}
// Stack config dir (docker-compose.yml, app.yaml, .felhom.yml)
stackDir := filepath.Join(m.cfg.Paths.StacksDir, stack.Name)
if _, err := os.Stat(stackDir); err == nil {
paths = append(paths, stackDir)
}
}
// Deduplicate paths
@@ -558,12 +561,7 @@ func (m *Manager) TryRunDriveBackup(ctx context.Context, drivePath string) error
return nil
}
infraPaths := []string{
m.cfg.Paths.StacksDir,
"/opt/docker/felhom-controller/controller.yaml",
}
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
result, err := m.backupDrive(ctx, drivePath, stacks)
if err != nil {
return err
}
@@ -702,7 +700,39 @@ func (m *Manager) DumpAppVolumes(stackName string) error {
return nil
}
// DumpAppVolumesSafe takes the stack offline, dumps its Docker volumes, and
// then brings the stack back up.
//
// Dumping while the stack is stopped avoids archiving a volume that is being
// written to (e.g. a live PostgreSQL data directory).
//
// Returns:
//   - an error immediately if there is no stack provider or StopStack fails
//     (for example a protected stack refusing to stop); no dump is attempted.
//   - the dump error, if only the dump failed.
//   - a restart error (wrapping the StartStack error) if only the restart failed.
//   - a combined error if both the dump and the restart failed.
//
// NOTE(review): StartStack is called unconditionally after the dump, so a
// stack that was already stopped before this call would be started — confirm
// that callers only pass stacks that are expected to be running.
func (m *Manager) DumpAppVolumesSafe(stackName string) error {
	if m.stackProvider == nil {
		return fmt.Errorf("no stack provider")
	}

	m.logger.Printf("[INFO] [backup] Stopping %s for safe volume dump", stackName)
	if stopErr := m.stackProvider.StopStack(stackName); stopErr != nil {
		return fmt.Errorf("could not stop %s for volume dump: %w", stackName, stopErr)
	}

	// The dump result is held back: the restart must be attempted regardless.
	volErr := m.DumpAppVolumes(stackName)

	m.logger.Printf("[INFO] [backup] Restarting %s after volume dump", stackName)
	restartErr := m.stackProvider.StartStack(stackName)
	if restartErr == nil {
		// Restart went fine; the outcome is whatever the dump produced.
		return volErr
	}

	// From here on the app is left stopped, which callers must be told about.
	m.logger.Printf("[ERROR] [backup] Failed to restart %s after volume dump: %v", stackName, restartErr)
	if volErr == nil {
		return fmt.Errorf("volume dump OK but restart failed for %s: %w", stackName, restartErr)
	}
	return fmt.Errorf("volume dump failed for %s: %v; restart also failed: %v", stackName, volErr, restartErr)
}
// runVolumeDumpsInternal dumps Docker named volumes for all deployed apps.
// Stops each stack before dumping for data consistency, restarts after.
func (m *Manager) runVolumeDumpsInternal(ctx context.Context) error {
if m.stackProvider == nil {
return nil
@@ -717,7 +747,7 @@ func (m *Manager) runVolumeDumpsInternal(ctx context.Context) error {
if ctx.Err() != nil {
return ctx.Err()
}
if err := m.DumpAppVolumes(stack.Name); err != nil {
if err := m.DumpAppVolumesSafe(stack.Name); err != nil {
m.logger.Printf("[WARN] [backup] Volume dump error for %s: %v", stack.Name, err)
failed++
} else {
+2 -1
View File
@@ -25,6 +25,7 @@ type DBDumper interface {
// VolumeDumper can dump Docker named volumes for a specific stack.
// Both methods take the stack name and return an error when the dump
// (or, for the safe variant, the stop/restart around it) fails.
type VolumeDumper interface {
// DumpAppVolumes dumps the named volumes of stackName.
// NOTE(review): presumably this does not stop the stack itself — the safe
// variant below exists to add the stop/restart; confirm against the implementation.
DumpAppVolumes(stackName string) error
// DumpAppVolumesSafe is the variant used by RunAppBackup for the pre-backup
// dump; a failure there is logged as a warning and the backup proceeds.
DumpAppVolumesSafe(stackName string) error // stops stack before dump, restarts after
}
// CrossDriveRunner handles per-app backup to secondary storage.
@@ -144,7 +145,7 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
if r.debug {
r.logger.Printf("[DEBUG] RunAppBackup: triggering pre-backup volume dump for %s", stackName)
}
if err := r.volDumper.DumpAppVolumes(stackName); err != nil {
if err := r.volDumper.DumpAppVolumesSafe(stackName); err != nil {
r.logger.Printf("[WARN] [backup] Pre-backup volume dump failed for %s: %v — proceeding with backup", stackName, err)
}
}
+39 -2
View File
@@ -138,6 +138,11 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
m.logger.Printf("[WARN] RESTORE could not restart %s after restore: %v", stackName, err)
}
// Verify app started successfully
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
m.logger.Printf("[WARN] [backup] Restore completed but app health check failed: %v", err)
}
hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
restoreType := "config+DB"
if hasHDD || hasVolumes {
@@ -267,8 +272,8 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error {
if !e.IsDir() {
src := filepath.Join(dbSrc, e.Name())
dst := filepath.Join(dbDst, e.Name())
if data, err := os.ReadFile(src); err == nil {
os.WriteFile(dst, data, 0644)
if err := copyFile(src, dst); err != nil {
m.logger.Printf("[WARN] [backup] Failed to copy DB dump %s: %v", e.Name(), err)
}
}
}
@@ -291,6 +296,11 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error {
m.logger.Printf("[WARN] RESTORE could not restart %s after Tier 2 restore: %v", stackName, err)
}
// Verify app started successfully
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
m.logger.Printf("[WARN] [backup] Tier 2 restore completed but app health check failed: %v", err)
}
hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
restoreType := "config+DB"
if hasHDD || hasVolumes {
@@ -403,3 +413,30 @@ func (m *Manager) restoreDockerVolumes(stackName, drivePath string) error {
}
return nil
}
// waitForHealthy waits for a stack to reach running state after restore.
//
// After a short settling delay it polls the stack provider with
// RefreshAndIsRunning, which forces a fresh docker ps on each poll to avoid
// stale state. That overhead is acceptable for a rare operation (restore).
//
// Parameters:
//   - stackName: the stack to watch.
//   - timeout: overall time budget; the settle delay and every poll sleep are
//     clamped to it, so the function does not sleep past the deadline.
//
// Returns nil as soon as the stack reports running. Returns an error if no
// stack provider is configured, or if the stack is still not running once the
// budget is used up. The stack is always polled at least once, even when
// timeout is shorter than the settle delay.
func (m *Manager) waitForHealthy(stackName string, timeout time.Duration) error {
	const (
		settleDelay  = 3 * time.Second // initial settling time before the first poll
		pollInterval = 5 * time.Second // pause between polls
	)

	// Without a provider there is nothing to poll, so fail before sleeping
	// rather than after the settle delay.
	if m.stackProvider == nil {
		return fmt.Errorf("no stack provider")
	}

	deadline := time.Now().Add(timeout)

	// Never let the settle delay eat more than the whole budget.
	settle := settleDelay
	if timeout < settle {
		settle = timeout
	}
	time.Sleep(settle)

	for {
		if m.stackProvider.RefreshAndIsRunning(stackName) {
			if m.isDebug() {
				m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s is running", stackName)
			}
			return nil
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			return fmt.Errorf("stack %s did not reach running state within %s after restore", stackName, timeout)
		}

		if m.isDebug() {
			m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s not yet running, waiting...", stackName)
		}

		// Sleep only as long as the budget allows, so the final poll lands
		// on the deadline instead of up to one interval past it.
		pause := pollInterval
		if remaining < pause {
			pause = remaining
		}
		time.Sleep(pause)
	}
}
+28 -7
View File
@@ -5,6 +5,7 @@ import (
"fmt"
"log"
"os"
"path/filepath"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
@@ -30,11 +31,18 @@ type InfraBackup struct {
}
// InfraStack identifies a deployed app for disaster recovery.
// Note: AppYamlB64 contains encrypted secrets (ENC:... values).
// The encryption key is also in this backup (EncryptionKeyB64).
// This is intentional — the infra backup must be self-contained for DR.
// Physical security of the backup media protects both.
type InfraStack struct {
Name string `json:"name"`
DisplayName string `json:"display_name"`
HDDPath string `json:"hdd_path,omitempty"`
NeedsHDD bool `json:"needs_hdd"`
Name string `json:"name"`
DisplayName string `json:"display_name"`
HDDPath string `json:"hdd_path,omitempty"`
NeedsHDD bool `json:"needs_hdd"`
DockerComposeB64 string `json:"docker_compose_b64,omitempty"`
AppYamlB64 string `json:"app_yaml_b64,omitempty"`
FelhomYamlB64 string `json:"felhom_yaml_b64,omitempty"`
}
// BuildInfraBackup collects all infrastructure state for Hub backup.
@@ -89,15 +97,28 @@ func BuildInfraBackup(
// Collect disk layout from fstab + blkid
ib.DiskLayout = collectDiskLayout(systemDataPath)
// Collect deployed stacks
// Collect deployed stacks (including actual config files for DR)
deployed := stackProvider.ListDeployedStacks()
for _, s := range deployed {
ib.DeployedStacks = append(ib.DeployedStacks, InfraStack{
is := InfraStack{
Name: s.Name,
DisplayName: s.DisplayName,
HDDPath: stackProvider.GetStackHDDPath(s.Name),
NeedsHDD: s.NeedsHDD,
})
}
if composePath, ok := stackProvider.GetStackComposePath(s.Name); ok {
stackDir := filepath.Dir(composePath)
if data, err := os.ReadFile(filepath.Join(stackDir, "docker-compose.yml")); err == nil {
is.DockerComposeB64 = base64.StdEncoding.EncodeToString(data)
}
if data, err := os.ReadFile(filepath.Join(stackDir, "app.yaml")); err == nil {
is.AppYamlB64 = base64.StdEncoding.EncodeToString(data)
}
if data, err := os.ReadFile(filepath.Join(stackDir, ".felhom.yml")); err == nil {
is.FelhomYamlB64 = base64.StdEncoding.EncodeToString(data)
}
}
ib.DeployedStacks = append(ib.DeployedStacks, is)
}
if ib.DeployedStacks == nil {
ib.DeployedStacks = []InfraStack{}