feat: backup safety — stop-before-dump, streaming restore, health check, per-app restic, infra configs (v0.34.0)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -366,12 +366,6 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Infrastructure paths included in every drive's primary repo
|
||||
infraPaths := []string{
|
||||
m.cfg.Paths.StacksDir,
|
||||
"/opt/docker/felhom-controller/controller.yaml",
|
||||
}
|
||||
|
||||
var lastResult *SnapshotResult
|
||||
var anyErr error
|
||||
driveCount := 0
|
||||
@@ -380,7 +374,7 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] runBackupInternal: processing drive %s (%d stacks)", drivePath, len(stacks))
|
||||
}
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks)
|
||||
if err != nil {
|
||||
anyErr = err
|
||||
continue
|
||||
@@ -452,7 +446,7 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
|
||||
// backupDrive runs restic backup for a single drive. Returns nil result if skipped.
|
||||
// Caller must hold the running flag.
|
||||
func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary, infraPaths []string) (*SnapshotResult, error) {
|
||||
func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary) (*SnapshotResult, error) {
|
||||
// Skip disconnected or decommissioned drives
|
||||
if m.settings != nil && m.settings.IsDisconnected(drivePath) {
|
||||
m.logger.Printf("[WARN] [backup] Skipping backup for drive %s — disconnected", drivePath)
|
||||
@@ -473,7 +467,11 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St
|
||||
|
||||
// Build paths for this drive
|
||||
var paths []string
|
||||
paths = append(paths, infraPaths...)
|
||||
|
||||
// Include controller.yaml only on the system drive
|
||||
if drivePath == m.systemDataPath {
|
||||
paths = append(paths, "/opt/docker/felhom-controller/controller.yaml")
|
||||
}
|
||||
|
||||
for _, stack := range stacks {
|
||||
// App data (appdata/<stack>/)
|
||||
@@ -499,6 +497,11 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St
|
||||
if _, err := os.Stat(volDumpDir); err == nil {
|
||||
paths = append(paths, volDumpDir)
|
||||
}
|
||||
// Stack config dir (docker-compose.yml, app.yaml, .felhom.yml)
|
||||
stackDir := filepath.Join(m.cfg.Paths.StacksDir, stack.Name)
|
||||
if _, err := os.Stat(stackDir); err == nil {
|
||||
paths = append(paths, stackDir)
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate paths
|
||||
@@ -558,12 +561,7 @@ func (m *Manager) TryRunDriveBackup(ctx context.Context, drivePath string) error
|
||||
return nil
|
||||
}
|
||||
|
||||
infraPaths := []string{
|
||||
m.cfg.Paths.StacksDir,
|
||||
"/opt/docker/felhom-controller/controller.yaml",
|
||||
}
|
||||
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -702,7 +700,39 @@ func (m *Manager) DumpAppVolumes(stackName string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// DumpAppVolumesSafe stops the stack before dumping volumes and restarts after.
|
||||
// Prevents inconsistent tars of live database volumes (e.g. PostgreSQL).
|
||||
// Protected stacks that reject StopStack will return an error — callers handle as warning.
|
||||
func (m *Manager) DumpAppVolumesSafe(stackName string) error {
|
||||
if m.stackProvider == nil {
|
||||
return fmt.Errorf("no stack provider")
|
||||
}
|
||||
|
||||
m.logger.Printf("[INFO] [backup] Stopping %s for safe volume dump", stackName)
|
||||
if err := m.stackProvider.StopStack(stackName); err != nil {
|
||||
return fmt.Errorf("could not stop %s for volume dump: %w", stackName, err)
|
||||
}
|
||||
|
||||
dumpErr := m.DumpAppVolumes(stackName)
|
||||
|
||||
m.logger.Printf("[INFO] [backup] Restarting %s after volume dump", stackName)
|
||||
startErr := m.stackProvider.StartStack(stackName)
|
||||
if startErr != nil {
|
||||
m.logger.Printf("[ERROR] [backup] Failed to restart %s after volume dump: %v", stackName, startErr)
|
||||
}
|
||||
|
||||
// Surface both errors — callers must know if the app is left stopped
|
||||
if dumpErr != nil && startErr != nil {
|
||||
return fmt.Errorf("volume dump failed for %s: %v; restart also failed: %v", stackName, dumpErr, startErr)
|
||||
}
|
||||
if startErr != nil {
|
||||
return fmt.Errorf("volume dump OK but restart failed for %s: %w", stackName, startErr)
|
||||
}
|
||||
return dumpErr
|
||||
}
|
||||
|
||||
// runVolumeDumpsInternal dumps Docker named volumes for all deployed apps.
|
||||
// Stops each stack before dumping for data consistency, restarts after.
|
||||
func (m *Manager) runVolumeDumpsInternal(ctx context.Context) error {
|
||||
if m.stackProvider == nil {
|
||||
return nil
|
||||
@@ -717,7 +747,7 @@ func (m *Manager) runVolumeDumpsInternal(ctx context.Context) error {
|
||||
if ctx.Err() != nil {
|
||||
return ctx.Err()
|
||||
}
|
||||
if err := m.DumpAppVolumes(stack.Name); err != nil {
|
||||
if err := m.DumpAppVolumesSafe(stack.Name); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] Volume dump error for %s: %v", stack.Name, err)
|
||||
failed++
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user