feat: backup safety — stop-before-dump, streaming restore, health check, per-app restic, infra configs (v0.34.0)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 08:56:48 +01:00
parent 783830a9d4
commit fb11c3b75a
8 changed files with 147 additions and 33 deletions
+39 -2
View File
@@ -138,6 +138,11 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
m.logger.Printf("[WARN] RESTORE could not restart %s after restore: %v", stackName, err)
}
// Verify app started successfully
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
m.logger.Printf("[WARN] [backup] Restore completed but app health check failed: %v", err)
}
hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
restoreType := "config+DB"
if hasHDD || hasVolumes {
@@ -267,8 +272,8 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error {
if !e.IsDir() {
src := filepath.Join(dbSrc, e.Name())
dst := filepath.Join(dbDst, e.Name())
if data, err := os.ReadFile(src); err == nil {
os.WriteFile(dst, data, 0644)
if err := copyFile(src, dst); err != nil {
m.logger.Printf("[WARN] [backup] Failed to copy DB dump %s: %v", e.Name(), err)
}
}
}
@@ -291,6 +296,11 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error {
m.logger.Printf("[WARN] RESTORE could not restart %s after Tier 2 restore: %v", stackName, err)
}
// Verify app started successfully
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
m.logger.Printf("[WARN] [backup] Tier 2 restore completed but app health check failed: %v", err)
}
hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
restoreType := "config+DB"
if hasHDD || hasVolumes {
@@ -403,3 +413,30 @@ func (m *Manager) restoreDockerVolumes(stackName, drivePath string) error {
}
return nil
}
// waitForHealthy blocks until stackName reports as running or timeout elapses.
// It is called after a restore to confirm that the app actually came back up.
//
// Every poll goes through stackProvider.RefreshAndIsRunning, which is expected
// to force a fresh docker ps instead of answering from stale state. That makes
// each poll comparatively expensive, which is acceptable for a rare operation
// such as a restore.
//
// It returns nil as soon as a poll reports the stack running. It returns an
// error immediately when no stack provider is configured, and an error naming
// the stack and the timeout when no poll succeeded in time. The stack is always
// polled at least once, and the time spent sleeping never exceeds timeout.
func (m *Manager) waitForHealthy(stackName string, timeout time.Duration) error {
	// The provider cannot change while we wait, so fail fast instead of
	// sleeping through the settling delay first.
	if m.stackProvider == nil {
		return fmt.Errorf("no stack provider")
	}

	const (
		settle   = 3 * time.Second // initial settling time before the first poll
		interval = 5 * time.Second // pause between subsequent polls
	)
	deadline := time.Now().Add(timeout)

	wait := settle
	for {
		// Never sleep past the deadline; time.Sleep returns immediately for
		// zero or negative durations, so a tiny timeout still gets one poll.
		if remaining := time.Until(deadline); wait > remaining {
			wait = remaining
		}
		time.Sleep(wait)

		if m.stackProvider.RefreshAndIsRunning(stackName) {
			if m.isDebug() {
				m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s is running", stackName)
			}
			return nil
		}
		if !time.Now().Before(deadline) {
			break
		}
		if m.isDebug() {
			m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s not yet running, waiting...", stackName)
		}
		wait = interval
	}
	return fmt.Errorf("stack %s did not reach running state within %s after restore", stackName, timeout)
}