feat: backup safety — stop-before-dump, streaming restore, health check, per-app restic, infra configs (v0.34.0)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -138,6 +138,11 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
m.logger.Printf("[WARN] RESTORE could not restart %s after restore: %v", stackName, err)
|
||||
}
|
||||
|
||||
// Verify app started successfully
|
||||
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] Restore completed but app health check failed: %v", err)
|
||||
}
|
||||
|
||||
hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
|
||||
restoreType := "config+DB"
|
||||
if hasHDD || hasVolumes {
|
||||
@@ -267,8 +272,8 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error {
|
||||
if !e.IsDir() {
|
||||
src := filepath.Join(dbSrc, e.Name())
|
||||
dst := filepath.Join(dbDst, e.Name())
|
||||
if data, err := os.ReadFile(src); err == nil {
|
||||
os.WriteFile(dst, data, 0644)
|
||||
if err := copyFile(src, dst); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] Failed to copy DB dump %s: %v", e.Name(), err)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -291,6 +296,11 @@ func (m *Manager) RestoreAppFromTier2(stackName string) error {
|
||||
m.logger.Printf("[WARN] RESTORE could not restart %s after Tier 2 restore: %v", stackName, err)
|
||||
}
|
||||
|
||||
// Verify app started successfully
|
||||
if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
|
||||
m.logger.Printf("[WARN] [backup] Tier 2 restore completed but app health check failed: %v", err)
|
||||
}
|
||||
|
||||
hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
|
||||
restoreType := "config+DB"
|
||||
if hasHDD || hasVolumes {
|
||||
@@ -403,3 +413,30 @@ func (m *Manager) restoreDockerVolumes(stackName, drivePath string) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitForHealthy waits for a stack to reach running state after restore.
|
||||
// Forces a docker ps refresh on each poll to avoid stale state.
|
||||
// Acceptable overhead for a rare operation (restore).
|
||||
func (m *Manager) waitForHealthy(stackName string, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
interval := 5 * time.Second
|
||||
|
||||
time.Sleep(3 * time.Second) // initial settling time
|
||||
|
||||
for time.Now().Before(deadline) {
|
||||
if m.stackProvider == nil {
|
||||
return fmt.Errorf("no stack provider")
|
||||
}
|
||||
if m.stackProvider.RefreshAndIsRunning(stackName) {
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s is running", stackName)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s not yet running, waiting...", stackName)
|
||||
}
|
||||
time.Sleep(interval)
|
||||
}
|
||||
return fmt.Errorf("stack %s did not reach running state within %s after restore", stackName, timeout)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user