package backup

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strings"
	"time"
)

// snapshotIDRe validates restic snapshot IDs: 8-64 lowercase hex characters.
var snapshotIDRe = regexp.MustCompile(`^[0-9a-f]{8,64}$`)

// RestoreApp restores an app from a restic snapshot.
// All apps get config + DB dump restored. Apps with HDD data also get user data restored.
//
// The sequence is: stop the stack, restore the selected paths with restic,
// repopulate Docker volumes from the restored tar dumps, start the stack and
// wait for it to report running.
//
// An error is returned when no stack provider is configured, when snapshotID
// does not match snapshotIDRe, when another backup/restore is already running,
// or when the restic restore itself fails (the stack is started again before
// returning). Failures to stop or start the stack, to restore volumes, or to
// pass the post-restore health check are logged as warnings only.
func (m *Manager) RestoreApp(stackName, snapshotID string) error {
	if m.stackProvider == nil {
		return fmt.Errorf("stack provider not configured")
	}

	// Validate snapshot ID format
	if !snapshotIDRe.MatchString(snapshotID) {
		return fmt.Errorf("invalid snapshot ID: must be 8-64 lowercase hex characters")
	}

	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: stack=%s, snapshotID=%s", stackName, snapshotID)
	}

	// Prevent concurrent operations
	m.mu.Lock()
	if m.running {
		m.mu.Unlock()
		return fmt.Errorf("backup or restore already in progress")
	}
	m.running = true
	m.mu.Unlock()
	defer func() {
		m.mu.Lock()
		m.running = false
		m.mu.Unlock()
	}()

	// Determine what to restore
	hddMounts := m.stackProvider.GetStackHDDMounts(stackName)
	hasHDD := len(hddMounts) > 0
	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: %s has %d HDD mount(s), hasHDD=%v", stackName, len(hddMounts), hasHDD)
	}

	// Build list of paths to restore from the snapshot
	var restorePaths []string

	// Always restore the stack's config dir (compose + app.yaml + .felhom.yml)
	composePath, ok := m.stackProvider.GetStackComposePath(stackName)
	if ok {
		stackDir := filepath.Dir(composePath)
		restorePaths = append(restorePaths, stackDir)
		if m.isDebug() {
			m.logger.Printf("[DEBUG] RestoreApp: will restore config dir: %s", stackDir)
		}
	}

	// Restore DB dump files for this stack (per-drive path)
	drivePath := m.GetAppDrivePath(stackName)
	dumpDir := AppDBDumpPath(drivePath, stackName)
	restorePaths = append(restorePaths, dumpDir)
	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: will restore DB dump dir: %s", dumpDir)
	}

	// Restore HDD data (always included for apps that have it — backup is mandatory)
	if hasHDD {
		restorePaths = append(restorePaths, hddMounts...)
		if m.isDebug() {
			m.logger.Printf("[DEBUG] RestoreApp: will restore HDD data: %v", hddMounts)
		}
	}

	// Restore Docker volume dumps (if present in snapshot)
	volDumpDir := AppVolumeDumpPath(drivePath, stackName)
	restorePaths = append(restorePaths, volDumpDir)

	// Defensive only: the DB and volume dump dirs are appended unconditionally
	// above, so this cannot trigger with the current path selection.
	if len(restorePaths) == 0 {
		return fmt.Errorf("no restorable paths found for %s", stackName)
	}

	// Use the app's primary restic repo
	repoPath := PrimaryResticRepoPath(drivePath)

	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: using repo=%s, %d restore path(s)", repoPath, len(restorePaths))
	}

	m.logger.Printf("[INFO] [backup] Starting restore for %s (snapshot=%s, repo=%s, paths=%v, hasHDD=%v)",
		stackName, snapshotID, repoPath, restorePaths, hasHDD)

	// Stop the app before restore
	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: step 1/5 — stopping app %s", stackName)
	}
	if err := m.stackProvider.StopStack(stackName); err != nil {
		m.logger.Printf("[WARN] RESTORE could not stop %s: %v (proceeding anyway)", stackName, err)
	}

	// Execute restore via restic
	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: step 2/5 — restoring data from snapshot %s", snapshotID)
	}
	if err := m.restic.RestoreAppData(repoPath, snapshotID, restorePaths); err != nil {
		m.logger.Printf("[ERROR] RESTORE failed for %s: %v", stackName, err)
		// Bring the app back up so a failed restore does not leave it stopped.
		if m.isDebug() {
			m.logger.Printf("[DEBUG] RestoreApp: step 2/5 failed — restarting app %s after failure", stackName)
		}
		if startErr := m.stackProvider.StartStack(stackName); startErr != nil {
			m.logger.Printf("[WARN] RESTORE could not restart %s after failure: %v", stackName, startErr)
		}
		return err
	}

	// Populate Docker volumes from restored tars
	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: step 3/5 — restoring Docker volumes for %s", stackName)
	}
	if err := m.restoreDockerVolumes(stackName, drivePath); err != nil {
		m.logger.Printf("[WARN] RESTORE volume restore failed for %s: %v (continuing)", stackName, err)
	}

	// Restart the app
	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: step 4/5 — restarting app %s after successful restore", stackName)
	}
	if err := m.stackProvider.StartStack(stackName); err != nil {
		m.logger.Printf("[WARN] RESTORE could not restart %s after restore: %v", stackName, err)
	}

	// Verify app started successfully
	if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
		m.logger.Printf("[WARN] [backup] Restore completed but app health check failed: %v", err)
	}

	hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
	restoreType := "config+DB"
	if hasHDD || hasVolumes {
		restoreType = "full (config+DB+userdata)"
	}

	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreApp: step 5/5 — restore completed, type=%s", restoreType)
	}

	m.logger.Printf("[INFO] RESTORE completed: stack=%s, snapshot=%s, type=%s", stackName, snapshotID, restoreType)
	return nil
}

// RestoreAppFromTier2 restores an app from its cross-drive rsync backup mirror.
//
// The mirror directory is expected to hold the app's HDD data plus the
// optional subdirectories _config/, _db/ and _volumes/. The stack is stopped,
// each part that exists is copied back, and the stack is started and polled
// until it reports running.
//
// An error is returned when the provider or settings are missing, cross-drive
// backup is not enabled for the stack, the mirror directory cannot be
// accessed, another backup/restore is running, or an rsync of config or HDD
// data fails (the stack is started again before returning). DB dump, volume,
// restart and health-check failures are logged as warnings only.
func (m *Manager) RestoreAppFromTier2(stackName string) error {
	if m.stackProvider == nil {
		return fmt.Errorf("stack provider not configured")
	}
	if m.settings == nil {
		return fmt.Errorf("settings not available")
	}

	cdCfg := m.settings.GetCrossDriveConfig(stackName)
	if cdCfg == nil || !cdCfg.Enabled {
		return fmt.Errorf("cross-drive backup not configured for %s", stackName)
	}

	rsyncDir := AppSecondaryRsyncPath(cdCfg.DestinationPath, stackName)
	if _, err := os.Stat(rsyncDir); err != nil {
		if os.IsNotExist(err) {
			return fmt.Errorf("Tier 2 backup directory not found: %s", rsyncDir)
		}
		// Any other stat failure (e.g. permissions, I/O error) means the
		// mirror is unusable; bail out before the app is stopped.
		return fmt.Errorf("checking Tier 2 backup directory %s: %w", rsyncDir, err)
	}

	if m.isDebug() {
		m.logger.Printf("[DEBUG] RestoreAppFromTier2: stack=%s, rsyncDir=%s", stackName, rsyncDir)
	}

	// Prevent concurrent operations
	m.mu.Lock()
	if m.running {
		m.mu.Unlock()
		return fmt.Errorf("backup or restore already in progress")
	}
	m.running = true
	m.mu.Unlock()
	defer func() {
		m.mu.Lock()
		m.running = false
		m.mu.Unlock()
	}()

	hddMounts := m.stackProvider.GetStackHDDMounts(stackName)
	hasHDD := len(hddMounts) > 0
	drivePath := m.GetAppDrivePath(stackName)

	m.logger.Printf("[INFO] [backup] Starting Tier 2 restore for %s from %s", stackName, rsyncDir)

	// restartAfterFailure brings the app back up when a restore step fails and
	// logs a restart error instead of dropping it (same handling as RestoreApp).
	restartAfterFailure := func() {
		if startErr := m.stackProvider.StartStack(stackName); startErr != nil {
			m.logger.Printf("[WARN] RESTORE could not restart %s after failure: %v", stackName, startErr)
		}
	}

	// Step 1: Stop the app
	if err := m.stackProvider.StopStack(stackName); err != nil {
		m.logger.Printf("[WARN] RESTORE could not stop %s: %v (proceeding anyway)", stackName, err)
	}

	// Step 2: Restore config from _config/
	configSrc := filepath.Join(rsyncDir, "_config") + "/"
	if _, err := os.Stat(filepath.Join(rsyncDir, "_config")); err == nil {
		if composePath, ok := m.stackProvider.GetStackComposePath(stackName); ok {
			configDst := filepath.Dir(composePath) + "/"
			if m.isDebug() {
				m.logger.Printf("[DEBUG] RestoreAppFromTier2: rsync config %s → %s", configSrc, configDst)
			}
			cmd := exec.Command("rsync", "-a", "--delete", configSrc, configDst)
			if out, err := cmd.CombinedOutput(); err != nil {
				m.logger.Printf("[ERROR] [backup] Tier 2 config restore failed for %s: %v (%s)", stackName, err, strings.TrimSpace(string(out)))
				restartAfterFailure()
				return fmt.Errorf("config restore failed: %w", err)
			}
		}
	}

	// Step 3: Restore HDD data
	if hasHDD {
		// Check for data directory structure — single mount vs multi-mount
		if len(hddMounts) == 1 {
			// Single mount: data is directly in rsyncDir (excluding _* dirs)
			src := strings.TrimRight(rsyncDir, "/") + "/"
			dst := strings.TrimRight(hddMounts[0], "/") + "/"
			if m.isDebug() {
				m.logger.Printf("[DEBUG] RestoreAppFromTier2: rsync HDD data %s → %s", src, dst)
			}
			// NOTE(review): "_*" is unanchored, so rsync applies it at every
			// depth, not only to the top-level _config/_db/_volumes dirs.
			// Anchoring it ("/_*") looks more precise but must match whatever
			// pattern the backup side uses — confirm before changing.
			cmd := exec.Command("rsync", "-a", "--delete", "--exclude", "_*", src, dst)
			if out, err := cmd.CombinedOutput(); err != nil {
				m.logger.Printf("[ERROR] [backup] Tier 2 HDD data restore failed for %s: %v (%s)", stackName, err, strings.TrimSpace(string(out)))
				restartAfterFailure()
				return fmt.Errorf("HDD data restore failed: %w", err)
			}
		} else {
			// Multiple mounts: each has a subdirectory named by leaf
			for _, mount := range hddMounts {
				leaf := filepath.Base(mount)
				src := filepath.Join(rsyncDir, leaf) + "/"
				dst := strings.TrimRight(mount, "/") + "/"
				if _, err := os.Stat(filepath.Join(rsyncDir, leaf)); os.IsNotExist(err) {
					m.logger.Printf("[WARN] [backup] Tier 2 restore: no backup data for mount %s", mount)
					continue
				}
				if m.isDebug() {
					m.logger.Printf("[DEBUG] RestoreAppFromTier2: rsync HDD mount %s → %s", src, dst)
				}
				cmd := exec.Command("rsync", "-a", "--delete", src, dst)
				if out, err := cmd.CombinedOutput(); err != nil {
					m.logger.Printf("[ERROR] [backup] Tier 2 HDD restore failed for mount %s: %v (%s)", mount, err, strings.TrimSpace(string(out)))
					restartAfterFailure()
					return fmt.Errorf("HDD restore failed for %s: %w", mount, err)
				}
			}
		}
	}

	// Step 4: Restore DB dumps from _db/ (best-effort: problems are logged, not returned)
	dbSrc := filepath.Join(rsyncDir, "_db")
	if _, err := os.Stat(dbSrc); err == nil {
		dbDst := AppDBDumpPath(drivePath, stackName)
		if err := os.MkdirAll(dbDst, 0755); err != nil {
			m.logger.Printf("[WARN] [backup] Tier 2 restore: could not create DB dump dir %s: %v", dbDst, err)
		} else {
			// os.ReadDir returns whatever entries it managed to read together
			// with the error, so the copy loop still handles a partial listing.
			entries, readErr := os.ReadDir(dbSrc)
			if readErr != nil {
				m.logger.Printf("[WARN] [backup] Tier 2 restore: could not list DB dumps in %s: %v", dbSrc, readErr)
			}
			for _, e := range entries {
				if e.IsDir() {
					continue
				}
				src := filepath.Join(dbSrc, e.Name())
				dst := filepath.Join(dbDst, e.Name())
				if err := copyFile(src, dst); err != nil {
					m.logger.Printf("[WARN] [backup] Failed to copy DB dump %s: %v", e.Name(), err)
				}
			}
			if m.isDebug() {
				m.logger.Printf("[DEBUG] RestoreAppFromTier2: restored DB dumps from %s", dbSrc)
			}
		}
	}

	// Step 5: Restore Docker volumes from _volumes/
	volSrc := filepath.Join(rsyncDir, "_volumes")
	if _, err := os.Stat(volSrc); err == nil {
		if err := m.restoreDockerVolumesFromDir(stackName, volSrc); err != nil {
			m.logger.Printf("[WARN] [backup] Tier 2 volume restore failed for %s: %v (continuing)", stackName, err)
		}
	}

	// Step 6: Restart the app
	if err := m.stackProvider.StartStack(stackName); err != nil {
		m.logger.Printf("[WARN] RESTORE could not restart %s after Tier 2 restore: %v", stackName, err)
	}

	// Verify app started successfully
	if err := m.waitForHealthy(stackName, 90*time.Second); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 restore completed but app health check failed: %v", err)
	}

	hasVolumes := len(m.stackProvider.GetDockerVolumes(stackName)) > 0
	restoreType := "config+DB"
	if hasHDD || hasVolumes {
		restoreType = "full (config+DB+userdata)"
	}

	m.logger.Printf("[INFO] RESTORE (Tier 2) completed: stack=%s, type=%s", stackName, restoreType)
	return nil
}

// restoreDockerVolumesFromDir populates Docker volumes from tar files in an arbitrary directory.
// Used by Tier 2 restore where volume tars are in the rsync mirror's _volumes/ dir.
//
// Each "<volume>.tar" file is unpacked into a freshly created volume named
// <volume>. Per-volume failures are logged and skipped; the only error
// returned is a failure to list dumpDir (a missing dumpDir returns nil).
func (m *Manager) restoreDockerVolumesFromDir(stackName, dumpDir string) error {
	entries, err := os.ReadDir(dumpDir)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return fmt.Errorf("reading volume dump dir: %w", err)
	}

	var restored int
	for _, entry := range entries {
		if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".tar") {
			continue
		}
		volName := strings.TrimSuffix(entry.Name(), ".tar")
		if volName == "" {
			// A file named exactly ".tar" has no volume name to restore into.
			m.logger.Printf("[WARN] [backup] Skipping volume dump %q for %s: empty volume name", entry.Name(), stackName)
			continue
		}
		m.logger.Printf("[INFO] [backup] Restoring Docker volume %s for %s (Tier 2)", volName, stackName)

		// Remove any existing volume so the restore starts clean. This stays
		// best-effort, but a failure is logged because the tar would then be
		// unpacked over whatever the volume already contains.
		if out, err := exec.Command("docker", "volume", "rm", "-f", volName).CombinedOutput(); err != nil {
			m.logger.Printf("[WARN] [backup] Could not remove existing volume %s: %s — %v (continuing)", volName, strings.TrimSpace(string(out)), err)
		}

		if out, err := exec.Command("docker", "volume", "create", volName).CombinedOutput(); err != nil {
			m.logger.Printf("[WARN] [backup] Failed to create volume %s: %s — %v", volName, strings.TrimSpace(string(out)), err)
			continue
		}

		// Unpack the tar inside a throwaway container; dumpDir is mounted read-only.
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		cmd := exec.CommandContext(ctx, "docker", "run", "--rm",
			"-v", volName+":/vol",
			"-v", dumpDir+":/in:ro",
			"alpine", "tar", "xf", "/in/"+entry.Name(), "-C", "/vol")
		out, err := cmd.CombinedOutput()
		cancel()

		if err != nil {
			m.logger.Printf("[WARN] [backup] Failed to populate volume %s: %s — %v", volName, strings.TrimSpace(string(out)), err)
			continue
		}
		restored++
	}

	if restored > 0 {
		m.logger.Printf("[INFO] [backup] Restored %d Docker volume(s) for %s (Tier 2)", restored, stackName)
	}
	return nil
}

// restoreDockerVolumes populates Docker volumes from tar files in the volume dump directory.
func (m *Manager) restoreDockerVolumes(stackName, drivePath string) error { dumpDir := AppVolumeDumpPath(drivePath, stackName) entries, err := os.ReadDir(dumpDir) if err != nil { if os.IsNotExist(err) { return nil // No volume dumps to restore } return fmt.Errorf("reading volume dump dir: %w", err) } var restored int for _, entry := range entries { if entry.IsDir() || !strings.HasSuffix(entry.Name(), ".tar") { continue } volName := strings.TrimSuffix(entry.Name(), ".tar") m.logger.Printf("[INFO] [backup] Restoring Docker volume %s for %s", volName, stackName) // Remove existing volume (ignore errors — may not exist) exec.Command("docker", "volume", "rm", "-f", volName).Run() // Create fresh volume if out, err := exec.Command("docker", "volume", "create", volName).CombinedOutput(); err != nil { m.logger.Printf("[WARN] [backup] Failed to create volume %s: %s — %v", volName, strings.TrimSpace(string(out)), err) continue } // Populate from tar ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) cmd := exec.CommandContext(ctx, "docker", "run", "--rm", "-v", volName+":/vol", "-v", dumpDir+":/in:ro", "alpine", "tar", "xf", "/in/"+entry.Name(), "-C", "/vol") out, err := cmd.CombinedOutput() cancel() if err != nil { m.logger.Printf("[WARN] [backup] Failed to populate volume %s: %s — %v", volName, strings.TrimSpace(string(out)), err) continue } restored++ if m.isDebug() { m.logger.Printf("[DEBUG] [backup] Volume %s restored successfully", volName) } } if restored > 0 { m.logger.Printf("[INFO] [backup] Restored %d Docker volume(s) for %s", restored, stackName) } return nil } // waitForHealthy waits for a stack to reach running state after restore. // Forces a docker ps refresh on each poll to avoid stale state. // Acceptable overhead for a rare operation (restore). 
func (m *Manager) waitForHealthy(stackName string, timeout time.Duration) error { deadline := time.Now().Add(timeout) interval := 5 * time.Second time.Sleep(3 * time.Second) // initial settling time for time.Now().Before(deadline) { if m.stackProvider == nil { return fmt.Errorf("no stack provider") } if m.stackProvider.RefreshAndIsRunning(stackName) { if m.isDebug() { m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s is running", stackName) } return nil } if m.isDebug() { m.logger.Printf("[DEBUG] [backup] Post-restore health check: %s not yet running, waiting...", stackName) } time.Sleep(interval) } return fmt.Errorf("stack %s did not reach running state within %s after restore", stackName, timeout) }