feat: drive migration & Tier 2 restic deprecation (v0.18.0)
Phase 1: Deprecate restic as a Tier 2 method (rsync only); auto-migrate on startup.
Phase 2: Enhanced per-app migration with backup awareness, DB dump copy, and auto-cleanup.
Phase 3: Full drive migration with decommissioned state, rollback support, and wizard UI.
Phase 4: Hub report includes decommissioned drive state.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -43,6 +43,10 @@ type Manager struct {
|
||||
// AfterBackup is called after a backup completes to refresh the cache.
|
||||
// Set by main.go to avoid circular import with scheduler.
|
||||
AfterBackup func()
|
||||
|
||||
// MigrationActiveCheck returns true if a full drive migration is in progress.
|
||||
// Set by main.go to coordinate with DriveMigrator.
|
||||
MigrationActiveCheck func() bool
|
||||
}
|
||||
|
||||
// SnapshotRecord combines restic snapshot metadata with our run stats.
|
||||
@@ -243,12 +247,17 @@ func (m *Manager) runDBDumpsInternal(ctx context.Context) error {
|
||||
for _, db := range dbs {
|
||||
drivePath := m.GetAppDrivePath(db.StackName)
|
||||
|
||||
// Skip if drive is disconnected
|
||||
// Skip if drive is disconnected or decommissioned
|
||||
if m.settings != nil && m.settings.IsDisconnected(drivePath) {
|
||||
m.logger.Printf("[WARN] Skipping DB dump for %s — drive disconnected: %s", db.StackName, drivePath)
|
||||
summary = append(summary, fmt.Sprintf("SKIP %s (drive disconnected)", db.ContainerName))
|
||||
continue
|
||||
}
|
||||
if m.settings != nil && m.settings.IsDecommissioned(drivePath) {
|
||||
m.logger.Printf("[WARN] Skipping DB dump for %s — drive decommissioned: %s", db.StackName, drivePath)
|
||||
summary = append(summary, fmt.Sprintf("SKIP %s (drive decommissioned)", db.ContainerName))
|
||||
continue
|
||||
}
|
||||
|
||||
dumpDir := AppDBDumpPath(drivePath, db.StackName)
|
||||
|
||||
@@ -319,6 +328,12 @@ func (m *Manager) RunBackup(ctx context.Context) error {
|
||||
|
||||
// runBackupInternal is the implementation of RunBackup. Caller must hold the running flag.
|
||||
func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
// Skip if a full drive migration is in progress
|
||||
if m.MigrationActiveCheck != nil && m.MigrationActiveCheck() {
|
||||
m.logger.Printf("[WARN] Skipping nightly backup — drive migration in progress")
|
||||
return nil
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
m.logger.Printf("[INFO] Starting restic backup (per-drive)")
|
||||
|
||||
@@ -339,68 +354,14 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
driveCount := 0
|
||||
|
||||
for drivePath, stacks := range driveStacks {
|
||||
// Skip disconnected drives
|
||||
if m.settings != nil && m.settings.IsDisconnected(drivePath) {
|
||||
m.logger.Printf("[WARN] Skipping backup for drive %s — disconnected", drivePath)
|
||||
continue
|
||||
}
|
||||
|
||||
repoPath := PrimaryResticRepoPath(drivePath)
|
||||
|
||||
// Ensure repo is initialized
|
||||
if err := m.restic.EnsureInitialized(repoPath); err != nil {
|
||||
m.logger.Printf("[ERROR] Restic init failed for %s: %v", repoPath, err)
|
||||
anyErr = err
|
||||
continue
|
||||
}
|
||||
|
||||
// Build paths for this drive
|
||||
var paths []string
|
||||
paths = append(paths, infraPaths...)
|
||||
|
||||
for _, stack := range stacks {
|
||||
// App data (appdata/<stack>/)
|
||||
appData := AppDataDir(drivePath, stack.Name)
|
||||
if _, err := os.Stat(appData); err == nil {
|
||||
paths = append(paths, appData)
|
||||
}
|
||||
// HDD mounts (for apps with custom mount points)
|
||||
if m.stackProvider != nil {
|
||||
for _, mount := range m.stackProvider.GetStackHDDMounts(stack.Name) {
|
||||
if _, err := os.Stat(mount); err == nil {
|
||||
paths = append(paths, mount)
|
||||
}
|
||||
}
|
||||
}
|
||||
// DB dumps for this stack
|
||||
dumpDir := AppDBDumpPath(drivePath, stack.Name)
|
||||
if _, err := os.Stat(dumpDir); err == nil {
|
||||
paths = append(paths, dumpDir)
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate paths
|
||||
paths = dedup(paths)
|
||||
|
||||
tags := []string{"felhom", m.cfg.Customer.ID, filepath.Base(drivePath)}
|
||||
m.logger.Printf("[INFO] Backing up drive %s (%d apps, %d paths)", drivePath, len(stacks), len(paths))
|
||||
|
||||
result, err := m.restic.Snapshot(repoPath, paths, tags)
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Restic backup failed for drive %s: %v", drivePath, err)
|
||||
anyErr = err
|
||||
continue
|
||||
}
|
||||
|
||||
lastResult = result
|
||||
driveCount++
|
||||
|
||||
// Prune check (weekly — Sunday)
|
||||
if shouldPrune(m.cfg.Backup.PruneSchedule) {
|
||||
m.logger.Printf("[INFO] Running weekly prune for %s", repoPath)
|
||||
if err := m.restic.Prune(repoPath, m.cfg.Backup.Retention); err != nil {
|
||||
m.logger.Printf("[WARN] Restic prune failed for %s: %v", repoPath, err)
|
||||
}
|
||||
if result != nil {
|
||||
lastResult = result
|
||||
driveCount++
|
||||
}
|
||||
}
|
||||
|
||||
@@ -463,6 +424,120 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
return anyErr
|
||||
}
|
||||
|
||||
// backupDrive runs restic backup for a single drive. Returns nil result if skipped.
|
||||
// Caller must hold the running flag.
|
||||
func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []StackSummary, infraPaths []string) (*SnapshotResult, error) {
|
||||
// Skip disconnected or decommissioned drives
|
||||
if m.settings != nil && m.settings.IsDisconnected(drivePath) {
|
||||
m.logger.Printf("[WARN] Skipping backup for drive %s — disconnected", drivePath)
|
||||
return nil, nil
|
||||
}
|
||||
if m.settings != nil && m.settings.IsDecommissioned(drivePath) {
|
||||
m.logger.Printf("[WARN] Skipping backup for drive %s — decommissioned", drivePath)
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
repoPath := PrimaryResticRepoPath(drivePath)
|
||||
|
||||
// Ensure repo is initialized
|
||||
if err := m.restic.EnsureInitialized(repoPath); err != nil {
|
||||
m.logger.Printf("[ERROR] Restic init failed for %s: %v", repoPath, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Build paths for this drive
|
||||
var paths []string
|
||||
paths = append(paths, infraPaths...)
|
||||
|
||||
for _, stack := range stacks {
|
||||
// App data (appdata/<stack>/)
|
||||
appData := AppDataDir(drivePath, stack.Name)
|
||||
if _, err := os.Stat(appData); err == nil {
|
||||
paths = append(paths, appData)
|
||||
}
|
||||
// HDD mounts (for apps with custom mount points)
|
||||
if m.stackProvider != nil {
|
||||
for _, mount := range m.stackProvider.GetStackHDDMounts(stack.Name) {
|
||||
if _, err := os.Stat(mount); err == nil {
|
||||
paths = append(paths, mount)
|
||||
}
|
||||
}
|
||||
}
|
||||
// DB dumps for this stack
|
||||
dumpDir := AppDBDumpPath(drivePath, stack.Name)
|
||||
if _, err := os.Stat(dumpDir); err == nil {
|
||||
paths = append(paths, dumpDir)
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate paths
|
||||
paths = dedup(paths)
|
||||
|
||||
tags := []string{"felhom", m.cfg.Customer.ID, filepath.Base(drivePath)}
|
||||
m.logger.Printf("[INFO] Backing up drive %s (%d apps, %d paths)", drivePath, len(stacks), len(paths))
|
||||
|
||||
result, err := m.restic.Snapshot(repoPath, paths, tags)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Restic backup failed for drive %s: %v", drivePath, err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Prune check (weekly — Sunday)
|
||||
if shouldPrune(m.cfg.Backup.PruneSchedule) {
|
||||
m.logger.Printf("[INFO] Running weekly prune for %s", repoPath)
|
||||
if err := m.restic.Prune(repoPath, m.cfg.Backup.Retention); err != nil {
|
||||
m.logger.Printf("[WARN] Restic prune failed for %s: %v", repoPath, err)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// tryAcquireRunning attempts to set the running flag without blocking.
|
||||
// Returns true if acquired, false if already running.
|
||||
func (m *Manager) tryAcquireRunning() bool {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if m.running {
|
||||
return false
|
||||
}
|
||||
m.running = true
|
||||
return true
|
||||
}
|
||||
|
||||
// TryRunDriveBackup runs a backup for a single drive if no other backup is in progress.
|
||||
// Returns error if the backup lock cannot be acquired or if backup fails.
|
||||
func (m *Manager) TryRunDriveBackup(ctx context.Context, drivePath string) error {
|
||||
if !m.tryAcquireRunning() {
|
||||
return fmt.Errorf("backup already in progress")
|
||||
}
|
||||
defer m.releaseRunning()
|
||||
|
||||
driveStacks := m.groupStacksByDrive()
|
||||
stacks, ok := driveStacks[drivePath]
|
||||
if !ok || len(stacks) == 0 {
|
||||
m.logger.Printf("[INFO] No deployed stacks on drive %s — skipping backup", drivePath)
|
||||
return nil
|
||||
}
|
||||
|
||||
infraPaths := []string{
|
||||
m.cfg.Paths.StacksDir,
|
||||
"/opt/docker/felhom-controller/controller.yaml",
|
||||
}
|
||||
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if result != nil {
|
||||
m.logger.Printf("[INFO] Single-drive backup for %s: snapshot %s, %d new, %d changed, %s added",
|
||||
drivePath, result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunIntegrityCheck runs restic check on all primary repos and pings healthchecks.
|
||||
func (m *Manager) RunIntegrityCheck(ctx context.Context) error {
|
||||
m.logger.Printf("[INFO] Starting restic integrity check")
|
||||
@@ -596,13 +671,12 @@ func (m *Manager) ListSnapshots(limit int) ([]SnapshotInfo, error) {
|
||||
return allSnapshots, nil
|
||||
}
|
||||
|
||||
// ListAllSnapshots returns snapshots from both primary and secondary restic repos.
|
||||
// Primary snapshots get Tier=1, secondary snapshots get Tier=2.
|
||||
// ListAllSnapshots returns snapshots from primary restic repos across all active drives.
|
||||
// All snapshots get Tier=1.
|
||||
func (m *Manager) ListAllSnapshots(limit int) ([]SnapshotInfo, error) {
|
||||
drives := m.activeDrives()
|
||||
var allSnapshots []SnapshotInfo
|
||||
|
||||
// Tier 1: primary repos (same as ListSnapshots)
|
||||
for _, drive := range drives {
|
||||
repoPath := PrimaryResticRepoPath(drive)
|
||||
if !m.restic.RepoExists(repoPath) {
|
||||
@@ -620,32 +694,6 @@ func (m *Manager) ListAllSnapshots(limit int) ([]SnapshotInfo, error) {
|
||||
allSnapshots = append(allSnapshots, snapshots...)
|
||||
}
|
||||
|
||||
// Tier 2: secondary restic repos on cross-drive destinations
|
||||
if m.settings != nil {
|
||||
destPaths := make(map[string]bool)
|
||||
for _, cfg := range m.settings.GetAllCrossDriveConfigs() {
|
||||
if cfg != nil && cfg.Method == "restic" && cfg.DestinationPath != "" {
|
||||
destPaths[cfg.DestinationPath] = true
|
||||
}
|
||||
}
|
||||
for destPath := range destPaths {
|
||||
repoPath := SecondaryResticRepoPath(destPath)
|
||||
if !m.restic.RepoExists(repoPath) {
|
||||
continue
|
||||
}
|
||||
snapshots, err := m.restic.ListSnapshots(repoPath, 0)
|
||||
if err != nil {
|
||||
m.logger.Printf("[WARN] Could not list secondary snapshots from %s: %v", repoPath, err)
|
||||
continue
|
||||
}
|
||||
for i := range snapshots {
|
||||
snapshots[i].RepoPath = repoPath
|
||||
snapshots[i].Tier = 2
|
||||
}
|
||||
allSnapshots = append(allSnapshots, snapshots...)
|
||||
}
|
||||
}
|
||||
|
||||
// Sort newest first
|
||||
sort.Slice(allSnapshots, func(i, j int) bool {
|
||||
return allSnapshots[i].Time.After(allSnapshots[j].Time)
|
||||
|
||||
@@ -102,8 +102,8 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
})
|
||||
|
||||
start := time.Now()
|
||||
r.logger.Printf("[INFO] Cross-drive backup starting: %s → %s (method: %s)",
|
||||
stackName, cfg.DestinationPath, cfg.Method)
|
||||
r.logger.Printf("[INFO] Cross-drive backup starting: %s → %s (rsync)",
|
||||
stackName, cfg.DestinationPath)
|
||||
|
||||
// Trigger fresh DB dump for this app before cross-drive backup
|
||||
if r.dbDumper != nil {
|
||||
@@ -130,15 +130,7 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
}
|
||||
}
|
||||
|
||||
var runErr error
|
||||
switch cfg.Method {
|
||||
case "rsync":
|
||||
runErr = r.runRsyncBackup(ctx, stackName, cfg.DestinationPath, mounts)
|
||||
case "restic":
|
||||
runErr = r.runResticBackup(ctx, stackName, cfg.DestinationPath, mounts)
|
||||
default:
|
||||
runErr = fmt.Errorf("unknown backup method: %s", cfg.Method)
|
||||
}
|
||||
runErr := r.runRsyncBackup(ctx, stackName, cfg.DestinationPath, mounts)
|
||||
|
||||
duration := time.Since(start)
|
||||
|
||||
@@ -150,11 +142,9 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
|
||||
// Calculate backup size
|
||||
var sizeHuman string
|
||||
if cfg.Method == "rsync" {
|
||||
destDir := AppSecondaryRsyncPath(cfg.DestinationPath, stackName)
|
||||
if sz, err := dirSizeBytes(destDir); err == nil {
|
||||
sizeHuman = humanizeBytes(sz)
|
||||
}
|
||||
destDir := AppSecondaryRsyncPath(cfg.DestinationPath, stackName)
|
||||
if sz, err := dirSizeBytes(destDir); err == nil {
|
||||
sizeHuman = humanizeBytes(sz)
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Cross-drive backup completed: %s (%s)", stackName, duration.Round(time.Second))
|
||||
@@ -209,6 +199,18 @@ func (r *CrossDriveRunner) IsRunning(stackName string) bool {
|
||||
return r.running[stackName]
|
||||
}
|
||||
|
||||
// AnyRunning returns true if any cross-drive backup is currently in progress.
|
||||
func (r *CrossDriveRunner) AnyRunning() bool {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
for _, running := range r.running {
|
||||
if running {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ValidateDestination checks that the destination path exists, is writable,
|
||||
// and has sufficient free space. System-drive destinations get stricter limits
|
||||
// (≥10 GB free, <90% used) to protect OS stability; external drives just need
|
||||
@@ -217,6 +219,9 @@ func (r *CrossDriveRunner) ValidateDestination(path string) error {
|
||||
if path == "" {
|
||||
return fmt.Errorf("destination path is empty")
|
||||
}
|
||||
if r.sett.IsDecommissioned(path) {
|
||||
return fmt.Errorf("destination %s is decommissioned — choose an active drive", path)
|
||||
}
|
||||
if _, err := os.Stat(path); os.IsNotExist(err) {
|
||||
return fmt.Errorf("destination %s does not exist", path)
|
||||
}
|
||||
@@ -326,108 +331,6 @@ func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBa
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- restic ---
|
||||
|
||||
func (r *CrossDriveRunner) runResticBackup(ctx context.Context, stackName, destBase string, mounts []string) error {
|
||||
repoPath := SecondaryResticRepoPath(destBase)
|
||||
|
||||
// Get or create the cross-drive restic password
|
||||
password, err := r.sett.GetOrCreateCrossDrivePassword()
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting restic password: %w", err)
|
||||
}
|
||||
|
||||
// H6: Write password to temp file with safe cleanup order (close before deferred remove).
|
||||
pwFile, err := os.CreateTemp("", "felhom-crossdrive-pw-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating password file: %w", err)
|
||||
}
|
||||
pwPath := pwFile.Name()
|
||||
if _, err := pwFile.WriteString(password); err != nil {
|
||||
pwFile.Close()
|
||||
os.Remove(pwPath)
|
||||
return fmt.Errorf("writing password file: %w", err)
|
||||
}
|
||||
pwFile.Close()
|
||||
defer os.Remove(pwPath)
|
||||
|
||||
// Ensure repo is initialized
|
||||
if err := r.ensureResticRepo(ctx, repoPath, pwPath); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Run restic backup
|
||||
args := []string{
|
||||
"backup", "--repo", repoPath,
|
||||
"--password-file", pwPath,
|
||||
"--tag", stackName,
|
||||
"--tag", "cross-drive",
|
||||
}
|
||||
// Include user data (HDD mounts)
|
||||
args = append(args, mounts...)
|
||||
// Include app config dir (compose + app.yaml + .felhom.yml)
|
||||
if composePath, ok := r.stackProvider.GetStackComposePath(stackName); ok {
|
||||
args = append(args, filepath.Dir(composePath))
|
||||
}
|
||||
// Include DB dump dir for this app (from its home drive)
|
||||
appDrive := r.getAppDrivePath(stackName)
|
||||
dumpDir := AppDBDumpPath(appDrive, stackName)
|
||||
if _, err := os.Stat(dumpDir); err == nil {
|
||||
args = append(args, dumpDir)
|
||||
}
|
||||
|
||||
// Include infrastructure paths (same as primary restic)
|
||||
args = append(args, r.stacksDir)
|
||||
if _, err := os.Stat(r.controllerYAMLPath); err == nil {
|
||||
args = append(args, r.controllerYAMLPath)
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "restic", args...)
|
||||
r.logger.Printf("[DEBUG] restic backup: %v", args)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("restic backup failed: %v (%s)", err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
// H5: Prune old snapshots to prevent unbounded accumulation.
|
||||
return r.pruneResticRepo(ctx, repoPath, pwPath)
|
||||
}
|
||||
|
||||
// pruneResticRepo forgets old snapshots in a cross-drive restic repo, keeping recent ones.
|
||||
func (r *CrossDriveRunner) pruneResticRepo(ctx context.Context, repoPath, pwPath string) error {
|
||||
args := []string{
|
||||
"forget", "--repo", repoPath,
|
||||
"--password-file", pwPath,
|
||||
"--keep-daily", "7",
|
||||
"--keep-weekly", "4",
|
||||
"--prune",
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, "restic", args...)
|
||||
r.logger.Printf("[DEBUG] restic forget (prune): %s", repoPath)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
// Non-fatal: log warning but don't fail the backup
|
||||
r.logger.Printf("[WARN] restic forget failed for %s: %v (%s)", repoPath, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *CrossDriveRunner) ensureResticRepo(ctx context.Context, repoPath, pwFile string) error {
|
||||
// Check if repo config exists
|
||||
if _, err := os.Stat(filepath.Join(repoPath, "config")); err == nil {
|
||||
return nil // already initialized
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(repoPath, 0755); err != nil {
|
||||
return fmt.Errorf("creating restic repo dir: %w", err)
|
||||
}
|
||||
|
||||
cmd := exec.CommandContext(ctx, "restic", "init", "--repo", repoPath, "--password-file", pwFile)
|
||||
r.logger.Printf("[INFO] Initializing cross-drive restic repo at %s", repoPath)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("restic init failed: %v (%s)", err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// copyStackDBDumps copies DB dump files for the given stack from its home drive.
|
||||
// DB dumps are at <drive>/backups/primary/<stack>/db-dumps/<stack>_<dbtype>.sql.
|
||||
func (r *CrossDriveRunner) copyStackDBDumps(stackName, destDir string) error {
|
||||
@@ -537,11 +440,11 @@ func (r *CrossDriveRunner) AutoEnableSmallApps() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Find destination: first storage path that differs from the app's home drive
|
||||
// Find destination: first active storage path that differs from the app's home drive
|
||||
appDrive := r.getAppDrivePath(stack.Name)
|
||||
var destPath string
|
||||
for _, sp := range storagePaths {
|
||||
if sp.Path != appDrive {
|
||||
if sp.Path != appDrive && !sp.Disconnected && !sp.Decommissioned {
|
||||
destPath = sp.Path
|
||||
break
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user