Files
deploy-felhom-compose/controller/internal/backup/backup.go
T
admin 1de244646b v0.12.0 — Backup page overhaul: unified app rows, bug fixes, sequential chaining
Bug fixes:
- GetFullStatus() returns deep copy; CrossDriveSummary/UnconfiguredApps/CrossDriveWarnings
  are always nil in the copy so the handler builds them fresh (fixes duplicate-apps bug)
- Replace binary IsMountPoint check with tiered CheckBackupDestination() — path-not-exist,
  not-writable, system-drive (warning), disk >90-95% full; shown as warning vs critical
- Remove dead settingsAppBackupHandler / POST /settings/app-backup route (toggle wrote
  to settings.json but nothing consumed the flag)

Architecture:
- Unified per-app backup rows: new AppBackupRow struct + buildAppBackupRows() replaces
  the two old sections with expandable rows showing all 3 layers per app
- Sequential backup chaining: cross-drive runs immediately after restic (removed
  independent cross-drive-daily/cross-drive-weekly scheduler jobs)
- Deploy page: remove "Csak kézi indítás" schedule option; add weekly consistency note

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-17 17:56:28 +01:00

687 lines
20 KiB
Go

package backup
import (
"context"
"fmt"
"log"
"os"
"path/filepath"
"strings"
"sync"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
)
// Manager orchestrates database dumps and restic backups.
type Manager struct {
cfg *config.Config
restic *ResticManager
logger *log.Logger
pinger *monitor.Pinger
settings *settings.Settings
stackProvider StackDataProvider
mu sync.Mutex
lastDBDump *DBDumpStatus
lastBackup *BackupStatus
running bool
snapshotHistory []SnapshotRecord // ring buffer, last 20 entries
lastCheckTime time.Time
lastCheckOK bool
// Cached status for page rendering (refreshed periodically)
cachedStatus *FullBackupStatus
cacheTime time.Time
// AfterBackup is called after a backup completes to refresh the cache.
// Set by main.go to avoid circular import with scheduler.
AfterBackup func()
}
// SnapshotRecord combines restic snapshot metadata with our run stats.
type SnapshotRecord struct {
SnapshotID string `json:"snapshot_id"`
Time time.Time `json:"time"`
FilesNew int `json:"files_new"`
FilesChanged int `json:"files_changed"`
DataAdded string `json:"data_added"`
Duration time.Duration `json:"duration"`
Success bool `json:"success"`
HasStats bool `json:"has_stats"` // false for historical entries loaded from restic
}
// CrossDriveSummaryItem holds display data for one app's cross-drive backup.
type CrossDriveSummaryItem struct {
StackName string
DisplayName string
Method string // "rsync" or "restic"
MethodLabel string // "Egyszerű másolat" or "Restic"
DestPath string
DestLabel string // storage path label
Schedule string
ScheduleLabel string // "Naponta" or "Hetente" or "Kézi"
LastStatus string // "ok", "error", "running", ""
LastRunShort string // formatted short time e.g. "03:15"
SizeHuman string
}
// FullBackupStatus contains everything the backup page needs.
type FullBackupStatus struct {
Enabled bool
Running bool
// DB Dumps
LastDBDump *DBDumpStatus
DumpFiles []DumpFileInfo
DiscoveredDBs []DiscoveredDB
// Restic
LastBackup *BackupStatus
SnapshotHistory []SnapshotRecord
RepoStats *RepoStats
// Schedule
DBDumpSchedule string
ResticSchedule string
PruneSchedule string
NextDBDump time.Time
NextBackup time.Time
Retention config.RetentionConfig
// Repository health
RepoPath string
BackupPaths []string
LastCheckTime time.Time
LastCheckOK bool
// Remote (placeholder)
RemoteEnabled bool
// App data backup
AppDataInfo []AppBackupInfo
// Cross-drive backup summary
CrossDriveSummary []CrossDriveSummaryItem
UnconfiguredApps []CrossDriveSummaryItem // apps with HDD data but no cross-drive config
CrossDriveWarnings []string // destination health warnings
// Flash messages (set by handlers, passed through redirect)
FlashSuccess string
FlashError string
}
// DBDumpStatus holds the last DB dump result.
type DBDumpStatus struct {
LastRun time.Time
Results []DumpResult
Success bool
Duration time.Duration
}
// BackupStatus holds the last backup result.
type BackupStatus struct {
LastRun time.Time
Snapshot *SnapshotResult
Success bool
Duration time.Duration
RepoStats *RepoStats
}
// NewManager creates a new backup manager.
func NewManager(cfg *config.Config, pinger *monitor.Pinger, sett *settings.Settings, logger *log.Logger) *Manager {
return &Manager{
cfg: cfg,
restic: NewResticManager(cfg, logger),
logger: logger,
pinger: pinger,
settings: sett,
}
}
// RunDBDumps discovers and dumps all databases.
func (m *Manager) RunDBDumps(ctx context.Context) error {
start := time.Now()
m.logger.Printf("[INFO] Starting database dump run")
dbs, err := DiscoverDatabases(ctx, m.logger)
if err != nil {
m.logger.Printf("[ERROR] Database discovery failed: %v", err)
return err
}
if len(dbs) == 0 {
m.logger.Printf("[INFO] No database containers found")
m.mu.Lock()
m.lastDBDump = &DBDumpStatus{
LastRun: time.Now(),
Success: true,
Duration: time.Since(start),
}
m.mu.Unlock()
return nil
}
m.logger.Printf("[INFO] Discovered %d database(s): %s", len(dbs), dbNames(dbs))
results := DumpAll(ctx, dbs, m.cfg.Paths.DBDumpDir, m.logger)
// Check results and persist validations
allOK := true
var summary []string
var totalSize int64
for _, r := range results {
if r.Error != nil {
allOK = false
summary = append(summary, fmt.Sprintf("FAIL %s: %v", r.DB.ContainerName, r.Error))
m.logger.Printf("[ERROR] DB dump failed for %s: %v", r.DB.ContainerName, r.Error)
} else {
totalSize += r.Size
summary = append(summary, fmt.Sprintf("OK %s (%s)", r.DB.ContainerName, formatBytes(r.Size)))
// Persist validation result to settings.json
if m.settings != nil && r.FilePath != "" {
filename := filepath.Base(r.FilePath)
cache := settings.DBValidationCache{
ValidatedAt: time.Now().Format(time.RFC3339),
TableCount: r.Validation.TableCount,
HasHeader: r.Validation.Valid,
}
if !r.Validation.Valid {
cache.Error = r.Validation.Error
}
if err := m.settings.SetDBValidation(filename, cache); err != nil {
m.logger.Printf("[WARN] Failed to cache validation for %s: %v", filename, err)
}
}
}
}
duration := time.Since(start)
m.mu.Lock()
m.lastDBDump = &DBDumpStatus{
LastRun: time.Now(),
Results: results,
Success: allOK,
Duration: duration,
}
m.mu.Unlock()
// Ping healthcheck
uuid := m.cfg.Monitoring.PingUUIDs.DBDump
body := fmt.Sprintf("DB dump: %d databases, %s total\n%s",
len(results), formatBytes(totalSize), strings.Join(summary, "\n"))
if allOK {
m.pinger.Ping(uuid, body)
m.logger.Printf("[INFO] DB dump completed: %d databases, %s total (%s)",
len(results), formatBytes(totalSize), duration.Round(time.Millisecond))
} else {
m.pinger.Fail(uuid, body)
return fmt.Errorf("some database dumps failed")
}
return nil
}
// RunBackup runs a restic backup snapshot.
func (m *Manager) RunBackup(ctx context.Context) error {
start := time.Now()
m.logger.Printf("[INFO] Starting restic backup")
// Ensure repo is initialized
if err := m.restic.EnsureInitialized(); err != nil {
m.logger.Printf("[ERROR] Restic init failed: %v", err)
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Restic init failed: %v", err))
return err
}
// Backup paths: base + dynamic app data
paths := []string{
m.cfg.Paths.StacksDir,
m.cfg.Paths.DBDumpDir,
"/opt/docker/felhom-controller/controller.yaml",
}
appPaths := m.resolveAppBackupPaths()
if len(appPaths) > 0 {
paths = append(paths, appPaths...)
m.logger.Printf("[INFO] Backup paths (%d total, %d app data): %v", len(paths), len(appPaths), paths)
}
tags := []string{"felhom", m.cfg.Customer.ID}
result, err := m.restic.Snapshot(paths, tags)
if err != nil {
m.logger.Printf("[ERROR] Restic backup failed: %v", err)
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Backup failed: %v", err))
m.mu.Lock()
m.lastBackup = &BackupStatus{
LastRun: time.Now(),
Success: false,
Duration: time.Since(start),
}
m.mu.Unlock()
return err
}
// Prune check (weekly — Sunday)
if shouldPrune(m.cfg.Backup.PruneSchedule) {
m.logger.Printf("[INFO] Running weekly prune")
if err := m.restic.Prune(m.cfg.Backup.Retention); err != nil {
m.logger.Printf("[WARN] Restic prune failed: %v", err)
}
checkErr := m.restic.Check()
if checkErr != nil {
m.logger.Printf("[WARN] Restic check failed: %v", checkErr)
}
m.mu.Lock()
m.lastCheckTime = time.Now()
m.lastCheckOK = checkErr == nil
m.mu.Unlock()
}
// Get stats
stats, _ := m.restic.Stats()
duration := time.Since(start)
m.mu.Lock()
m.lastBackup = &BackupStatus{
LastRun: time.Now(),
Snapshot: result,
Success: true,
Duration: duration,
RepoStats: stats,
}
// Append to snapshot history
m.appendSnapshotRecord(SnapshotRecord{
SnapshotID: result.SnapshotID,
Time: time.Now(),
FilesNew: result.FilesNew,
FilesChanged: result.FilesChanged,
DataAdded: result.DataAdded,
Duration: duration,
Success: true,
HasStats: true,
})
m.mu.Unlock()
body := fmt.Sprintf("Backup OK\nSnapshot: %s\nNew files: %d, Changed: %d\nData added: %s\nDuration: %s",
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
duration.Round(time.Second))
m.pinger.Ping(m.cfg.Monitoring.PingUUIDs.Backup, body)
m.logger.Printf("[INFO] Restic backup completed: snapshot %s, %d new, %d changed, %s added (%s)",
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
duration.Round(time.Millisecond))
// Refresh cache so the page shows updated data immediately
if m.AfterBackup != nil {
m.AfterBackup()
}
return nil
}
// RunIntegrityCheck runs restic check and pings healthchecks with the result.
func (m *Manager) RunIntegrityCheck(ctx context.Context) error {
m.logger.Printf("[INFO] Starting restic integrity check")
start := time.Now()
if err := m.restic.EnsureInitialized(); err != nil {
m.logger.Printf("[ERROR] Restic init failed for integrity check: %v", err)
return err
}
err := m.restic.Check()
duration := time.Since(start)
uuid := m.cfg.Monitoring.PingUUIDs.BackupIntegrity
m.mu.Lock()
m.lastCheckTime = time.Now()
m.lastCheckOK = err == nil
m.mu.Unlock()
if err != nil {
m.logger.Printf("[ERROR] Restic integrity check failed (%s): %v", duration.Round(time.Second), err)
m.pinger.Fail(uuid, fmt.Sprintf("restic check failed: %v", err))
return err
}
m.logger.Printf("[INFO] Restic integrity check passed (%s)", duration.Round(time.Second))
m.pinger.Ping(uuid, fmt.Sprintf("restic check passed (%s)", duration.Round(time.Second)))
return nil
}
// RunFullBackup runs DB dumps followed by restic backup.
func (m *Manager) RunFullBackup(ctx context.Context) error {
m.mu.Lock()
if m.running {
m.mu.Unlock()
return fmt.Errorf("backup already in progress")
}
m.running = true
m.mu.Unlock()
defer func() {
m.mu.Lock()
m.running = false
m.mu.Unlock()
}()
// Step 1: DB dumps
if err := m.RunDBDumps(ctx); err != nil {
m.logger.Printf("[WARN] DB dump had errors, continuing with backup anyway")
}
// Step 2: Restic backup
return m.RunBackup(ctx)
}
// GetStatus returns the current backup status.
func (m *Manager) GetStatus() (*DBDumpStatus, *BackupStatus) {
m.mu.Lock()
defer m.mu.Unlock()
return m.lastDBDump, m.lastBackup
}
// GetRepoStats returns repository statistics.
func (m *Manager) GetRepoStats() (*RepoStats, error) {
return m.restic.Stats()
}
// IsRunning returns whether a backup or restore is currently in progress.
func (m *Manager) IsRunning() bool {
m.mu.Lock()
defer m.mu.Unlock()
return m.running
}
// GetResticPassword returns the restic repository encryption password.
func (m *Manager) GetResticPassword() (string, error) {
return m.restic.GetPassword()
}
// ListSnapshots returns snapshots from the restic repository.
func (m *Manager) ListSnapshots(limit int) ([]SnapshotInfo, error) {
return m.restic.ListSnapshots(limit)
}
// SetStackProvider sets the stack data provider for app data discovery.
func (m *Manager) SetStackProvider(provider StackDataProvider) {
m.stackProvider = provider
}
// resolveAppBackupPaths returns HDD paths for all enabled app backups.
func (m *Manager) resolveAppBackupPaths() []string {
if m.stackProvider == nil || m.settings == nil {
return nil
}
appBackupMap := m.settings.GetAppBackupMap()
if len(appBackupMap) == 0 {
return nil
}
var paths []string
seen := make(map[string]bool)
for stackName, enabled := range appBackupMap {
if !enabled {
continue
}
hddMounts := m.stackProvider.GetStackHDDMounts(stackName)
for _, mount := range hddMounts {
if seen[mount] {
continue
}
if _, err := os.Stat(mount); err == nil {
paths = append(paths, mount)
seen[mount] = true
m.logger.Printf("[DEBUG] Including app data: %s (from %s)", mount, stackName)
}
}
}
return paths
}
func shouldPrune(schedule string) bool {
loc, err := time.LoadLocation("Europe/Budapest")
if err != nil {
loc = time.UTC
}
now := time.Now().In(loc)
switch strings.ToLower(schedule) {
case "weekly":
return now.Weekday() == time.Sunday
case "daily":
return true
default:
return now.Weekday() == time.Sunday
}
}
// appendSnapshotRecord adds a record to the ring buffer (max 20). Caller must hold m.mu.
func (m *Manager) appendSnapshotRecord(rec SnapshotRecord) {
m.snapshotHistory = append(m.snapshotHistory, rec)
if len(m.snapshotHistory) > 20 {
m.snapshotHistory = m.snapshotHistory[len(m.snapshotHistory)-20:]
}
}
// LoadSnapshotHistory populates the snapshot history from restic on startup.
func (m *Manager) LoadSnapshotHistory() {
snapshots, err := m.restic.ListSnapshots(20)
if err != nil {
m.logger.Printf("[WARN] Could not load snapshot history: %v", err)
return
}
m.mu.Lock()
defer m.mu.Unlock()
for _, s := range snapshots {
m.snapshotHistory = append(m.snapshotHistory, SnapshotRecord{
SnapshotID: s.ID,
Time: s.Time,
HasStats: false, // historical — no delta stats available
Success: true,
})
}
if len(m.snapshotHistory) > 20 {
m.snapshotHistory = m.snapshotHistory[len(m.snapshotHistory)-20:]
}
m.logger.Printf("[INFO] Loaded %d historical snapshots", len(m.snapshotHistory))
}
// RefreshCache updates the cached full status. Called by scheduler every 5 minutes
// and after each backup run.
func (m *Manager) RefreshCache(nextDBDump, nextBackup time.Time) {
status := &FullBackupStatus{
Enabled: m.cfg.Backup.Enabled,
DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
ResticSchedule: m.cfg.Backup.ResticSchedule,
PruneSchedule: m.cfg.Backup.PruneSchedule,
NextDBDump: nextDBDump,
NextBackup: nextBackup,
Retention: m.cfg.Backup.Retention,
RepoPath: m.cfg.Backup.ResticRepo,
BackupPaths: []string{
m.cfg.Paths.StacksDir,
m.cfg.Paths.DBDumpDir,
"/opt/docker/felhom-controller/controller.yaml",
},
}
// Expensive calls (outside lock)
if stats, err := m.restic.Stats(); err == nil {
status.RepoStats = stats
}
files, filesErr := ListDumpFiles(m.cfg.Paths.DBDumpDir)
if filesErr == nil {
status.DumpFiles = files
}
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if dbs, err := DiscoverDatabases(ctx, m.logger); err == nil {
status.DiscoveredDBs = dbs
}
// Discover app data (for per-app backup toggles)
if m.stackProvider != nil {
backupPrefs := m.settings.GetAppBackupMap()
status.AppDataInfo = DiscoverAppData(m.stackProvider, backupPrefs, status.DiscoveredDBs)
// Include enabled app backup paths in the displayed BackupPaths
appPaths := m.resolveAppBackupPaths()
if len(appPaths) > 0 {
status.BackupPaths = append(status.BackupPaths, appPaths...)
}
}
// Cross-check: if LastDBDump results have empty validation but files exist,
// re-validate from disk. This handles controller restarts and race conditions.
if m.lastDBDump != nil && filesErr == nil {
fileValidation := make(map[string]DumpValidation) // keyed by filename
for _, f := range files {
fileValidation[f.FileName] = f.Validation
}
for i, r := range m.lastDBDump.Results {
if !r.Validation.Valid && r.Validation.Error == "" && r.FilePath != "" {
filename := filepath.Base(r.FilePath)
if fv, ok := fileValidation[filename]; ok {
m.lastDBDump.Results[i].Validation = fv
m.logger.Printf("[INFO] Re-validated %s from disk: valid=%v tables=%d",
filename, fv.Valid, fv.TableCount)
}
}
}
}
// Fill in dynamic fields under lock
m.mu.Lock()
status.Running = m.running
status.LastDBDump = m.lastDBDump
status.LastBackup = m.lastBackup
status.LastCheckTime = m.lastCheckTime
status.LastCheckOK = m.lastCheckOK
status.SnapshotHistory = make([]SnapshotRecord, len(m.snapshotHistory))
copy(status.SnapshotHistory, m.snapshotHistory)
m.cachedStatus = status
m.cacheTime = time.Now()
m.mu.Unlock()
// Reverse so newest first
for i, j := 0, len(status.SnapshotHistory)-1; i < j; i, j = i+1, j-1 {
status.SnapshotHistory[i], status.SnapshotHistory[j] = status.SnapshotHistory[j], status.SnapshotHistory[i]
}
m.logger.Printf("[INFO] Backup status cache refreshed")
}
// GetFullStatus returns the cached backup status for page rendering.
// Returns instantly — no subprocess calls.
// Returns a deep copy so callers can safely append to slice fields without
// polluting the cache (which would cause duplicate entries on repeated calls).
func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupStatus {
m.mu.Lock()
defer m.mu.Unlock()
if m.cachedStatus != nil {
// Deep copy — callers (backupsHandler) append to CrossDriveSummary,
// UnconfiguredApps, and CrossDriveWarnings. If we returned the cache
// pointer directly, every page load would accumulate more entries.
status := *m.cachedStatus
status.AppDataInfo = make([]AppBackupInfo, len(m.cachedStatus.AppDataInfo))
copy(status.AppDataInfo, m.cachedStatus.AppDataInfo)
// These three slices are assembled by the handler from AppDataInfo + settings;
// they must always start empty so the handler builds them fresh.
status.CrossDriveSummary = nil
status.UnconfiguredApps = nil
status.CrossDriveWarnings = nil
// Update dynamic fields that don't need subprocess calls
status.Running = m.running
status.NextDBDump = nextDBDump
status.NextBackup = nextBackup
status.LastDBDump = m.lastDBDump
status.LastBackup = m.lastBackup
// Update snapshot history
status.SnapshotHistory = make([]SnapshotRecord, len(m.snapshotHistory))
copy(status.SnapshotHistory, m.snapshotHistory)
// Reverse so newest first
for i, j := 0, len(status.SnapshotHistory)-1; i < j; i, j = i+1, j-1 {
status.SnapshotHistory[i], status.SnapshotHistory[j] = status.SnapshotHistory[j], status.SnapshotHistory[i]
}
// Synthesize LastBackup from snapshot history if not in memory (e.g., after restart)
if status.LastBackup == nil && len(status.SnapshotHistory) > 0 {
latest := status.SnapshotHistory[0] // already reversed, newest first
status.LastBackup = &BackupStatus{
LastRun: latest.Time,
Success: latest.Success,
Snapshot: &SnapshotResult{
SnapshotID: latest.SnapshotID,
},
}
}
// Synthesize LastDBDump from DumpFiles on disk if not in memory
if status.LastDBDump == nil && len(status.DumpFiles) > 0 {
var results []DumpResult
var latestTime time.Time
for _, f := range status.DumpFiles {
results = append(results, DumpResult{
DB: DiscoveredDB{StackName: f.StackName, DBType: f.DBType, ContainerName: f.StackName},
FilePath: f.FileName,
Size: f.Size,
})
if f.ModTime.After(latestTime) {
latestTime = f.ModTime
}
}
status.LastDBDump = &DBDumpStatus{
LastRun: latestTime,
Results: results,
Success: true,
}
}
return &status
}
// No cache yet — return a minimal status (first page load before cache is populated)
return &FullBackupStatus{
Enabled: m.cfg.Backup.Enabled,
Running: m.running,
LastDBDump: m.lastDBDump,
LastBackup: m.lastBackup,
DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
ResticSchedule: m.cfg.Backup.ResticSchedule,
PruneSchedule: m.cfg.Backup.PruneSchedule,
NextDBDump: nextDBDump,
NextBackup: nextBackup,
Retention: m.cfg.Backup.Retention,
RepoPath: m.cfg.Backup.ResticRepo,
LastCheckTime: m.lastCheckTime,
LastCheckOK: m.lastCheckOK,
BackupPaths: []string{
m.cfg.Paths.StacksDir,
m.cfg.Paths.DBDumpDir,
"/opt/docker/felhom-controller/controller.yaml",
},
}
}
func dbNames(dbs []DiscoveredDB) string {
var names []string
for _, db := range dbs {
names = append(names, fmt.Sprintf("%s(%s)", db.ContainerName, db.DBType))
}
return strings.Join(names, ", ")
}