97074e7a0c
- Add heartbeat ping (every 5 min, controller alive signal) - Add backup integrity check (weekly restic check, Sunday 04:00) - Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig - Add HubConfig for central hub reporting - Add report package (types, builder, pusher) for hub push - Wire hub reporting into scheduler (configurable interval) - Update controller.yaml.example with new monitoring + hub sections - Add monitoring/DEPRECATED.md for legacy bash scripts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
540 lines
15 KiB
Go
540 lines
15 KiB
Go
package backup
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
|
)
|
|
|
|
// Manager orchestrates database dumps and restic backups.
|
|
type Manager struct {
|
|
cfg *config.Config
|
|
restic *ResticManager
|
|
logger *log.Logger
|
|
pinger *monitor.Pinger
|
|
|
|
mu sync.Mutex
|
|
lastDBDump *DBDumpStatus
|
|
lastBackup *BackupStatus
|
|
running bool
|
|
snapshotHistory []SnapshotRecord // ring buffer, last 20 entries
|
|
lastCheckTime time.Time
|
|
lastCheckOK bool
|
|
|
|
// Cached status for page rendering (refreshed periodically)
|
|
cachedStatus *FullBackupStatus
|
|
cacheTime time.Time
|
|
|
|
// AfterBackup is called after a backup completes to refresh the cache.
|
|
// Set by main.go to avoid circular import with scheduler.
|
|
AfterBackup func()
|
|
}
|
|
|
|
// SnapshotRecord is one entry of the snapshot history: the identity of a
// restic snapshot plus, when available, the statistics of the run that
// produced it.
type SnapshotRecord struct {
	SnapshotID   string        `json:"snapshot_id"`
	Time         time.Time     `json:"time"`
	FilesNew     int           `json:"files_new"`
	FilesChanged int           `json:"files_changed"`
	DataAdded    string        `json:"data_added"`
	Duration     time.Duration `json:"duration"`
	Success      bool          `json:"success"`

	// HasStats is false for historical entries loaded from restic, which
	// carry no per-run statistics.
	HasStats bool `json:"has_stats"`
}
|
|
|
|
// FullBackupStatus contains everything the backup page needs.
|
|
type FullBackupStatus struct {
|
|
Enabled bool
|
|
Running bool
|
|
|
|
// DB Dumps
|
|
LastDBDump *DBDumpStatus
|
|
DumpFiles []DumpFileInfo
|
|
DiscoveredDBs []DiscoveredDB
|
|
|
|
// Restic
|
|
LastBackup *BackupStatus
|
|
SnapshotHistory []SnapshotRecord
|
|
RepoStats *RepoStats
|
|
|
|
// Schedule
|
|
DBDumpSchedule string
|
|
ResticSchedule string
|
|
PruneSchedule string
|
|
NextDBDump time.Time
|
|
NextBackup time.Time
|
|
Retention config.RetentionConfig
|
|
|
|
// Repository health
|
|
RepoPath string
|
|
BackupPaths []string
|
|
LastCheckTime time.Time
|
|
LastCheckOK bool
|
|
|
|
// Remote (placeholder)
|
|
RemoteEnabled bool
|
|
}
|
|
|
|
// DBDumpStatus holds the last DB dump result.
|
|
type DBDumpStatus struct {
|
|
LastRun time.Time
|
|
Results []DumpResult
|
|
Success bool
|
|
Duration time.Duration
|
|
}
|
|
|
|
// BackupStatus holds the last backup result.
|
|
type BackupStatus struct {
|
|
LastRun time.Time
|
|
Snapshot *SnapshotResult
|
|
Success bool
|
|
Duration time.Duration
|
|
RepoStats *RepoStats
|
|
}
|
|
|
|
// NewManager creates a new backup manager.
|
|
func NewManager(cfg *config.Config, pinger *monitor.Pinger, logger *log.Logger) *Manager {
|
|
return &Manager{
|
|
cfg: cfg,
|
|
restic: NewResticManager(cfg, logger),
|
|
logger: logger,
|
|
pinger: pinger,
|
|
}
|
|
}
|
|
|
|
// RunDBDumps discovers and dumps all databases.
|
|
func (m *Manager) RunDBDumps(ctx context.Context) error {
|
|
start := time.Now()
|
|
m.logger.Printf("[INFO] Starting database dump run")
|
|
|
|
dbs, err := DiscoverDatabases(ctx, m.logger)
|
|
if err != nil {
|
|
m.logger.Printf("[ERROR] Database discovery failed: %v", err)
|
|
return err
|
|
}
|
|
|
|
if len(dbs) == 0 {
|
|
m.logger.Printf("[INFO] No database containers found")
|
|
m.mu.Lock()
|
|
m.lastDBDump = &DBDumpStatus{
|
|
LastRun: time.Now(),
|
|
Success: true,
|
|
Duration: time.Since(start),
|
|
}
|
|
m.mu.Unlock()
|
|
return nil
|
|
}
|
|
|
|
m.logger.Printf("[INFO] Discovered %d database(s): %s", len(dbs), dbNames(dbs))
|
|
|
|
results := DumpAll(ctx, dbs, m.cfg.Paths.DBDumpDir, m.logger)
|
|
|
|
// Check results
|
|
allOK := true
|
|
var summary []string
|
|
var totalSize int64
|
|
for _, r := range results {
|
|
if r.Error != nil {
|
|
allOK = false
|
|
summary = append(summary, fmt.Sprintf("FAIL %s: %v", r.DB.ContainerName, r.Error))
|
|
m.logger.Printf("[ERROR] DB dump failed for %s: %v", r.DB.ContainerName, r.Error)
|
|
} else {
|
|
totalSize += r.Size
|
|
summary = append(summary, fmt.Sprintf("OK %s (%s)", r.DB.ContainerName, formatBytes(r.Size)))
|
|
}
|
|
}
|
|
|
|
duration := time.Since(start)
|
|
m.mu.Lock()
|
|
m.lastDBDump = &DBDumpStatus{
|
|
LastRun: time.Now(),
|
|
Results: results,
|
|
Success: allOK,
|
|
Duration: duration,
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
// Ping healthcheck
|
|
uuid := m.cfg.Monitoring.PingUUIDs.DBDump
|
|
body := fmt.Sprintf("DB dump: %d databases, %s total\n%s",
|
|
len(results), formatBytes(totalSize), strings.Join(summary, "\n"))
|
|
|
|
if allOK {
|
|
m.pinger.Ping(uuid, body)
|
|
m.logger.Printf("[INFO] DB dump completed: %d databases, %s total (%s)",
|
|
len(results), formatBytes(totalSize), duration.Round(time.Millisecond))
|
|
} else {
|
|
m.pinger.Fail(uuid, body)
|
|
return fmt.Errorf("some database dumps failed")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// RunBackup runs a restic backup snapshot.
|
|
func (m *Manager) RunBackup(ctx context.Context) error {
|
|
start := time.Now()
|
|
m.logger.Printf("[INFO] Starting restic backup")
|
|
|
|
// Ensure repo is initialized
|
|
if err := m.restic.EnsureInitialized(); err != nil {
|
|
m.logger.Printf("[ERROR] Restic init failed: %v", err)
|
|
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Restic init failed: %v", err))
|
|
return err
|
|
}
|
|
|
|
// Backup paths
|
|
paths := []string{
|
|
m.cfg.Paths.StacksDir,
|
|
m.cfg.Paths.DBDumpDir,
|
|
"/opt/docker/felhom-controller/controller.yaml",
|
|
}
|
|
tags := []string{"felhom", m.cfg.Customer.ID}
|
|
|
|
result, err := m.restic.Snapshot(paths, tags)
|
|
if err != nil {
|
|
m.logger.Printf("[ERROR] Restic backup failed: %v", err)
|
|
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Backup failed: %v", err))
|
|
|
|
m.mu.Lock()
|
|
m.lastBackup = &BackupStatus{
|
|
LastRun: time.Now(),
|
|
Success: false,
|
|
Duration: time.Since(start),
|
|
}
|
|
m.mu.Unlock()
|
|
return err
|
|
}
|
|
|
|
// Prune check (weekly — Sunday)
|
|
if shouldPrune(m.cfg.Backup.PruneSchedule) {
|
|
m.logger.Printf("[INFO] Running weekly prune")
|
|
if err := m.restic.Prune(m.cfg.Backup.Retention); err != nil {
|
|
m.logger.Printf("[WARN] Restic prune failed: %v", err)
|
|
}
|
|
checkErr := m.restic.Check()
|
|
if checkErr != nil {
|
|
m.logger.Printf("[WARN] Restic check failed: %v", checkErr)
|
|
}
|
|
m.mu.Lock()
|
|
m.lastCheckTime = time.Now()
|
|
m.lastCheckOK = checkErr == nil
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
// Get stats
|
|
stats, _ := m.restic.Stats()
|
|
|
|
duration := time.Since(start)
|
|
m.mu.Lock()
|
|
m.lastBackup = &BackupStatus{
|
|
LastRun: time.Now(),
|
|
Snapshot: result,
|
|
Success: true,
|
|
Duration: duration,
|
|
RepoStats: stats,
|
|
}
|
|
// Append to snapshot history
|
|
m.appendSnapshotRecord(SnapshotRecord{
|
|
SnapshotID: result.SnapshotID,
|
|
Time: time.Now(),
|
|
FilesNew: result.FilesNew,
|
|
FilesChanged: result.FilesChanged,
|
|
DataAdded: result.DataAdded,
|
|
Duration: duration,
|
|
Success: true,
|
|
HasStats: true,
|
|
})
|
|
m.mu.Unlock()
|
|
|
|
body := fmt.Sprintf("Backup OK\nSnapshot: %s\nNew files: %d, Changed: %d\nData added: %s\nDuration: %s",
|
|
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
|
|
duration.Round(time.Second))
|
|
m.pinger.Ping(m.cfg.Monitoring.PingUUIDs.Backup, body)
|
|
|
|
m.logger.Printf("[INFO] Restic backup completed: snapshot %s, %d new, %d changed, %s added (%s)",
|
|
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
|
|
duration.Round(time.Millisecond))
|
|
|
|
// Refresh cache so the page shows updated data immediately
|
|
if m.AfterBackup != nil {
|
|
m.AfterBackup()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// RunIntegrityCheck runs restic check and pings healthchecks with the result.
|
|
func (m *Manager) RunIntegrityCheck(ctx context.Context) error {
|
|
m.logger.Printf("[INFO] Starting restic integrity check")
|
|
start := time.Now()
|
|
|
|
if err := m.restic.EnsureInitialized(); err != nil {
|
|
m.logger.Printf("[ERROR] Restic init failed for integrity check: %v", err)
|
|
return err
|
|
}
|
|
|
|
err := m.restic.Check()
|
|
duration := time.Since(start)
|
|
|
|
uuid := m.cfg.Monitoring.PingUUIDs.BackupIntegrity
|
|
|
|
m.mu.Lock()
|
|
m.lastCheckTime = time.Now()
|
|
m.lastCheckOK = err == nil
|
|
m.mu.Unlock()
|
|
|
|
if err != nil {
|
|
m.logger.Printf("[ERROR] Restic integrity check failed (%s): %v", duration.Round(time.Second), err)
|
|
m.pinger.Fail(uuid, fmt.Sprintf("restic check failed: %v", err))
|
|
return err
|
|
}
|
|
|
|
m.logger.Printf("[INFO] Restic integrity check passed (%s)", duration.Round(time.Second))
|
|
m.pinger.Ping(uuid, fmt.Sprintf("restic check passed (%s)", duration.Round(time.Second)))
|
|
return nil
|
|
}
|
|
|
|
// RunFullBackup runs DB dumps followed by restic backup.
|
|
func (m *Manager) RunFullBackup(ctx context.Context) error {
|
|
m.mu.Lock()
|
|
if m.running {
|
|
m.mu.Unlock()
|
|
return fmt.Errorf("backup already in progress")
|
|
}
|
|
m.running = true
|
|
m.mu.Unlock()
|
|
|
|
defer func() {
|
|
m.mu.Lock()
|
|
m.running = false
|
|
m.mu.Unlock()
|
|
}()
|
|
|
|
// Step 1: DB dumps
|
|
if err := m.RunDBDumps(ctx); err != nil {
|
|
m.logger.Printf("[WARN] DB dump had errors, continuing with backup anyway")
|
|
}
|
|
|
|
// Step 2: Restic backup
|
|
return m.RunBackup(ctx)
|
|
}
|
|
|
|
// GetStatus returns the current backup status.
|
|
func (m *Manager) GetStatus() (*DBDumpStatus, *BackupStatus) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
return m.lastDBDump, m.lastBackup
|
|
}
|
|
|
|
// GetRepoStats returns repository statistics.
|
|
func (m *Manager) GetRepoStats() (*RepoStats, error) {
|
|
return m.restic.Stats()
|
|
}
|
|
|
|
// IsRunning returns whether a backup is currently in progress.
|
|
func (m *Manager) IsRunning() bool {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
return m.running
|
|
}
|
|
|
|
// shouldPrune reports whether a prune should run today for the given
// schedule. "daily" (case-insensitive) always prunes; "weekly" and every
// other value prune on Sundays only. The weekday is evaluated in the
// Europe/Budapest time zone, falling back to UTC if that zone cannot be
// loaded.
func shouldPrune(schedule string) bool {
	if strings.ToLower(schedule) == "daily" {
		return true
	}

	zone, err := time.LoadLocation("Europe/Budapest")
	if err != nil {
		zone = time.UTC
	}
	return time.Now().In(zone).Weekday() == time.Sunday
}
|
|
|
|
// appendSnapshotRecord adds a record to the ring buffer (max 20). Caller must hold m.mu.
|
|
func (m *Manager) appendSnapshotRecord(rec SnapshotRecord) {
|
|
m.snapshotHistory = append(m.snapshotHistory, rec)
|
|
if len(m.snapshotHistory) > 20 {
|
|
m.snapshotHistory = m.snapshotHistory[len(m.snapshotHistory)-20:]
|
|
}
|
|
}
|
|
|
|
// LoadSnapshotHistory populates the snapshot history from restic on startup.
|
|
func (m *Manager) LoadSnapshotHistory() {
|
|
snapshots, err := m.restic.ListSnapshots(20)
|
|
if err != nil {
|
|
m.logger.Printf("[WARN] Could not load snapshot history: %v", err)
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
for _, s := range snapshots {
|
|
m.snapshotHistory = append(m.snapshotHistory, SnapshotRecord{
|
|
SnapshotID: s.ID,
|
|
Time: s.Time,
|
|
HasStats: false, // historical — no delta stats available
|
|
Success: true,
|
|
})
|
|
}
|
|
if len(m.snapshotHistory) > 20 {
|
|
m.snapshotHistory = m.snapshotHistory[len(m.snapshotHistory)-20:]
|
|
}
|
|
m.logger.Printf("[INFO] Loaded %d historical snapshots", len(m.snapshotHistory))
|
|
}
|
|
|
|
// RefreshCache updates the cached full status. Called by scheduler every 5 minutes
|
|
// and after each backup run.
|
|
func (m *Manager) RefreshCache(nextDBDump, nextBackup time.Time) {
|
|
status := &FullBackupStatus{
|
|
Enabled: m.cfg.Backup.Enabled,
|
|
|
|
DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
|
|
ResticSchedule: m.cfg.Backup.ResticSchedule,
|
|
PruneSchedule: m.cfg.Backup.PruneSchedule,
|
|
NextDBDump: nextDBDump,
|
|
NextBackup: nextBackup,
|
|
Retention: m.cfg.Backup.Retention,
|
|
|
|
RepoPath: m.cfg.Backup.ResticRepo,
|
|
BackupPaths: []string{
|
|
m.cfg.Paths.StacksDir,
|
|
m.cfg.Paths.DBDumpDir,
|
|
"/opt/docker/felhom-controller/controller.yaml",
|
|
},
|
|
}
|
|
|
|
// Expensive calls (outside lock)
|
|
if stats, err := m.restic.Stats(); err == nil {
|
|
status.RepoStats = stats
|
|
}
|
|
if files, err := ListDumpFiles(m.cfg.Paths.DBDumpDir); err == nil {
|
|
status.DumpFiles = files
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
if dbs, err := DiscoverDatabases(ctx, m.logger); err == nil {
|
|
status.DiscoveredDBs = dbs
|
|
}
|
|
|
|
// Fill in dynamic fields under lock
|
|
m.mu.Lock()
|
|
status.Running = m.running
|
|
status.LastDBDump = m.lastDBDump
|
|
status.LastBackup = m.lastBackup
|
|
status.LastCheckTime = m.lastCheckTime
|
|
status.LastCheckOK = m.lastCheckOK
|
|
status.SnapshotHistory = make([]SnapshotRecord, len(m.snapshotHistory))
|
|
copy(status.SnapshotHistory, m.snapshotHistory)
|
|
m.cachedStatus = status
|
|
m.cacheTime = time.Now()
|
|
m.mu.Unlock()
|
|
|
|
// Reverse so newest first
|
|
for i, j := 0, len(status.SnapshotHistory)-1; i < j; i, j = i+1, j-1 {
|
|
status.SnapshotHistory[i], status.SnapshotHistory[j] = status.SnapshotHistory[j], status.SnapshotHistory[i]
|
|
}
|
|
|
|
m.logger.Printf("[INFO] Backup status cache refreshed")
|
|
}
|
|
|
|
// GetFullStatus returns the cached backup status for page rendering.
|
|
// Returns instantly — no subprocess calls.
|
|
func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupStatus {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.cachedStatus != nil {
|
|
// Update dynamic fields that don't need subprocess calls
|
|
m.cachedStatus.Running = m.running
|
|
m.cachedStatus.NextDBDump = nextDBDump
|
|
m.cachedStatus.NextBackup = nextBackup
|
|
m.cachedStatus.LastDBDump = m.lastDBDump
|
|
m.cachedStatus.LastBackup = m.lastBackup
|
|
// Update snapshot history
|
|
m.cachedStatus.SnapshotHistory = make([]SnapshotRecord, len(m.snapshotHistory))
|
|
copy(m.cachedStatus.SnapshotHistory, m.snapshotHistory)
|
|
// Reverse so newest first
|
|
for i, j := 0, len(m.cachedStatus.SnapshotHistory)-1; i < j; i, j = i+1, j-1 {
|
|
m.cachedStatus.SnapshotHistory[i], m.cachedStatus.SnapshotHistory[j] = m.cachedStatus.SnapshotHistory[j], m.cachedStatus.SnapshotHistory[i]
|
|
}
|
|
|
|
// Synthesize LastBackup from snapshot history if not in memory (e.g., after restart)
|
|
if m.cachedStatus.LastBackup == nil && len(m.cachedStatus.SnapshotHistory) > 0 {
|
|
latest := m.cachedStatus.SnapshotHistory[0] // already reversed, newest first
|
|
m.cachedStatus.LastBackup = &BackupStatus{
|
|
LastRun: latest.Time,
|
|
Success: latest.Success,
|
|
Snapshot: &SnapshotResult{
|
|
SnapshotID: latest.SnapshotID,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Synthesize LastDBDump from DumpFiles on disk if not in memory
|
|
if m.cachedStatus.LastDBDump == nil && len(m.cachedStatus.DumpFiles) > 0 {
|
|
var results []DumpResult
|
|
var latestTime time.Time
|
|
for _, f := range m.cachedStatus.DumpFiles {
|
|
results = append(results, DumpResult{
|
|
DB: DiscoveredDB{StackName: f.StackName, DBType: f.DBType, ContainerName: f.StackName},
|
|
FilePath: f.FileName,
|
|
Size: f.Size,
|
|
})
|
|
if f.ModTime.After(latestTime) {
|
|
latestTime = f.ModTime
|
|
}
|
|
}
|
|
m.cachedStatus.LastDBDump = &DBDumpStatus{
|
|
LastRun: latestTime,
|
|
Results: results,
|
|
Success: true,
|
|
}
|
|
}
|
|
|
|
return m.cachedStatus
|
|
}
|
|
|
|
// No cache yet — return a minimal status (first page load before cache is populated)
|
|
return &FullBackupStatus{
|
|
Enabled: m.cfg.Backup.Enabled,
|
|
Running: m.running,
|
|
LastDBDump: m.lastDBDump,
|
|
LastBackup: m.lastBackup,
|
|
DBDumpSchedule: m.cfg.Backup.DBDumpSchedule,
|
|
ResticSchedule: m.cfg.Backup.ResticSchedule,
|
|
PruneSchedule: m.cfg.Backup.PruneSchedule,
|
|
NextDBDump: nextDBDump,
|
|
NextBackup: nextBackup,
|
|
Retention: m.cfg.Backup.Retention,
|
|
RepoPath: m.cfg.Backup.ResticRepo,
|
|
LastCheckTime: m.lastCheckTime,
|
|
LastCheckOK: m.lastCheckOK,
|
|
BackupPaths: []string{
|
|
m.cfg.Paths.StacksDir,
|
|
m.cfg.Paths.DBDumpDir,
|
|
"/opt/docker/felhom-controller/controller.yaml",
|
|
},
|
|
}
|
|
}
|
|
|
|
func dbNames(dbs []DiscoveredDB) string {
|
|
var names []string
|
|
for _, db := range dbs {
|
|
names = append(names, fmt.Sprintf("%s(%s)", db.ContainerName, db.DBType))
|
|
}
|
|
return strings.Join(names, ", ")
|
|
}
|