v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups
Phase 2 (Monitoring & Health):
- Central job scheduler replacing ad-hoc goroutines (internal/scheduler)
- CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go)
- Temperature reading from /sys/class/thermal + /host/sys (Docker mount)
- Load average from /proc/loadavg
- Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go)
- System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go)

Phase 3 (Backups):
- Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go)
- Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes
- Restic backup integration with auto-password generation (internal/backup/restic.go)
- Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go)
- Manual backup trigger via dashboard button and POST /api/backup/run

Dashboard UI:
- CPU usage bar with load average display
- Temperature with colored indicator dot
- Backup status card with last run time, DB count, repo stats
- "Mentés most" button for manual backup trigger

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,263 @@
|
||||
package backup
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||
)
|
||||
|
||||
// Manager orchestrates database dumps and restic backups.
|
||||
type Manager struct {
|
||||
cfg *config.Config
|
||||
restic *ResticManager
|
||||
logger *log.Logger
|
||||
pinger *monitor.Pinger
|
||||
|
||||
mu sync.Mutex
|
||||
lastDBDump *DBDumpStatus
|
||||
lastBackup *BackupStatus
|
||||
running bool
|
||||
}
|
||||
|
||||
// DBDumpStatus holds the last DB dump result.
|
||||
type DBDumpStatus struct {
|
||||
LastRun time.Time
|
||||
Results []DumpResult
|
||||
Success bool
|
||||
Duration time.Duration
|
||||
}
|
||||
|
||||
// BackupStatus holds the last backup result.
|
||||
type BackupStatus struct {
|
||||
LastRun time.Time
|
||||
Snapshot *SnapshotResult
|
||||
Success bool
|
||||
Duration time.Duration
|
||||
RepoStats *RepoStats
|
||||
}
|
||||
|
||||
// NewManager creates a new backup manager.
|
||||
func NewManager(cfg *config.Config, pinger *monitor.Pinger, logger *log.Logger) *Manager {
|
||||
return &Manager{
|
||||
cfg: cfg,
|
||||
restic: NewResticManager(cfg, logger),
|
||||
logger: logger,
|
||||
pinger: pinger,
|
||||
}
|
||||
}
|
||||
|
||||
// RunDBDumps discovers and dumps all databases.
|
||||
func (m *Manager) RunDBDumps(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
m.logger.Printf("[INFO] Starting database dump run")
|
||||
|
||||
dbs, err := DiscoverDatabases(ctx, m.logger)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Database discovery failed: %v", err)
|
||||
return err
|
||||
}
|
||||
|
||||
if len(dbs) == 0 {
|
||||
m.logger.Printf("[INFO] No database containers found")
|
||||
m.mu.Lock()
|
||||
m.lastDBDump = &DBDumpStatus{
|
||||
LastRun: time.Now(),
|
||||
Success: true,
|
||||
Duration: time.Since(start),
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
m.logger.Printf("[INFO] Discovered %d database(s): %s", len(dbs), dbNames(dbs))
|
||||
|
||||
results := DumpAll(ctx, dbs, m.cfg.Paths.DBDumpDir, m.logger)
|
||||
|
||||
// Check results
|
||||
allOK := true
|
||||
var summary []string
|
||||
var totalSize int64
|
||||
for _, r := range results {
|
||||
if r.Error != nil {
|
||||
allOK = false
|
||||
summary = append(summary, fmt.Sprintf("FAIL %s: %v", r.DB.ContainerName, r.Error))
|
||||
m.logger.Printf("[ERROR] DB dump failed for %s: %v", r.DB.ContainerName, r.Error)
|
||||
} else {
|
||||
totalSize += r.Size
|
||||
summary = append(summary, fmt.Sprintf("OK %s (%s)", r.DB.ContainerName, formatBytes(r.Size)))
|
||||
}
|
||||
}
|
||||
|
||||
duration := time.Since(start)
|
||||
m.mu.Lock()
|
||||
m.lastDBDump = &DBDumpStatus{
|
||||
LastRun: time.Now(),
|
||||
Results: results,
|
||||
Success: allOK,
|
||||
Duration: duration,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
// Ping healthcheck
|
||||
uuid := m.cfg.Monitoring.PingUUIDs.DBDump
|
||||
body := fmt.Sprintf("DB dump: %d databases, %s total\n%s",
|
||||
len(results), formatBytes(totalSize), strings.Join(summary, "\n"))
|
||||
|
||||
if allOK {
|
||||
m.pinger.Ping(uuid, body)
|
||||
m.logger.Printf("[INFO] DB dump completed: %d databases, %s total (%s)",
|
||||
len(results), formatBytes(totalSize), duration.Round(time.Millisecond))
|
||||
} else {
|
||||
m.pinger.Fail(uuid, body)
|
||||
return fmt.Errorf("some database dumps failed")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunBackup runs a restic backup snapshot.
|
||||
func (m *Manager) RunBackup(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
m.logger.Printf("[INFO] Starting restic backup")
|
||||
|
||||
// Ensure repo is initialized
|
||||
if err := m.restic.EnsureInitialized(); err != nil {
|
||||
m.logger.Printf("[ERROR] Restic init failed: %v", err)
|
||||
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Restic init failed: %v", err))
|
||||
return err
|
||||
}
|
||||
|
||||
// Backup paths
|
||||
paths := []string{
|
||||
m.cfg.Paths.StacksDir,
|
||||
m.cfg.Paths.DBDumpDir,
|
||||
"/opt/docker/felhom-controller/controller.yaml",
|
||||
}
|
||||
tags := []string{"felhom", m.cfg.Customer.ID}
|
||||
|
||||
result, err := m.restic.Snapshot(paths, tags)
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Restic backup failed: %v", err)
|
||||
m.pinger.Fail(m.cfg.Monitoring.PingUUIDs.Backup, fmt.Sprintf("Backup failed: %v", err))
|
||||
|
||||
m.mu.Lock()
|
||||
m.lastBackup = &BackupStatus{
|
||||
LastRun: time.Now(),
|
||||
Success: false,
|
||||
Duration: time.Since(start),
|
||||
}
|
||||
m.mu.Unlock()
|
||||
return err
|
||||
}
|
||||
|
||||
// Prune check (weekly — Sunday)
|
||||
if shouldPrune(m.cfg.Backup.PruneSchedule) {
|
||||
m.logger.Printf("[INFO] Running weekly prune")
|
||||
if err := m.restic.Prune(m.cfg.Backup.Retention); err != nil {
|
||||
m.logger.Printf("[WARN] Restic prune failed: %v", err)
|
||||
}
|
||||
if err := m.restic.Check(); err != nil {
|
||||
m.logger.Printf("[WARN] Restic check failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Get stats
|
||||
stats, _ := m.restic.Stats()
|
||||
|
||||
duration := time.Since(start)
|
||||
m.mu.Lock()
|
||||
m.lastBackup = &BackupStatus{
|
||||
LastRun: time.Now(),
|
||||
Snapshot: result,
|
||||
Success: true,
|
||||
Duration: duration,
|
||||
RepoStats: stats,
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
body := fmt.Sprintf("Backup OK\nSnapshot: %s\nNew files: %d, Changed: %d\nData added: %s\nDuration: %s",
|
||||
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
|
||||
duration.Round(time.Second))
|
||||
m.pinger.Ping(m.cfg.Monitoring.PingUUIDs.Backup, body)
|
||||
|
||||
m.logger.Printf("[INFO] Restic backup completed: snapshot %s, %d new, %d changed, %s added (%s)",
|
||||
result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
|
||||
duration.Round(time.Millisecond))
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunFullBackup runs DB dumps followed by restic backup.
|
||||
func (m *Manager) RunFullBackup(ctx context.Context) error {
|
||||
m.mu.Lock()
|
||||
if m.running {
|
||||
m.mu.Unlock()
|
||||
return fmt.Errorf("backup already in progress")
|
||||
}
|
||||
m.running = true
|
||||
m.mu.Unlock()
|
||||
|
||||
defer func() {
|
||||
m.mu.Lock()
|
||||
m.running = false
|
||||
m.mu.Unlock()
|
||||
}()
|
||||
|
||||
// Step 1: DB dumps
|
||||
if err := m.RunDBDumps(ctx); err != nil {
|
||||
m.logger.Printf("[WARN] DB dump had errors, continuing with backup anyway")
|
||||
}
|
||||
|
||||
// Step 2: Restic backup
|
||||
return m.RunBackup(ctx)
|
||||
}
|
||||
|
||||
// GetStatus returns the current backup status.
|
||||
func (m *Manager) GetStatus() (*DBDumpStatus, *BackupStatus) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.lastDBDump, m.lastBackup
|
||||
}
|
||||
|
||||
// GetRepoStats returns repository statistics.
|
||||
func (m *Manager) GetRepoStats() (*RepoStats, error) {
|
||||
return m.restic.Stats()
|
||||
}
|
||||
|
||||
// IsRunning returns whether a backup is currently in progress.
|
||||
func (m *Manager) IsRunning() bool {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.running
|
||||
}
|
||||
|
||||
// shouldPrune reports whether a prune should run now for the given schedule.
//
// "daily" (case-insensitive) always prunes. "weekly" and any other value prune
// only on Sundays, evaluated in the Europe/Budapest time zone, falling back to
// UTC when that zone cannot be loaded.
func shouldPrune(schedule string) bool {
	if strings.ToLower(schedule) == "daily" {
		return true
	}

	// Everything else, including "weekly", is treated as a Sunday-only schedule.
	tz, err := time.LoadLocation("Europe/Budapest")
	if err != nil {
		tz = time.UTC
	}
	return time.Now().In(tz).Weekday() == time.Sunday
}
|
||||
|
||||
func dbNames(dbs []DiscoveredDB) string {
|
||||
var names []string
|
||||
for _, db := range dbs {
|
||||
names = append(names, fmt.Sprintf("%s(%s)", db.ContainerName, db.DBType))
|
||||
}
|
||||
return strings.Join(names, ", ")
|
||||
}
|
||||
Reference in New Issue
Block a user