v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups

Phase 2 (Monitoring & Health):
- Central job scheduler replacing ad-hoc goroutines (internal/scheduler)
- CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go)
- Temperature reading from /sys/class/thermal + /host/sys (Docker mount)
- Load average from /proc/loadavg
- Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go)
- System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go)

Phase 3 (Backups):
- Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go)
- Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes
- Restic backup integration with auto-password generation (internal/backup/restic.go)
- Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go)
- Manual backup trigger via dashboard button and POST /api/backup/run

Dashboard UI:
- CPU usage bar with load average display
- Temperature with colored indicator dot
- Backup status card with last run time, DB count, repo stats
- "Mentés most" button for manual backup trigger

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 11:17:10 +01:00
parent 8a988c5998
commit d32d9fb44b
21 changed files with 2060 additions and 82 deletions
+263
View File
@@ -0,0 +1,263 @@
package backup
import (
"context"
"fmt"
"log"
"strings"
"sync"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
)
// Manager orchestrates database dumps and restic backups.
//
// It keeps the outcome of the most recent dump run and the most recent
// backup run, plus the flag RunFullBackup uses to reject overlapping runs.
type Manager struct {
// cfg is read for the dump/stack paths, ping UUIDs, customer ID and
// backup (prune/retention) settings.
cfg *config.Config
// restic performs the repository operations: init, snapshot, prune,
// check and stats.
restic *ResticManager
logger *log.Logger
// pinger receives the Ping/Fail reports about run outcomes.
pinger *monitor.Pinger
// mu guards lastDBDump, lastBackup and running.
mu sync.Mutex
// lastDBDump is nil until a dump run has recorded a result.
lastDBDump *DBDumpStatus
// lastBackup is nil until a backup run has recorded a result.
lastBackup *BackupStatus
// running is true while RunFullBackup is executing.
running bool
}
// DBDumpStatus holds the result of the most recent database dump run.
type DBDumpStatus struct {
// LastRun is the time at which the run's outcome was recorded.
LastRun time.Time
// Results holds the per-database dump results; it is left unset when no
// database containers were discovered.
Results []DumpResult
// Success reports whether the run finished without any dump error.
Success bool
// Duration is the elapsed time measured from the start of the run.
Duration time.Duration
}
// BackupStatus holds the result of the most recent restic backup run.
type BackupStatus struct {
// LastRun is the time at which the run's outcome was recorded.
LastRun time.Time
// Snapshot is the snapshot result of a successful run; it is not set
// when the run is recorded as failed.
Snapshot *SnapshotResult
// Success reports whether the snapshot was created.
Success bool
// Duration is the elapsed time measured from the start of the run.
Duration time.Duration
// RepoStats holds the repository statistics fetched after a successful
// snapshot; it is not set for a failed run.
// NOTE(review): presumably nil when the stats lookup itself fails —
// confirm against ResticManager.Stats.
RepoStats *RepoStats
}
// NewManager builds a backup Manager from the controller configuration.
//
// The restic helper is constructed here from the same cfg and logger, and
// pinger is kept for reporting run outcomes. The returned Manager has no
// recorded dump or backup status and is not marked as running.
func NewManager(cfg *config.Config, pinger *monitor.Pinger, logger *log.Logger) *Manager {
	mgr := new(Manager)
	mgr.cfg = cfg
	mgr.logger = logger
	mgr.pinger = pinger
	mgr.restic = NewResticManager(cfg, logger)
	return mgr
}
// RunDBDumps discovers the database containers and dumps each of them into
// the configured dump directory.
//
// Every outcome is stored for GetStatus and reported to the DB-dump
// healthcheck: a failed discovery is recorded as a failed run and sent as a
// Fail ping, and a run that finds no databases is recorded and pinged as a
// success. This keeps the dashboard and the healthcheck from showing the
// result of an older run.
//
// It returns the wrapped discovery error when discovery fails, a generic
// error when at least one dump failed, and nil otherwise.
func (m *Manager) RunDBDumps(ctx context.Context) error {
	start := time.Now()
	uuid := m.cfg.Monitoring.PingUUIDs.DBDump
	m.logger.Printf("[INFO] Starting database dump run")

	// record stores the outcome under the lock and returns the run duration
	// so the caller can log the same value that was stored.
	record := func(results []DumpResult, ok bool) time.Duration {
		elapsed := time.Since(start)
		m.mu.Lock()
		defer m.mu.Unlock()
		m.lastDBDump = &DBDumpStatus{
			LastRun:  time.Now(),
			Results:  results,
			Success:  ok,
			Duration: elapsed,
		}
		return elapsed
	}

	dbs, err := DiscoverDatabases(ctx, m.logger)
	if err != nil {
		m.logger.Printf("[ERROR] Database discovery failed: %v", err)
		// Without this the previous (possibly successful) status would stay
		// visible and the healthcheck would only notice via its timeout.
		record(nil, false)
		m.pinger.Fail(uuid, fmt.Sprintf("Database discovery failed: %v", err))
		return fmt.Errorf("discovering databases: %w", err)
	}

	if len(dbs) == 0 {
		m.logger.Printf("[INFO] No database containers found")
		record(nil, true)
		// The run itself succeeded, so keep the healthcheck alive.
		m.pinger.Ping(uuid, "DB dump: no database containers found")
		return nil
	}

	m.logger.Printf("[INFO] Discovered %d database(s): %s", len(dbs), dbNames(dbs))
	results := DumpAll(ctx, dbs, m.cfg.Paths.DBDumpDir, m.logger)

	// Build the per-database summary and total up the successful dumps.
	allOK := true
	summary := make([]string, 0, len(results))
	var totalSize int64
	for _, r := range results {
		if r.Error != nil {
			allOK = false
			summary = append(summary, fmt.Sprintf("FAIL %s: %v", r.DB.ContainerName, r.Error))
			m.logger.Printf("[ERROR] DB dump failed for %s: %v", r.DB.ContainerName, r.Error)
			continue
		}
		totalSize += r.Size
		summary = append(summary, fmt.Sprintf("OK %s (%s)", r.DB.ContainerName, formatBytes(r.Size)))
	}

	duration := record(results, allOK)

	// Ping healthcheck with the full summary in either case.
	body := fmt.Sprintf("DB dump: %d databases, %s total\n%s",
		len(results), formatBytes(totalSize), strings.Join(summary, "\n"))
	if !allOK {
		m.pinger.Fail(uuid, body)
		return fmt.Errorf("some database dumps failed")
	}

	m.pinger.Ping(uuid, body)
	m.logger.Printf("[INFO] DB dump completed: %d databases, %s total (%s)",
		len(results), formatBytes(totalSize), duration.Round(time.Millisecond))
	return nil
}
// RunBackup takes a restic snapshot of the stacks directory, the database
// dump directory and the controller configuration file.
//
// The repository is initialized first if needed. After a successful
// snapshot, prune and check run when the configured prune schedule is due;
// failures there are logged as warnings and do not fail the backup. A failed
// repository-statistics lookup is likewise logged as a warning.
//
// Both failure paths (repository init and snapshot) log the error, send a
// Fail ping to the backup healthcheck and record a failed BackupStatus, so
// GetStatus never keeps reporting an older successful run after a failure.
// The underlying error is returned unchanged.
//
// Note: ctx is not forwarded to any restic call made here.
func (m *Manager) RunBackup(ctx context.Context) error {
	start := time.Now()
	uuid := m.cfg.Monitoring.PingUUIDs.Backup
	m.logger.Printf("[INFO] Starting restic backup")

	// recordFailure stores a failed run under the lock.
	recordFailure := func() {
		m.mu.Lock()
		defer m.mu.Unlock()
		m.lastBackup = &BackupStatus{
			LastRun:  time.Now(),
			Success:  false,
			Duration: time.Since(start),
		}
	}

	// Ensure repo is initialized
	if err := m.restic.EnsureInitialized(); err != nil {
		m.logger.Printf("[ERROR] Restic init failed: %v", err)
		m.pinger.Fail(uuid, fmt.Sprintf("Restic init failed: %v", err))
		recordFailure()
		return err
	}

	// Backup paths
	paths := []string{
		m.cfg.Paths.StacksDir,
		m.cfg.Paths.DBDumpDir,
		"/opt/docker/felhom-controller/controller.yaml",
	}
	tags := []string{"felhom", m.cfg.Customer.ID}

	result, err := m.restic.Snapshot(paths, tags)
	if err != nil {
		m.logger.Printf("[ERROR] Restic backup failed: %v", err)
		m.pinger.Fail(uuid, fmt.Sprintf("Backup failed: %v", err))
		recordFailure()
		return err
	}

	// Prune and verify when the configured schedule says it is due. These
	// are maintenance steps: the snapshot already exists, so problems here
	// are only warnings.
	if shouldPrune(m.cfg.Backup.PruneSchedule) {
		m.logger.Printf("[INFO] Running weekly prune")
		if err := m.restic.Prune(m.cfg.Backup.Retention); err != nil {
			m.logger.Printf("[WARN] Restic prune failed: %v", err)
		}
		if err := m.restic.Check(); err != nil {
			m.logger.Printf("[WARN] Restic check failed: %v", err)
		}
	}

	// Repository statistics are informational only; surface a failure in the
	// log instead of dropping it, and store whatever Stats returned.
	stats, err := m.restic.Stats()
	if err != nil {
		m.logger.Printf("[WARN] Restic stats failed: %v", err)
	}

	duration := time.Since(start)
	m.mu.Lock()
	m.lastBackup = &BackupStatus{
		LastRun:   time.Now(),
		Snapshot:  result,
		Success:   true,
		Duration:  duration,
		RepoStats: stats,
	}
	m.mu.Unlock()

	body := fmt.Sprintf("Backup OK\nSnapshot: %s\nNew files: %d, Changed: %d\nData added: %s\nDuration: %s",
		result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
		duration.Round(time.Second))
	m.pinger.Ping(uuid, body)
	m.logger.Printf("[INFO] Restic backup completed: snapshot %s, %d new, %d changed, %s added (%s)",
		result.SnapshotID, result.FilesNew, result.FilesChanged, result.DataAdded,
		duration.Round(time.Millisecond))
	return nil
}
// RunFullBackup performs a complete backup cycle: database dumps first, then
// the restic snapshot.
//
// Only one full backup may run at a time; a call made while another is in
// progress returns an error immediately. A failing dump step is logged as a
// warning and does not stop the snapshot step, whose error (if any) is the
// return value.
func (m *Manager) RunFullBackup(ctx context.Context) error {
	// Claim the in-progress flag atomically with the check.
	m.mu.Lock()
	busy := m.running
	if !busy {
		m.running = true
	}
	m.mu.Unlock()

	if busy {
		return fmt.Errorf("backup already in progress")
	}

	// Release the flag however the run ends.
	defer func() {
		m.mu.Lock()
		defer m.mu.Unlock()
		m.running = false
	}()

	// Step 1: DB dumps — best effort, the snapshot still runs on failure.
	if dumpErr := m.RunDBDumps(ctx); dumpErr != nil {
		m.logger.Printf("[WARN] DB dump had errors, continuing with backup anyway")
	}

	// Step 2: Restic backup
	backupErr := m.RunBackup(ctx)
	return backupErr
}
// GetStatus returns the most recently recorded database dump status and
// backup status. Either value is nil if no such run has been recorded yet.
// Both pointers are read together under the manager's lock.
func (m *Manager) GetStatus() (*DBDumpStatus, *BackupStatus) {
	m.mu.Lock()
	dump, backup := m.lastDBDump, m.lastBackup
	m.mu.Unlock()
	return dump, backup
}
// GetRepoStats asks the restic manager for the repository statistics and
// passes its result and error through unchanged.
func (m *Manager) GetRepoStats() (*RepoStats, error) {
	stats, err := m.restic.Stats()
	return stats, err
}
// IsRunning reports whether a full backup started through RunFullBackup is
// currently in progress.
func (m *Manager) IsRunning() bool {
	m.mu.Lock()
	active := m.running
	m.mu.Unlock()
	return active
}
// shouldPrune reports whether a prune is due right now for the given
// schedule name.
//
// "daily" (matched case-insensitively) is always due. "weekly" and every
// other value, including the empty string, are due only on Sundays. The
// weekday is evaluated in the Europe/Budapest time zone, falling back to UTC
// when that zone cannot be loaded.
func shouldPrune(schedule string) bool {
	zone, err := time.LoadLocation("Europe/Budapest")
	if err != nil {
		zone = time.UTC
	}
	isSunday := time.Now().In(zone).Weekday() == time.Sunday

	// Anything that is not explicitly "daily" follows the weekly rule.
	return strings.ToLower(schedule) == "daily" || isSunday
}
// dbNames renders the discovered databases as a comma-separated list of
// "container(type)" entries for log output. An empty input yields "".
func dbNames(dbs []DiscoveredDB) string {
	var out strings.Builder
	for i, db := range dbs {
		if i > 0 {
			out.WriteString(", ")
		}
		fmt.Fprintf(&out, "%s(%s)", db.ContainerName, db.DBType)
	}
	return out.String()
}