v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups

Phase 2 (Monitoring & Health):
- Central job scheduler replacing ad-hoc goroutines (internal/scheduler)
- CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go)
- Temperature reading from /sys/class/thermal + /host/sys (Docker mount)
- Load average from /proc/loadavg
- Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go)
- System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go)

Phase 3 (Backups):
- Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go)
- Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes
- Restic backup integration with auto-password generation (internal/backup/restic.go)
- Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go)
- Manual backup trigger via dashboard button and POST /api/backup/run

Dashboard UI:
- CPU usage bar with load average display
- Temperature with colored indicator dot
- Backup status card with last run time, DB count, repo stats
- "Mentés most" ("Back up now") button for manual backup trigger

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-15 11:17:10 +01:00
parent 8a988c5998
commit d32d9fb44b
21 changed files with 2060 additions and 82 deletions
+66 -45
View File
@@ -12,9 +12,13 @@ import (
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/api"
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
"gitea.dooplex.hu/admin/felhom-controller/internal/web"
)
@@ -61,11 +65,70 @@ func main() {
syncer.Start()
defer syncer.Stop()
// --- Graceful shutdown context ---
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// --- Start CPU collector ---
cpuCollector := system.NewCPUCollector(5 * time.Second)
cpuCollector.Start(ctx)
defer cpuCollector.Stop()
// --- Initialize health pinger ---
pinger := monitor.NewPinger(&cfg.Monitoring, logger)
// --- Initialize backup manager ---
var backupMgr *backup.Manager
if cfg.Backup.Enabled {
backupMgr = backup.NewManager(cfg, pinger, logger)
}
// --- Initialize scheduler ---
sched := scheduler.New(logger)
// Existing periodic tasks (migrated from ad-hoc goroutines)
sched.Every("status-refresh", 30*time.Second, func(ctx context.Context) error {
return stackMgr.RefreshStatus()
})
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
return stackMgr.ScanStacks()
})
// System health ping
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
if err != nil {
healthInterval = 5 * time.Minute
}
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
report := monitor.RunHealthCheck(cfg, cpuCollector)
body := report.FormatMessage()
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
if report.Status == "fail" {
pinger.Fail(healthUUID, body)
} else {
pinger.Ping(healthUUID, body)
}
return nil
})
// Backup daily jobs
if cfg.Backup.Enabled && backupMgr != nil {
sched.Daily("db-dump", cfg.Backup.DBDumpSchedule, func(ctx context.Context) error {
return backupMgr.RunDBDumps(ctx)
})
sched.Daily("backup", cfg.Backup.ResticSchedule, func(ctx context.Context) error {
return backupMgr.RunBackup(ctx)
})
}
sched.Start(ctx)
defer sched.Stop()
// --- Initialize API router ---
apiRouter := api.NewRouter(cfg, stackMgr, syncer, logger)
apiRouter := api.NewRouter(cfg, stackMgr, syncer, cpuCollector, backupMgr, logger)
// --- Initialize web server ---
webServer := web.NewServer(cfg, stackMgr, logger, Version)
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, logger, Version)
// --- Build HTTP mux ---
mux := http.NewServeMux()
@@ -86,10 +149,6 @@ func main() {
IdleTimeout: 120 * time.Second,
}
// --- Graceful shutdown ---
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
@@ -106,44 +165,6 @@ func main() {
}
}()
// --- Start background tasks ---
// Periodic container status refresh (lightweight — just runs docker ps)
go func() {
ticker := time.NewTicker(30 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if err := stackMgr.RefreshStatus(); err != nil {
logger.Printf("[WARN] Status refresh failed: %v", err)
}
}
}
}()
// Periodic stack scan (discovers new/removed stacks from disk)
// Runs less frequently since it reads the filesystem.
// This allows adding new stacks without restarting the controller.
go func() {
ticker := time.NewTicker(2 * time.Minute)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if err := stackMgr.ScanStacks(); err != nil {
logger.Printf("[WARN] Periodic stack scan failed: %v", err)
} else {
logger.Printf("[DEBUG] Periodic stack scan completed")
}
}
}
}()
logger.Printf("[INFO] Web UI listening on %s", cfg.Web.Listen)
if err := server.ListenAndServe(); err != http.ErrServerClosed {
logger.Fatalf("[FATAL] HTTP server error: %v", err)
@@ -161,4 +182,4 @@ func setupLogger(cfg *config.Config) *log.Logger {
}
return logger
}
}