v0.4.0: monitoring & backup — scheduler, CPU/temp metrics, healthchecks, restic backups
Phase 2 (Monitoring & Health):
- Central job scheduler replacing ad-hoc goroutines (internal/scheduler)
- CPU usage collector via /proc/stat background sampling (internal/system/cpu_linux.go)
- Temperature reading from /sys/class/thermal + /host/sys (Docker mount)
- Load average from /proc/loadavg
- Healthchecks.io-compatible HTTP pinger (internal/monitor/pinger.go)
- System health checks: disk, memory, CPU, temp, Docker, protected containers (internal/monitor/healthcheck.go)

Phase 3 (Backups):
- Database auto-discovery via docker ps + docker inspect (internal/backup/dbdump.go)
- Database dumping via docker exec (pg_dump / mariadb-dump) with atomic writes
- Restic backup integration with auto-password generation (internal/backup/restic.go)
- Backup orchestrator: DB dumps + restic snapshots + weekly prune (internal/backup/backup.go)
- Manual backup trigger via dashboard button and POST /api/backup/run

Dashboard UI:
- CPU usage bar with load average display
- Temperature with colored indicator dot
- Backup status card with last run time, DB count, repo stats
- "Mentés most" ("Back up now") button for manual backup trigger

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,9 +12,13 @@ import (
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/api"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/web"
|
||||
)
|
||||
|
||||
@@ -61,11 +65,70 @@ func main() {
|
||||
syncer.Start()
|
||||
defer syncer.Stop()
|
||||
|
||||
// --- Graceful shutdown context ---
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// --- Start CPU collector ---
|
||||
cpuCollector := system.NewCPUCollector(5 * time.Second)
|
||||
cpuCollector.Start(ctx)
|
||||
defer cpuCollector.Stop()
|
||||
|
||||
// --- Initialize health pinger ---
|
||||
pinger := monitor.NewPinger(&cfg.Monitoring, logger)
|
||||
|
||||
// --- Initialize backup manager ---
|
||||
var backupMgr *backup.Manager
|
||||
if cfg.Backup.Enabled {
|
||||
backupMgr = backup.NewManager(cfg, pinger, logger)
|
||||
}
|
||||
|
||||
// --- Initialize scheduler ---
|
||||
sched := scheduler.New(logger)
|
||||
|
||||
// Existing periodic tasks (migrated from ad-hoc goroutines)
|
||||
sched.Every("status-refresh", 30*time.Second, func(ctx context.Context) error {
|
||||
return stackMgr.RefreshStatus()
|
||||
})
|
||||
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
|
||||
return stackMgr.ScanStacks()
|
||||
})
|
||||
|
||||
// System health ping
|
||||
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
|
||||
if err != nil {
|
||||
healthInterval = 5 * time.Minute
|
||||
}
|
||||
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
||||
report := monitor.RunHealthCheck(cfg, cpuCollector)
|
||||
body := report.FormatMessage()
|
||||
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
||||
if report.Status == "fail" {
|
||||
pinger.Fail(healthUUID, body)
|
||||
} else {
|
||||
pinger.Ping(healthUUID, body)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Backup daily jobs
|
||||
if cfg.Backup.Enabled && backupMgr != nil {
|
||||
sched.Daily("db-dump", cfg.Backup.DBDumpSchedule, func(ctx context.Context) error {
|
||||
return backupMgr.RunDBDumps(ctx)
|
||||
})
|
||||
sched.Daily("backup", cfg.Backup.ResticSchedule, func(ctx context.Context) error {
|
||||
return backupMgr.RunBackup(ctx)
|
||||
})
|
||||
}
|
||||
|
||||
sched.Start(ctx)
|
||||
defer sched.Stop()
|
||||
|
||||
// --- Initialize API router ---
|
||||
apiRouter := api.NewRouter(cfg, stackMgr, syncer, logger)
|
||||
apiRouter := api.NewRouter(cfg, stackMgr, syncer, cpuCollector, backupMgr, logger)
|
||||
|
||||
// --- Initialize web server ---
|
||||
webServer := web.NewServer(cfg, stackMgr, logger, Version)
|
||||
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, logger, Version)
|
||||
|
||||
// --- Build HTTP mux ---
|
||||
mux := http.NewServeMux()
|
||||
@@ -86,10 +149,6 @@ func main() {
|
||||
IdleTimeout: 120 * time.Second,
|
||||
}
|
||||
|
||||
// --- Graceful shutdown ---
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
sigCh := make(chan os.Signal, 1)
|
||||
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||||
|
||||
@@ -106,44 +165,6 @@ func main() {
|
||||
}
|
||||
}()
|
||||
|
||||
// --- Start background tasks ---
|
||||
|
||||
// Periodic container status refresh (lightweight — just runs docker ps)
|
||||
go func() {
|
||||
ticker := time.NewTicker(30 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
if err := stackMgr.RefreshStatus(); err != nil {
|
||||
logger.Printf("[WARN] Status refresh failed: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Periodic stack scan (discovers new/removed stacks from disk)
|
||||
// Runs less frequently since it reads the filesystem.
|
||||
// This allows adding new stacks without restarting the controller.
|
||||
go func() {
|
||||
ticker := time.NewTicker(2 * time.Minute)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
if err := stackMgr.ScanStacks(); err != nil {
|
||||
logger.Printf("[WARN] Periodic stack scan failed: %v", err)
|
||||
} else {
|
||||
logger.Printf("[DEBUG] Periodic stack scan completed")
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
logger.Printf("[INFO] Web UI listening on %s", cfg.Web.Listen)
|
||||
if err := server.ListenAndServe(); err != http.ErrServerClosed {
|
||||
logger.Fatalf("[FATAL] HTTP server error: %v", err)
|
||||
@@ -161,4 +182,4 @@ func setupLogger(cfg *config.Config) *log.Logger {
|
||||
}
|
||||
|
||||
return logger
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user