v0.6.0: healthcheck + hub reporting implementation
- Add heartbeat ping (every 5 min, controller alive signal)
- Add backup integrity check (weekly restic check, Sunday 04:00)
- Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig
- Add HubConfig for central hub reporting
- Add report package (types, builder, pusher) for hub push
- Wire hub reporting into scheduler (configurable interval)
- Update controller.yaml.example with new monitoring + hub sections
- Add monitoring/DEPRECATED.md for legacy bash scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import (
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
|
||||
@@ -118,16 +119,22 @@ func main() {
|
||||
return stackMgr.ScanStacks()
|
||||
})
|
||||
|
||||
// Heartbeat — lightweight "I'm alive" signal
|
||||
sched.Every("heartbeat", 5*time.Minute, func(ctx context.Context) error {
|
||||
pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "")
|
||||
return nil
|
||||
})
|
||||
|
||||
// System health ping
|
||||
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
|
||||
if err != nil {
|
||||
healthInterval = 5 * time.Minute
|
||||
}
|
||||
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
||||
report := monitor.RunHealthCheck(cfg, cpuCollector)
|
||||
body := report.FormatMessage()
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector)
|
||||
body := healthReport.FormatMessage()
|
||||
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
||||
if report.Status == "fail" {
|
||||
if healthReport.Status == "fail" {
|
||||
pinger.Fail(healthUUID, body)
|
||||
} else {
|
||||
pinger.Ping(healthUUID, body)
|
||||
@@ -144,6 +151,14 @@ func main() {
|
||||
return backupMgr.RunBackup(ctx)
|
||||
})
|
||||
|
||||
// Weekly integrity check — Sunday 04:00
|
||||
sched.Daily("backup-integrity", "04:00", func(ctx context.Context) error {
|
||||
if time.Now().Weekday() != time.Sunday {
|
||||
return nil
|
||||
}
|
||||
return backupMgr.RunIntegrityCheck(ctx)
|
||||
})
|
||||
|
||||
// Cache refresh: every 5 minutes
|
||||
sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
|
||||
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
||||
@@ -165,6 +180,20 @@ func main() {
|
||||
})
|
||||
}
|
||||
|
||||
// --- Central hub reporting ---
|
||||
if cfg.Hub.Enabled && cfg.Hub.URL != "" {
|
||||
pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval)
|
||||
if err != nil {
|
||||
pushInterval = 15 * time.Minute
|
||||
}
|
||||
pusher := report.NewPusher(&cfg.Hub, logger)
|
||||
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
|
||||
r := report.BuildReport(cfg, stackMgr, backupMgr, cpuCollector, metricsStore, Version)
|
||||
return pusher.Push(r)
|
||||
})
|
||||
logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL)
|
||||
}
|
||||
|
||||
sched.Start(ctx)
|
||||
defer sched.Stop()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user