feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)

Replace external Healthchecks.io with Hub-native monitoring. New events
table + /api/v1/event endpoint for structured events from controllers.
Staleness checker (60s) detects unresponsive nodes. Backup deadline
checker (daily 05:00) catches missed backups. Notification dispatcher
sends operator (English) + customer (Hungarian) emails via Resend with
per-event cooldowns. Event timeline on customer page, dashboard badges.
Config form deprecates Monitoring UUIDs section.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-20 18:53:24 +01:00
parent b4cb92e09f
commit 3217cb4751
16 changed files with 1319 additions and 64 deletions
+87 -18
View File
@@ -13,6 +13,8 @@ import (
"time"
"gitea.dooplex.hu/admin/felhom-hub/internal/api"
"gitea.dooplex.hu/admin/felhom-hub/internal/monitor"
"gitea.dooplex.hu/admin/felhom-hub/internal/notify"
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
"gitea.dooplex.hu/admin/felhom-hub/internal/web"
"gopkg.in/yaml.v3"
@@ -32,8 +34,10 @@ type Config struct {
ReportAPIKey string `yaml:"report_api_key"`
} `yaml:"api"`
Notifications struct {
ResendAPIKey string `yaml:"resend_api_key"`
FromEmail string `yaml:"from_email"`
ResendAPIKey string `yaml:"resend_api_key"`
FromEmail string `yaml:"from_email"`
OperatorEmail string `yaml:"operator_email"`
OperatorEnabled bool `yaml:"operator_enabled"`
} `yaml:"notifications"`
Retention struct {
MaxDays int `yaml:"max_days"`
@@ -119,6 +123,18 @@ func main() {
templateProvider = templateFetcher
}
apiHandler := api.New(dataStore, cfg.API.ReportAPIKey, cfg.Notifications.ResendAPIKey, cfg.Notifications.FromEmail, templateProvider, logger)
// Initialize notification dispatcher
dispatcher := notify.NewDispatcher(
dataStore,
cfg.Notifications.ResendAPIKey,
cfg.Notifications.FromEmail,
cfg.Notifications.OperatorEmail,
cfg.Notifications.OperatorEnabled,
logger,
)
apiHandler.SetDispatcher(dispatcher)
webServer := web.New(dataStore, cfg.Auth.PasswordHash, cfg.API.ReportAPIKey, staleThreshold, logger)
webServer.SetTemplateFetcher(templateFetcher)
@@ -127,6 +143,11 @@ func main() {
// Health check endpoint — bypasses auth (for k8s probes)
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
if err := dataStore.Ping(); err != nil {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte("db unhealthy"))
return
}
w.WriteHeader(http.StatusOK)
w.Write([]byte("ok"))
})
@@ -165,10 +186,34 @@ func main() {
}
webServer.SetVersionChecker(versionChecker)
// Prune on startup, then daily at configured time (default 04:30)
if cfg.Retention.MaxDays > 0 {
go pruneLoop(ctx, dataStore, cfg.Retention.MaxDays, logger)
pruneAll(dataStore, cfg.Retention.MaxDays, logger)
go scheduleDaily(ctx, "prune", cfg.Retention.PruneSchedule, func() {
pruneAll(dataStore, cfg.Retention.MaxDays, logger)
}, logger)
}
// Staleness checker — runs every 60s
stalenessChecker := monitor.NewStalenessChecker(dataStore, staleThreshold, dispatcher.ProcessEvent, logger)
go func() {
ticker := time.NewTicker(60 * time.Second)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
stalenessChecker.Check()
}
}
}()
// Backup deadline checker — runs daily at 05:00 Budapest
go scheduleDaily(ctx, "deadline-check", "05:00", func() {
monitor.CheckBackupDeadlines(dataStore, stalenessChecker, dispatcher.ProcessEvent, logger)
}, logger)
// Signal handling
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
@@ -244,28 +289,52 @@ func loadConfig(path string, logger *log.Logger) *Config {
return cfg
}
func pruneLoop(ctx context.Context, s *store.Store, maxDays int, logger *log.Logger) {
// Prune once on startup
if deleted, err := s.Prune(maxDays); err != nil {
logger.Printf("[WARN] Prune failed: %v", err)
} else if deleted > 0 {
logger.Printf("[INFO] Pruned %d old report rows", deleted)
// scheduleDaily runs fn once per day at the wall-clock time given by timeStr
// ("HH:MM") in the Europe/Budapest time zone. It blocks until ctx is
// cancelled, so callers are expected to start it in its own goroutine.
//
// name is used only as a prefix in log lines. timeStr is parsed with parseHM;
// an unparseable value therefore schedules the job at 00:00.
func scheduleDaily(ctx context.Context, name, timeStr string, fn func(), logger *log.Logger) {
	budapest, err := time.LoadLocation("Europe/Budapest")
	if err != nil {
		// Zone data is unavailable; fall back to a fixed UTC+1 offset. This
		// zone has no daylight-saving rules, so say so in the log.
		logger.Printf("[WARN] %s: loading Europe/Budapest failed, using fixed UTC+1: %v", name, err)
		budapest = time.FixedZone("CET", 3600)
	}

	hour, minute := parseHM(timeStr)

	for {
		now := time.Now().In(budapest)
		next := time.Date(now.Year(), now.Month(), now.Day(), hour, minute, 0, 0, budapest)
		if !next.After(now) {
			// Advance by one calendar day rather than by 24 hours. Adding a
			// fixed 24h drifts by an hour across a daylight-saving change and
			// can make the job fire twice on the autumn transition day.
			// time.Date normalizes a day value past the end of the month.
			next = time.Date(now.Year(), now.Month(), now.Day()+1, hour, minute, 0, 0, budapest)
		}

		delay := time.Until(next)
		logger.Printf("[INFO] %s: next run at %s (in %s)", name, next.Format("2006-01-02 15:04 MST"), delay.Round(time.Second))

		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			// Release the pending timer instead of leaving it to expire.
			timer.Stop()
			return
		case <-timer.C:
			fn()
		}
	}
}
// parseHM parses a 24-hour "HH:MM" string into hour and minute.
//
// It returns 0, 0 when s does not scan as two colon-separated integers, or
// when the hour is outside 0-23 or the minute outside 0-59. The range check
// matters because time.Date normalizes out-of-range fields, so a value such
// as "25:00" would otherwise be turned into a different time than written.
func parseHM(s string) (int, int) {
	var h, m int
	if _, err := fmt.Sscanf(s, "%d:%d", &h, &m); err != nil {
		return 0, 0
	}
	if h < 0 || h > 23 || m < 0 || m > 59 {
		return 0, 0
	}
	return h, m
}
// pruneAll runs the store's report prune followed by its event prune, passing
// maxDays to both. A failure in either step is logged as a warning and does
// not prevent the other step from running; the number of deleted rows is
// logged only when it is greater than zero.
func pruneAll(s *store.Store, maxDays int, logger *log.Logger) {
	reportRows, reportErr := s.Prune(maxDays)
	switch {
	case reportErr != nil:
		logger.Printf("[WARN] Prune reports failed: %v", reportErr)
	case reportRows > 0:
		logger.Printf("[INFO] Pruned %d old report rows", reportRows)
	}

	eventRows, eventErr := s.PruneEvents(maxDays)
	switch {
	case eventErr != nil:
		logger.Printf("[WARN] Prune events failed: %v", eventErr)
	case eventRows > 0:
		logger.Printf("[INFO] Pruned %d old event rows", eventRows)
	}
}