feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)
Replace external Healthchecks.io with Hub-native monitoring. New events table + /api/v1/event endpoint for structured events from controllers. Staleness checker (60s) detects unresponsive nodes. Backup deadline checker (daily 05:00) catches missed backups. Notification dispatcher sends operator (English) + customer (Hungarian) emails via Resend with per-event cooldowns. Event timeline on customer page, dashboard badges. Config form deprecates Monitoring UUIDs section. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
+87
-18
@@ -13,6 +13,8 @@ import (
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-hub/internal/api"
|
||||
"gitea.dooplex.hu/admin/felhom-hub/internal/monitor"
|
||||
"gitea.dooplex.hu/admin/felhom-hub/internal/notify"
|
||||
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
|
||||
"gitea.dooplex.hu/admin/felhom-hub/internal/web"
|
||||
"gopkg.in/yaml.v3"
|
||||
@@ -32,8 +34,10 @@ type Config struct {
|
||||
ReportAPIKey string `yaml:"report_api_key"`
|
||||
} `yaml:"api"`
|
||||
Notifications struct {
|
||||
ResendAPIKey string `yaml:"resend_api_key"`
|
||||
FromEmail string `yaml:"from_email"`
|
||||
ResendAPIKey string `yaml:"resend_api_key"`
|
||||
FromEmail string `yaml:"from_email"`
|
||||
OperatorEmail string `yaml:"operator_email"`
|
||||
OperatorEnabled bool `yaml:"operator_enabled"`
|
||||
} `yaml:"notifications"`
|
||||
Retention struct {
|
||||
MaxDays int `yaml:"max_days"`
|
||||
@@ -119,6 +123,18 @@ func main() {
|
||||
templateProvider = templateFetcher
|
||||
}
|
||||
apiHandler := api.New(dataStore, cfg.API.ReportAPIKey, cfg.Notifications.ResendAPIKey, cfg.Notifications.FromEmail, templateProvider, logger)
|
||||
|
||||
// Initialize notification dispatcher
|
||||
dispatcher := notify.NewDispatcher(
|
||||
dataStore,
|
||||
cfg.Notifications.ResendAPIKey,
|
||||
cfg.Notifications.FromEmail,
|
||||
cfg.Notifications.OperatorEmail,
|
||||
cfg.Notifications.OperatorEnabled,
|
||||
logger,
|
||||
)
|
||||
apiHandler.SetDispatcher(dispatcher)
|
||||
|
||||
webServer := web.New(dataStore, cfg.Auth.PasswordHash, cfg.API.ReportAPIKey, staleThreshold, logger)
|
||||
webServer.SetTemplateFetcher(templateFetcher)
|
||||
|
||||
@@ -127,6 +143,11 @@ func main() {
|
||||
|
||||
// Health check endpoint — bypasses auth (for k8s probes)
|
||||
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
|
||||
if err := dataStore.Ping(); err != nil {
|
||||
w.WriteHeader(http.StatusServiceUnavailable)
|
||||
w.Write([]byte("db unhealthy"))
|
||||
return
|
||||
}
|
||||
w.WriteHeader(http.StatusOK)
|
||||
w.Write([]byte("ok"))
|
||||
})
|
||||
@@ -165,10 +186,34 @@ func main() {
|
||||
}
|
||||
webServer.SetVersionChecker(versionChecker)
|
||||
|
||||
// Prune on startup, then daily at configured time (default 04:30)
|
||||
if cfg.Retention.MaxDays > 0 {
|
||||
go pruneLoop(ctx, dataStore, cfg.Retention.MaxDays, logger)
|
||||
pruneAll(dataStore, cfg.Retention.MaxDays, logger)
|
||||
go scheduleDaily(ctx, "prune", cfg.Retention.PruneSchedule, func() {
|
||||
pruneAll(dataStore, cfg.Retention.MaxDays, logger)
|
||||
}, logger)
|
||||
}
|
||||
|
||||
// Staleness checker — runs every 60s
|
||||
stalenessChecker := monitor.NewStalenessChecker(dataStore, staleThreshold, dispatcher.ProcessEvent, logger)
|
||||
go func() {
|
||||
ticker := time.NewTicker(60 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
stalenessChecker.Check()
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
// Backup deadline checker — runs daily at 05:00 Budapest
|
||||
go scheduleDaily(ctx, "deadline-check", "05:00", func() {
|
||||
monitor.CheckBackupDeadlines(dataStore, stalenessChecker, dispatcher.ProcessEvent, logger)
|
||||
}, logger)
|
||||
|
||||
// Signal handling
|
||||
sigCh := make(chan os.Signal, 1)
|
||||
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||||
@@ -244,28 +289,52 @@ func loadConfig(path string, logger *log.Logger) *Config {
|
||||
return cfg
|
||||
}
|
||||
|
||||
func pruneLoop(ctx context.Context, s *store.Store, maxDays int, logger *log.Logger) {
|
||||
// Prune once on startup
|
||||
if deleted, err := s.Prune(maxDays); err != nil {
|
||||
logger.Printf("[WARN] Prune failed: %v", err)
|
||||
} else if deleted > 0 {
|
||||
logger.Printf("[INFO] Pruned %d old report rows", deleted)
|
||||
// scheduleDaily runs fn once daily at the given "HH:MM" time in Europe/Budapest.
|
||||
// It blocks until ctx is cancelled.
|
||||
func scheduleDaily(ctx context.Context, name, timeStr string, fn func(), logger *log.Logger) {
|
||||
budapest, err := time.LoadLocation("Europe/Budapest")
|
||||
if err != nil {
|
||||
budapest = time.FixedZone("CET", 3600)
|
||||
}
|
||||
|
||||
// Then daily
|
||||
ticker := time.NewTicker(24 * time.Hour)
|
||||
defer ticker.Stop()
|
||||
hour, min := parseHM(timeStr)
|
||||
|
||||
for {
|
||||
now := time.Now().In(budapest)
|
||||
next := time.Date(now.Year(), now.Month(), now.Day(), hour, min, 0, 0, budapest)
|
||||
if !next.After(now) {
|
||||
next = next.Add(24 * time.Hour)
|
||||
}
|
||||
delay := time.Until(next)
|
||||
logger.Printf("[INFO] %s: next run at %s (in %s)", name, next.Format("2006-01-02 15:04 MST"), delay.Round(time.Second))
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
if deleted, err := s.Prune(maxDays); err != nil {
|
||||
logger.Printf("[WARN] Prune failed: %v", err)
|
||||
} else if deleted > 0 {
|
||||
logger.Printf("[INFO] Pruned %d old report rows", deleted)
|
||||
}
|
||||
case <-time.After(delay):
|
||||
fn()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseHM parses "HH:MM" into hour and minute. Returns 0, 0 on invalid input.
|
||||
// parseHM parses "HH:MM" into hour and minute. Returns 0, 0 on invalid input.
//
// Input is invalid when it does not scan as two colon-separated decimal
// integers, or when the hour is outside 0-23 or the minute outside 0-59.
// Rejecting out-of-range values keeps callers from handing them to time.Date,
// which would silently normalize e.g. 25:00 into 01:00 of the following day.
func parseHM(s string) (int, int) {
	var h, m int
	if n, err := fmt.Sscanf(s, "%d:%d", &h, &m); err != nil || n != 2 {
		return 0, 0
	}
	if h < 0 || h > 23 || m < 0 || m > 59 {
		return 0, 0
	}
	return h, m
}
|
||||
|
||||
func pruneAll(s *store.Store, maxDays int, logger *log.Logger) {
|
||||
if deleted, err := s.Prune(maxDays); err != nil {
|
||||
logger.Printf("[WARN] Prune reports failed: %v", err)
|
||||
} else if deleted > 0 {
|
||||
logger.Printf("[INFO] Pruned %d old report rows", deleted)
|
||||
}
|
||||
if deleted, err := s.PruneEvents(maxDays); err != nil {
|
||||
logger.Printf("[WARN] Prune events failed: %v", err)
|
||||
} else if deleted > 0 {
|
||||
logger.Printf("[INFO] Pruned %d old event rows", deleted)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user