feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)

Replace external Healthchecks.io with Hub-native monitoring. New events
table + /api/v1/event endpoint for structured events from controllers.
Staleness checker (60s) detects unresponsive nodes. Backup deadline
checker (daily 05:00) catches missed backups. Notification dispatcher
sends operator (English) + customer (Hungarian) emails via Resend with
per-event cooldowns. Event timeline on customer page, dashboard badges.
Config form deprecates Monitoring UUIDs section.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-20 18:53:24 +01:00
parent b4cb92e09f
commit 3217cb4751
16 changed files with 1319 additions and 64 deletions
+149 -11
View File
@@ -12,6 +12,7 @@ import (
"time"
"gitea.dooplex.hu/admin/felhom-hub/internal/configgen"
"gitea.dooplex.hu/admin/felhom-hub/internal/notify"
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
)
@@ -29,6 +30,7 @@ type Handler struct {
logger *log.Logger
httpClient *http.Client
templateProvider ConfigTemplateProvider
dispatcher *notify.Dispatcher
}
// New creates a new API handler.
@@ -44,23 +46,40 @@ func New(store *store.Store, apiKey, resendAPIKey, fromEmail string, templatePro
}
}
// SetDispatcher installs the notification dispatcher used for event-triggered emails.
// handleEvent hands every stored event to this dispatcher and skips dispatching
// entirely while the dispatcher is nil.
// NOTE(review): the field is assigned without synchronization — presumably this is
// called once during startup, before the handler serves requests; confirm.
func (h *Handler) SetDispatcher(d *notify.Dispatcher) {
h.dispatcher = d
}
// checkAuth reports whether the request carries a Bearer token accepted by
// checkAuthCustomer, i.e. the global API key or a per-customer API key.
// The authenticated identity is discarded; handlers that must tie the request
// to a specific customer call checkAuthCustomer directly instead.
func (h *Handler) checkAuth(r *http.Request) bool {
	_, _, authorized := h.checkAuthCustomer(r)
	return authorized
}
// checkAuthCustomer verifies the Bearer token and returns the authenticated customer identity.
//
// Results:
//   - per-customer key: (customerID, false, true)
//   - global key:       ("", true, true) — caller must allow any customer_id
//   - failure:          ("", false, false)
func (h *Handler) checkAuthCustomer(r *http.Request) (customerID string, isGlobal bool, ok bool) {
	auth := r.Header.Get("Authorization")
	if !strings.HasPrefix(auth, "Bearer ") {
		return "", false, false
	}
	token := strings.TrimPrefix(auth, "Bearer ")

	// An empty token can never be a valid credential. Reject it before the
	// per-customer lookup so it cannot match a customer whose API key is unset.
	// NOTE(review): confirm how GetCustomerConfigByAPIKey treats "".
	if token == "" {
		return "", false, false
	}

	// Check global key first. The comparison is constant-time so the response
	// time does not reveal how many leading bytes of the key were guessed.
	if h.apiKey != "" && subtle.ConstantTimeCompare([]byte(token), []byte(h.apiKey)) == 1 {
		return "", true, true
	}

	// Check per-customer key
	cfg, err := h.store.GetCustomerConfigByAPIKey(token)
	if err != nil || cfg == nil {
		return "", false, false
	}
	return cfg.CustomerID, false, true
}
// ServeHTTP routes API requests.
@@ -70,6 +89,8 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodPost && path == "/report":
h.handleReport(w, r)
case r.Method == http.MethodPost && path == "/event":
h.handleEvent(w, r)
case r.Method == http.MethodPost && path == "/notify":
h.handleNotify(w, r)
case r.Method == http.MethodPost && path == "/infra-backup":
@@ -97,7 +118,8 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}
func (h *Handler) handleReport(w http.ResponseWriter, r *http.Request) {
if !h.checkAuth(r) {
authCustomerID, isGlobal, ok := h.checkAuthCustomer(r)
if !ok {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
return
}
@@ -117,6 +139,12 @@ func (h *Handler) handleReport(w http.ResponseWriter, r *http.Request) {
return
}
// Validate customer_id matches authenticated customer (unless global key)
if !isGlobal && authCustomerID != payload.CustomerID {
http.Error(w, "Forbidden: customer_id mismatch", http.StatusForbidden)
return
}
if err := h.store.SaveReport(payload.CustomerID, body); err != nil {
h.logger.Printf("[ERROR] Failed to save report from %s: %v", payload.CustomerID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
@@ -128,6 +156,114 @@ func (h *Handler) handleReport(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(`{"status":"ok"}`))
}
// allowedEventTypes is the set of event_type values the Hub accepts.
// handleEvent rejects any event whose type is not a key of this map.
var allowedEventTypes = func() map[string]bool {
	groups := [][]string{
		// Controller-pushed events
		{
			"controller_started",
			"controller_updated",
			"backup_completed",
			"backup_failed",
			"db_dump_completed",
			"db_dump_failed",
			"backup_integrity_ok",
			"backup_integrity_failed",
			"crossdrive_completed",
			"crossdrive_failed",
			"storage_disconnected",
			"storage_reconnected",
			"disk_warning",
			"disk_critical",
			"health_degraded",
			"health_critical",
			"health_recovered",
			"app_deployed",
			"app_removed",
			"disaster_recovery_started",
			"disaster_recovery_completed",
		},
		// Hub-generated events
		{
			"node_stale",
			"node_down",
			"node_recovered",
			"expected_backup_missed",
			"expected_dbdump_missed",
		},
		// Special
		{
			"test",
		},
	}

	set := make(map[string]bool)
	for _, names := range groups {
		for _, name := range names {
			set[name] = true
		}
	}
	return set
}()
// handleEvent processes structured events from controllers (new endpoint, replaces /notify for updated controllers).
//
// Flow: authenticate, read at most 1 MiB of body, decode and validate the
// payload, persist the event, then hand it to the notification dispatcher
// (if one is set) in a separate goroutine so the response is not delayed.
//
// Responses:
//   - 401 when the Bearer token is not accepted
//   - 400 on unreadable body, invalid JSON, missing customer_id/event_type,
//     or an event_type not present in allowedEventTypes
//   - 403 when a per-customer key is used for a different customer_id
//   - 500 when the event cannot be stored
//   - 200 with {"ok":true} on success
func (h *Handler) handleEvent(w http.ResponseWriter, r *http.Request) {
	authCustomerID, isGlobal, ok := h.checkAuthCustomer(r)
	if !ok {
		http.Error(w, "Unauthorized", http.StatusUnauthorized)
		return
	}

	// Cap the body at 1 MiB; anything beyond that is cut off and will
	// normally fail JSON decoding below.
	body, err := io.ReadAll(io.LimitReader(r.Body, 1<<20))
	if err != nil {
		http.Error(w, "Bad request", http.StatusBadRequest)
		return
	}

	var payload struct {
		CustomerID string          `json:"customer_id"`
		EventType  string          `json:"event_type"`
		Severity   string          `json:"severity"`
		Message    string          `json:"message"`
		Details    json.RawMessage `json:"details"`
	}
	if err := json.Unmarshal(body, &payload); err != nil {
		http.Error(w, "Invalid JSON", http.StatusBadRequest)
		return
	}
	if payload.CustomerID == "" || payload.EventType == "" {
		http.Error(w, "customer_id and event_type are required", http.StatusBadRequest)
		return
	}

	// Validate customer_id matches authenticated customer (unless global key)
	if !isGlobal && authCustomerID != payload.CustomerID {
		http.Error(w, "Forbidden: customer_id mismatch", http.StatusForbidden)
		return
	}

	// Validate event_type
	if !allowedEventTypes[payload.EventType] {
		http.Error(w, fmt.Sprintf("Invalid event_type: %s", payload.EventType), http.StatusBadRequest)
		return
	}

	// Validate/default severity: anything outside the accepted set is stored as "info".
	// NOTE(review): this also coerces "critical" (a severity that
	// formatNotificationEmail has a label for) down to "info" — confirm that
	// is intended rather than mapping it to "error".
	switch payload.Severity {
	case "info", "warning", "error":
	default:
		payload.Severity = "info"
	}

	// Store details as JSON string. The body was decoded successfully, so a
	// non-empty Details is well-formed JSON; absent or null becomes "{}".
	detailsStr := "{}"
	if len(payload.Details) > 0 && string(payload.Details) != "null" {
		detailsStr = string(payload.Details)
	}

	if _, err := h.store.SaveEvent(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, detailsStr, "controller"); err != nil {
		h.logger.Printf("[ERROR] Failed to save event from %s: %v", payload.CustomerID, err)
		http.Error(w, "Internal error", http.StatusInternalServerError)
		return
	}

	// The message is free-form text supplied by the caller; %q keeps embedded
	// newlines or control characters from forging additional log lines.
	h.logger.Printf("[INFO] Event from %s: %s (%s) — %q", payload.CustomerID, payload.EventType, payload.Severity, payload.Message)

	// Dispatch notifications (non-blocking)
	if h.dispatcher != nil {
		go h.dispatcher.ProcessEvent(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, detailsStr, "controller")
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	// The event is already stored; a failed write only means the client went
	// away, so there is nothing useful left to do with the error.
	_, _ = w.Write([]byte(`{"ok":true}`))
}
func (h *Handler) handleCustomers(w http.ResponseWriter, r *http.Request) {
customers, err := h.store.GetCustomers()
if err != nil {
@@ -258,7 +394,7 @@ func (h *Handler) handleNotify(w http.ResponseWriter, r *http.Request) {
// Check if customer is blocked
if h.store.IsCustomerBlocked(payload.CustomerID) {
h.logger.Printf("[INFO] Notification suppressed for blocked customer %s", payload.CustomerID)
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "customer blocked")
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "customer blocked", "customer")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok","sent":false,"reason":"blocked"}`))
return
@@ -275,7 +411,7 @@ func (h *Handler) handleNotify(w http.ResponseWriter, r *http.Request) {
// Check if customer has email configured and event type is enabled
if prefs == nil || prefs.Email == "" {
h.logger.Printf("[INFO] No email configured for %s, skipping notification", payload.CustomerID)
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "no email configured")
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "no email configured", "customer")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok","sent":false,"reason":"no_email"}`))
return
@@ -291,7 +427,7 @@ func (h *Handler) handleNotify(w http.ResponseWriter, r *http.Request) {
}
if !eventEnabled {
h.logger.Printf("[INFO] Event %s not enabled for %s, skipping", payload.EventType, payload.CustomerID)
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "event not enabled")
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "event not enabled", "customer")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok","sent":false,"reason":"event_disabled"}`))
return
@@ -300,7 +436,7 @@ func (h *Handler) handleNotify(w http.ResponseWriter, r *http.Request) {
// Send email via Resend API
if h.resendAPIKey == "" {
h.logger.Printf("[WARN] Resend API key not configured, cannot send notification email")
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "resend api key not configured")
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "skipped", "resend api key not configured", "customer")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok","sent":false,"reason":"no_api_key"}`))
return
@@ -310,13 +446,13 @@ func (h *Handler) handleNotify(w http.ResponseWriter, r *http.Request) {
sendErr := h.sendResendEmail(prefs.Email, subject, emailBody)
if sendErr != nil {
h.logger.Printf("[ERROR] Failed to send notification email to %s: %v", prefs.Email, sendErr)
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "failed", sendErr.Error())
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "failed", sendErr.Error(), "customer")
http.Error(w, "Failed to send email", http.StatusInternalServerError)
return
}
h.logger.Printf("[INFO] Notification email sent to %s for %s/%s", prefs.Email, payload.CustomerID, payload.EventType)
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "sent", "")
h.store.LogNotification(payload.CustomerID, payload.EventType, payload.Severity, payload.Message, "sent", "", "customer")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`{"status":"ok","sent":true}`))
@@ -339,13 +475,14 @@ func (h *Handler) handleSavePreferences(w http.ResponseWriter, r *http.Request)
CustomerID string `json:"customer_id"`
Email string `json:"email"`
EnabledEvents []string `json:"enabled_events"`
CooldownHours int `json:"cooldown_hours"`
}
if err := json.Unmarshal(body, &payload); err != nil || payload.CustomerID == "" {
http.Error(w, "Invalid payload: customer_id required", http.StatusBadRequest)
return
}
if err := h.store.SaveNotificationPrefs(payload.CustomerID, payload.Email, payload.EnabledEvents); err != nil {
if err := h.store.SaveNotificationPrefs(payload.CustomerID, payload.Email, payload.EnabledEvents, payload.CooldownHours); err != nil {
h.logger.Printf("[ERROR] Failed to save notification prefs for %s: %v", payload.CustomerID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
@@ -503,6 +640,7 @@ func formatNotificationEmail(customerID, eventType, severity, message, details s
severityLabel := map[string]string{
"info": "Információ",
"warning": "Figyelmeztetés",
"error": "Hiba",
"critical": "Kritikus",
}
label := severityLabel[severity]