feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)

Replace external Healthchecks.io with Hub-native monitoring. New events
table + /api/v1/event endpoint for structured events from controllers.
Staleness checker (60s) detects unresponsive nodes. Backup deadline
checker (daily 05:00) catches missed backups. Notification dispatcher
sends operator (English) + customer (Hungarian) emails via Resend with
per-event cooldowns. Event timeline on customer page, dashboard badges.
Config form deprecates Monitoring UUIDs section.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-20 18:53:24 +01:00
parent b4cb92e09f
commit 3217cb4751
16 changed files with 1319 additions and 64 deletions
+201
View File
@@ -0,0 +1,201 @@
package notify
import (
"bytes"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"sync"
"time"
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
)
// Dispatcher routes events to operator and/or customer email channels.
// Cooldowns are in-memory (lost on restart, acceptable).
type Dispatcher struct {
store *store.Store
resendAPIKey string
fromEmail string
operatorEmail string
operatorOn bool
httpClient *http.Client
logger *log.Logger
mu sync.Mutex
opCooldowns map[string]time.Time // "customerID:eventType" → last operator notify
custCooldowns map[string]time.Time // "customerID:eventType" → last customer notify
}
// NewDispatcher creates a new notification dispatcher.
func NewDispatcher(s *store.Store, resendAPIKey, fromEmail, operatorEmail string, operatorOn bool, logger *log.Logger) *Dispatcher {
return &Dispatcher{
store: s,
resendAPIKey: resendAPIKey,
fromEmail: fromEmail,
operatorEmail: operatorEmail,
operatorOn: operatorOn,
httpClient: &http.Client{Timeout: 10 * time.Second},
logger: logger,
opCooldowns: make(map[string]time.Time),
custCooldowns: make(map[string]time.Time),
}
}
// ProcessEvent evaluates an event and sends notifications as appropriate.
// Safe to call from goroutines.
func (d *Dispatcher) ProcessEvent(customerID, eventType, severity, message, detailsJSON, source string) {
if d.resendAPIKey == "" {
return
}
// "test" bypass — send directly to customer email, skip prefs/cooldown
if eventType == "test" {
d.sendTestEmail(customerID)
return
}
// Only warning and error severity trigger notifications
if severity != "warning" && severity != "error" {
return
}
// Operator channel
d.processOperator(customerID, eventType, severity, message, detailsJSON, source)
// Customer channel
d.processCustomer(customerID, eventType, severity, message, detailsJSON, source)
}
func (d *Dispatcher) sendTestEmail(customerID string) {
prefs, err := d.store.GetNotificationPrefs(customerID)
if err != nil || prefs.Email == "" {
d.logger.Printf("[WARN] Test email: no email configured for %s", customerID)
return
}
subject := "[Felhom] Teszt értesítés"
body := "Kedves Ügyfél!\n\nEz egy teszt értesítés a Felhom monitoring rendszerből.\nAz értesítések megfelelően működnek.\n\nÜdvözlettel,\nFelhom.eu monitoring"
if err := d.sendEmail(prefs.Email, subject, body); err != nil {
d.logger.Printf("[ERROR] Test email to %s failed: %v", prefs.Email, err)
d.store.LogNotification(customerID, "test", "info", "Teszt értesítés", "failed", err.Error(), "customer")
return
}
d.logger.Printf("[INFO] Test email sent to %s for %s", prefs.Email, customerID)
d.store.LogNotification(customerID, "test", "info", "Teszt értesítés", "sent", "", "customer")
}
func (d *Dispatcher) processOperator(customerID, eventType, severity, message, detailsJSON, source string) {
if !d.operatorOn || d.operatorEmail == "" {
return
}
cooldownKey := customerID + ":" + eventType
d.mu.Lock()
if last, ok := d.opCooldowns[cooldownKey]; ok && time.Since(last) < 1*time.Hour {
d.mu.Unlock()
return
}
d.opCooldowns[cooldownKey] = time.Now()
d.mu.Unlock()
subject, body := FormatOperatorEmail(customerID, eventType, severity, message, detailsJSON)
if err := d.sendEmail(d.operatorEmail, subject, body); err != nil {
d.logger.Printf("[ERROR] Operator email failed for %s/%s: %v", customerID, eventType, err)
d.store.LogNotification(customerID, eventType, severity, message, "failed", err.Error(), "operator")
return
}
d.logger.Printf("[INFO] Operator email sent for %s/%s", customerID, eventType)
d.store.LogNotification(customerID, eventType, severity, message, "sent", "", "operator")
}
func (d *Dispatcher) processCustomer(customerID, eventType, severity, message, detailsJSON, source string) {
// Check if customer is blocked
if d.store.IsCustomerBlocked(customerID) {
return
}
// Load preferences
prefs, err := d.store.GetNotificationPrefs(customerID)
if err != nil || prefs.Email == "" {
return
}
// Check if event type is enabled
if !isEventEnabled(prefs.EnabledEvents, eventType) {
return
}
// Customer cooldown (from prefs, default 6h)
cooldownHours := prefs.CooldownHours
if cooldownHours <= 0 {
cooldownHours = 6
}
cooldownDur := time.Duration(cooldownHours) * time.Hour
cooldownKey := customerID + ":" + eventType
d.mu.Lock()
if last, ok := d.custCooldowns[cooldownKey]; ok && time.Since(last) < cooldownDur {
d.mu.Unlock()
return
}
d.custCooldowns[cooldownKey] = time.Now()
d.mu.Unlock()
subject, body := FormatCustomerEmail(customerID, eventType, severity, message, detailsJSON)
if err := d.sendEmail(prefs.Email, subject, body); err != nil {
d.logger.Printf("[ERROR] Customer email failed for %s/%s: %v", customerID, eventType, err)
d.store.LogNotification(customerID, eventType, severity, message, "failed", err.Error(), "customer")
return
}
d.logger.Printf("[INFO] Customer email sent to %s for %s/%s", prefs.Email, customerID, eventType)
d.store.LogNotification(customerID, eventType, severity, message, "sent", "", "customer")
}
func (d *Dispatcher) sendEmail(to, subject, textBody string) error {
payload := map[string]interface{}{
"from": d.fromEmail,
"to": []string{to},
"subject": subject,
"text": textBody,
}
jsonData, err := json.Marshal(payload)
if err != nil {
return fmt.Errorf("marshaling email payload: %w", err)
}
req, err := http.NewRequest("POST", "https://api.resend.com/emails", bytes.NewReader(jsonData))
if err != nil {
return fmt.Errorf("creating request: %w", err)
}
req.Header.Set("Authorization", "Bearer "+d.resendAPIKey)
req.Header.Set("Content-Type", "application/json")
resp, err := d.httpClient.Do(req)
if err != nil {
return fmt.Errorf("sending request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
return fmt.Errorf("resend API returned %d: %s", resp.StatusCode, string(respBody))
}
return nil
}
func isEventEnabled(enabledEvents []string, eventType string) bool {
for _, e := range enabledEvents {
if e == eventType {
return true
}
}
return false
}
+153
View File
@@ -0,0 +1,153 @@
package notify
import (
"fmt"
"time"
)
// budapest timezone for formatting.
var budapest *time.Location
func init() {
var err error
budapest, err = time.LoadLocation("Europe/Budapest")
if err != nil {
budapest = time.FixedZone("CET", 3600)
}
}
// ──────────────────────────────────────────────────────────────────────
// Operator email — concise, English
// ──────────────────────────────────────────────────────────────────────
// FormatOperatorEmail returns (subject, textBody) for the operator channel.
func FormatOperatorEmail(customerID, eventType, severity, message, detailsJSON string) (string, string) {
icon := "⚠️"
if severity == "error" {
icon = "🔴"
}
subject := fmt.Sprintf("[Felhom] %s %s: %s", icon, customerID, eventType)
now := time.Now().In(budapest).Format("2006-01-02 15:04 MST")
body := fmt.Sprintf(`Customer: %s
Event: %s
Severity: %s
Time: %s
Message: %s`, customerID, eventType, severity, now, message)
if detailsJSON != "" && detailsJSON != "{}" {
body += fmt.Sprintf("\nDetails: %s", detailsJSON)
}
body += fmt.Sprintf("\n\nDashboard: https://hub.felhom.eu/customers/%s", customerID)
return subject, body
}
// ──────────────────────────────────────────────────────────────────────
// Customer email — Hungarian, friendly
// ──────────────────────────────────────────────────────────────────────
// customerMessages maps event_type → Hungarian customer message.
var customerMessages = map[string]string{
// Backup events
"backup_completed": "A biztonsági mentés sikeresen elkészült.",
"backup_failed": "A biztonsági mentés sikertelen! Kérjük, ellenőrizd a rendszert.",
"db_dump_completed": "Az adatbázis mentés sikeresen elkészült.",
"db_dump_failed": "Az adatbázis mentés sikertelen!",
"backup_integrity_ok": "A mentés integritás ellenőrzés sikeres.",
"backup_integrity_failed": "A mentés integritás ellenőrzés hibát talált!",
"crossdrive_completed": "A másodlagos mentés sikeresen elkészült.",
"crossdrive_failed": "A másodlagos mentés sikertelen!",
// Disk events
"disk_warning": "A lemezterület 90% felett van — kérjük, szabadíts fel helyet.",
"disk_critical": "A lemezterület kritikusan magas (95%+) — azonnali beavatkozás szükséges!",
// Storage events
"storage_disconnected": "Egy meghajtó leválasztva — a mentések szünetelhetnek.",
"storage_reconnected": "A meghajtó újra csatlakoztatva.",
// Staleness events (Hub-generated)
"node_stale": "A szerver nem küldött jelentést az elmúlt időszakban.",
"node_down": "A szerver nem elérhető!",
"node_recovered": "A szerver újra elérhető.",
// Health events
"health_degraded": "A rendszer állapota romlott.",
"health_critical": "A rendszer állapota kritikus!",
"health_recovered": "A rendszer állapota helyreállt.",
// Controller events
"controller_started": "A vezérlő elindult.",
"controller_updated": "A vezérlő frissítve lett.",
// Deadline events (Hub-generated)
"expected_backup_missed": "A mai biztonsági mentés nem készült el a határidőig!",
"expected_dbdump_missed": "A mai adatbázis mentés nem készült el a határidőig!",
// App lifecycle events
"app_deployed": "Alkalmazás telepítve.",
"app_removed": "Alkalmazás eltávolítva.",
// Disaster recovery events
"disaster_recovery_started": "Katasztrófa helyreállítás elindítva.",
"disaster_recovery_completed": "Katasztrófa helyreállítás befejezve.",
// Test
"test": "Ez egy teszt értesítés.",
}
// severityLabels maps severity to Hungarian labels.
var severityLabels = map[string]string{
"info": "Információ",
"warning": "Figyelmeztetés",
"error": "Hiba",
}
// FormatCustomerEmail returns (subject, textBody) for the customer channel.
func FormatCustomerEmail(customerID, eventType, severity, message, detailsJSON string) (string, string) {
label := severityLabels[severity]
if label == "" {
label = severity
}
// Use the per-event-type Hungarian message if available, otherwise fall back to message
hunMessage := customerMessages[eventType]
if hunMessage == "" {
hunMessage = message
}
subject := fmt.Sprintf("[Felhom] %s: %s", label, hunMessage)
now := time.Now().In(budapest).Format("2006-01-02 15:04")
body := fmt.Sprintf(`Kedves Ügyfél!
A Felhom rendszered a következő értesítést küldte:
%s
Részletek:
- Szerver: %s
- Időpont: %s
- Szint: %s
- Típus: %s`, hunMessage, customerID, now, label, eventType)
if message != "" && message != hunMessage {
body += fmt.Sprintf("\n- Üzenet: %s", message)
}
if detailsJSON != "" && detailsJSON != "{}" {
body += fmt.Sprintf("\n- Megjegyzés: %s", detailsJSON)
}
body += `
Ha kérdésed van, vedd fel a kapcsolatot az üzemeltetővel.
Üdvözlettel,
Felhom.eu monitoring`
return subject, body
}