Phase 2: monitoring warnings, dashboard alerts & notification system
- Monitoring page: "Távoli monitoring" section showing healthcheck ping UUID configuration status (configured/not configured) for each of the 5 pings - Alert manager: persistent dashboard banners on all pages generated from health check results, missing pings, and backup status - Notification system: controller-side notifier sends events to hub relay, with cooldown tracking and event-type filtering - Notification preferences UI: email, event checkboxes, cooldown settings on the settings page with test email functionality - Settings refactored: shared settingsData() helper, NotificationPrefs struct with getter/setter and defaults New files: - controller/internal/web/alerts.go (AlertManager) - controller/internal/notify/notifier.go (hub notification client) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,272 @@
|
||||
package notify
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
|
||||
)
|
||||
|
||||
// Notifier sends notification events to the hub relay service.
|
||||
// Non-blocking: fires requests in goroutines, logs errors but doesn't retry aggressively.
|
||||
type Notifier struct {
|
||||
hubURL string
|
||||
apiKey string
|
||||
customerID string
|
||||
httpClient *http.Client
|
||||
logger *log.Logger
|
||||
enabled bool
|
||||
settings *settings.Settings
|
||||
|
||||
mu sync.Mutex
|
||||
cooldowns map[string]time.Time // event_type -> last notification time
|
||||
|
||||
// prevHealthStatus tracks the previous health check status for change detection
|
||||
prevHealthStatus string
|
||||
}
|
||||
|
||||
// New creates a new Notifier. Returns a no-op notifier if hub is not enabled.
|
||||
func New(hubURL, apiKey, customerID string, sett *settings.Settings, logger *log.Logger) *Notifier {
|
||||
enabled := hubURL != "" && apiKey != ""
|
||||
if enabled {
|
||||
logger.Printf("[INFO] Notifier enabled (hub: %s)", hubURL)
|
||||
} else {
|
||||
logger.Printf("[INFO] Notifier disabled (hub not configured)")
|
||||
}
|
||||
|
||||
return &Notifier{
|
||||
hubURL: hubURL,
|
||||
apiKey: apiKey,
|
||||
customerID: customerID,
|
||||
httpClient: &http.Client{Timeout: 10 * time.Second},
|
||||
logger: logger,
|
||||
enabled: enabled,
|
||||
settings: sett,
|
||||
cooldowns: make(map[string]time.Time),
|
||||
}
|
||||
}
|
||||
|
||||
// notifyRequest is the JSON payload sent to the hub.
|
||||
type notifyRequest struct {
|
||||
CustomerID string `json:"customer_id"`
|
||||
EventType string `json:"event_type"`
|
||||
Severity string `json:"severity"`
|
||||
Message string `json:"message"`
|
||||
Details string `json:"details,omitempty"`
|
||||
}
|
||||
|
||||
// Notify sends a notification event to the hub relay.
|
||||
// Checks local cooldown and event preferences before sending.
|
||||
// Non-blocking: fires the HTTP request in a goroutine.
|
||||
func (n *Notifier) Notify(eventType, severity, message, details string) {
|
||||
if !n.enabled {
|
||||
return
|
||||
}
|
||||
|
||||
prefs := n.settings.GetNotificationPrefs()
|
||||
if prefs.Email == "" {
|
||||
return // No email configured, skip
|
||||
}
|
||||
|
||||
// Check if event is enabled in preferences
|
||||
eventEnabled := false
|
||||
for _, e := range prefs.EnabledEvents {
|
||||
if e == eventType {
|
||||
eventEnabled = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !eventEnabled {
|
||||
return
|
||||
}
|
||||
|
||||
// Check cooldown
|
||||
cooldownDuration := time.Duration(prefs.CooldownHours) * time.Hour
|
||||
if cooldownDuration == 0 {
|
||||
cooldownDuration = 6 * time.Hour
|
||||
}
|
||||
|
||||
n.mu.Lock()
|
||||
lastSent, exists := n.cooldowns[eventType]
|
||||
if exists && time.Since(lastSent) < cooldownDuration {
|
||||
n.mu.Unlock()
|
||||
n.logger.Printf("[DEBUG] Notification cooldown active for %s (sent %s ago)", eventType, time.Since(lastSent).Round(time.Minute))
|
||||
return
|
||||
}
|
||||
n.cooldowns[eventType] = time.Now()
|
||||
n.mu.Unlock()
|
||||
|
||||
// Fire the notification in a goroutine (non-blocking)
|
||||
go func() {
|
||||
payload := notifyRequest{
|
||||
CustomerID: n.customerID,
|
||||
EventType: eventType,
|
||||
Severity: severity,
|
||||
Message: message,
|
||||
Details: details,
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
n.logger.Printf("[ERROR] Failed to marshal notification: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
url := n.hubURL + "/api/v1/notify"
|
||||
req, err := http.NewRequest("POST", url, bytes.NewReader(jsonData))
|
||||
if err != nil {
|
||||
n.logger.Printf("[ERROR] Failed to create notification request: %v", err)
|
||||
return
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+n.apiKey)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := n.httpClient.Do(req)
|
||||
if err != nil {
|
||||
n.logger.Printf("[WARN] Failed to send notification to hub: %v", err)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
n.logger.Printf("[WARN] Hub notification returned %d for %s/%s", resp.StatusCode, eventType, severity)
|
||||
return
|
||||
}
|
||||
|
||||
n.logger.Printf("[INFO] Notification sent: %s (%s) — %s", eventType, severity, message)
|
||||
}()
|
||||
}
|
||||
|
||||
// NotifyHealthChange checks if health status changed and sends appropriate notifications.
|
||||
// Call this after each health check with the new report status/issues/warnings.
|
||||
func (n *Notifier) NotifyHealthChange(status string, issues, warnings []string) {
|
||||
if !n.enabled {
|
||||
return
|
||||
}
|
||||
|
||||
prev := n.prevHealthStatus
|
||||
n.prevHealthStatus = status
|
||||
|
||||
// Only notify on status degradation (ok→warn, ok→fail, warn→fail)
|
||||
if prev == "" {
|
||||
return // First run, just record status
|
||||
}
|
||||
if statusRank(status) <= statusRank(prev) {
|
||||
return // Status improved or stayed the same
|
||||
}
|
||||
|
||||
// Notify about each issue/warning
|
||||
for _, issue := range issues {
|
||||
n.Notify("container_unhealthy", "critical", issue, "")
|
||||
}
|
||||
for _, w := range warnings {
|
||||
// Determine specific event type from warning message
|
||||
eventType := classifyWarning(w)
|
||||
n.Notify(eventType, "warning", w, "")
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyBackupFailed sends a notification about a backup failure.
|
||||
func (n *Notifier) NotifyBackupFailed(message, details string) {
|
||||
n.Notify("backup_failed", "critical", message, details)
|
||||
}
|
||||
|
||||
// NotifyDBDumpFailed sends a notification about a database dump failure.
|
||||
func (n *Notifier) NotifyDBDumpFailed(message, details string) {
|
||||
n.Notify("db_dump_failed", "critical", message, details)
|
||||
}
|
||||
|
||||
// NotifyIntegrityFailed sends a notification about a backup integrity check failure.
|
||||
func (n *Notifier) NotifyIntegrityFailed(message, details string) {
|
||||
n.Notify("integrity_failed", "warning", message, details)
|
||||
}
|
||||
|
||||
// SendTest sends a test notification for verifying the notification flow.
|
||||
func (n *Notifier) SendTest() error {
|
||||
if !n.enabled {
|
||||
return fmt.Errorf("notifications not enabled (hub not configured)")
|
||||
}
|
||||
|
||||
payload := notifyRequest{
|
||||
CustomerID: n.customerID,
|
||||
EventType: "test",
|
||||
Severity: "info",
|
||||
Message: "Teszt értesítés a Felhom rendszerből",
|
||||
Details: "Ha ezt az emailt megkapta, az értesítések megfelelően működnek.",
|
||||
}
|
||||
|
||||
jsonData, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal: %w", err)
|
||||
}
|
||||
|
||||
url := n.hubURL + "/api/v1/notify"
|
||||
req, err := http.NewRequest("POST", url, bytes.NewReader(jsonData))
|
||||
if err != nil {
|
||||
return fmt.Errorf("request: %w", err)
|
||||
}
|
||||
req.Header.Set("Authorization", "Bearer "+n.apiKey)
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := n.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("send: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
return fmt.Errorf("hub returned %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func statusRank(status string) int {
|
||||
switch status {
|
||||
case "ok":
|
||||
return 0
|
||||
case "warn":
|
||||
return 1
|
||||
case "fail":
|
||||
return 2
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func classifyWarning(message string) string {
|
||||
// Try to classify the warning message into a specific event type
|
||||
switch {
|
||||
case contains(message, "disk") || contains(message, "Disk") || contains(message, "SSD") || contains(message, "HDD"):
|
||||
if contains(message, "critical") || contains(message, "Critical") {
|
||||
return "disk_critical"
|
||||
}
|
||||
return "disk_warning"
|
||||
case contains(message, "Memory") || contains(message, "memory"):
|
||||
return "disk_warning" // group memory under system warnings
|
||||
case contains(message, "Temperature") || contains(message, "temperature"):
|
||||
return "disk_warning" // group temp under system warnings
|
||||
case contains(message, "container") || contains(message, "Container"):
|
||||
return "container_unhealthy"
|
||||
default:
|
||||
return "disk_warning" // fallback to generic system warning
|
||||
}
|
||||
}
|
||||
|
||||
func contains(s, substr string) bool {
|
||||
return len(s) >= len(substr) && (s == substr || len(s) > 0 && containsBytes(s, substr))
|
||||
}
|
||||
|
||||
func containsBytes(s, substr string) bool {
|
||||
for i := 0; i <= len(s)-len(substr); i++ {
|
||||
if s[i:i+len(substr)] == substr {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
Reference in New Issue
Block a user