feat: Hub monitoring takeover — event push system + config cleanup (v0.21.0)
Replace external Healthchecks.io with Hub-native event system. Controller now pushes structured events via POST /api/v1/event with typed detail structs. Hub handles dead man's switch, notification dispatch, and cooldowns. Phase 5: PushEvent() core method, 21 event types, expanded notification settings (11 toggles), Hub connection monitoring on dashboard, alerts. Phase 6: Deprecation log for ping UUIDs, pinger kept for transition. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
@@ -27,9 +28,10 @@ type Alert struct {
|
||||
// Alerts are state-based (not event-based) — they reflect current system state
|
||||
// and are regenerated after each health check cycle.
|
||||
type AlertManager struct {
|
||||
mu sync.RWMutex
|
||||
alerts []Alert
|
||||
logger *log.Logger
|
||||
mu sync.RWMutex
|
||||
alerts []Alert
|
||||
logger *log.Logger
|
||||
hubPushStatusFn func() HubPushStatusData
|
||||
}
|
||||
|
||||
// NewAlertManager creates a new AlertManager.
|
||||
@@ -39,6 +41,13 @@ func NewAlertManager(logger *log.Logger) *AlertManager {
|
||||
}
|
||||
}
|
||||
|
||||
// SetHubPushStatus sets the hub push status callback for generating hub alerts.
|
||||
func (am *AlertManager) SetHubPushStatus(fn func() HubPushStatusData) {
|
||||
am.mu.Lock()
|
||||
am.hubPushStatusFn = fn
|
||||
am.mu.Unlock()
|
||||
}
|
||||
|
||||
// Refresh regenerates alerts from the latest health check report and config state.
|
||||
// Called after each health check cycle (every 5 minutes) and on storage state changes.
|
||||
func (am *AlertManager) Refresh(report *monitor.HealthReport, cfg *config.Config, backupMgr *backup.Manager, updateAvailable bool, latestVersion string, storagePaths ...[]settings.StoragePath) {
|
||||
@@ -92,14 +101,22 @@ func (am *AlertManager) Refresh(report *monitor.HealthReport, cfg *config.Config
|
||||
alerts = append(alerts, alert)
|
||||
}
|
||||
|
||||
// Missing ping UUIDs
|
||||
if cfg.Monitoring.Enabled {
|
||||
missing := countMissingPings(cfg)
|
||||
if missing > 0 {
|
||||
// Hub connection status
|
||||
if !cfg.Hub.Enabled || cfg.Hub.URL == "" {
|
||||
alerts = append(alerts, Alert{
|
||||
ID: "hub-disabled",
|
||||
Level: "warning",
|
||||
Message: "Hub kapcsolat kikapcsolva — a központi monitoring nem aktív",
|
||||
Link: "/monitoring",
|
||||
LinkText: "Rendszermonitor",
|
||||
})
|
||||
} else if am.hubPushStatusFn != nil {
|
||||
ps := am.hubPushStatusFn()
|
||||
if ps.LastError != "" && (ps.LastSuccess.IsZero() || time.Since(ps.LastSuccess) > 30*time.Minute) {
|
||||
alerts = append(alerts, Alert{
|
||||
ID: "pings-missing",
|
||||
Level: "warning",
|
||||
Message: fmt.Sprintf("%d monitoring ellenőrzés nincs beállítva", missing),
|
||||
ID: "hub-unreachable",
|
||||
Level: "error",
|
||||
Message: fmt.Sprintf("Hub nem elérhető — utolsó hiba: %s", ps.LastError),
|
||||
Link: "/monitoring",
|
||||
LinkText: "Rendszermonitor",
|
||||
})
|
||||
@@ -200,24 +217,6 @@ func (am *AlertManager) GetInlineAlerts(page string) []Alert {
|
||||
return result
|
||||
}
|
||||
|
||||
// countMissingPings counts how many ping UUIDs are not configured.
|
||||
func countMissingPings(cfg *config.Config) int {
|
||||
count := 0
|
||||
uuids := []string{
|
||||
cfg.Monitoring.PingUUIDs.Heartbeat,
|
||||
cfg.Monitoring.PingUUIDs.SystemHealth,
|
||||
cfg.Monitoring.PingUUIDs.DBDump,
|
||||
cfg.Monitoring.PingUUIDs.Backup,
|
||||
cfg.Monitoring.PingUUIDs.BackupIntegrity,
|
||||
}
|
||||
for _, uuid := range uuids {
|
||||
if !isPingConfigured(uuid) {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// simpleHash returns a short deterministic hash for deduplication.
|
||||
func simpleHash(s string) string {
|
||||
h := uint32(0)
|
||||
|
||||
@@ -110,6 +110,18 @@ func (s *Server) executeAllRestores() {
|
||||
return
|
||||
}
|
||||
|
||||
// Count pending apps and push DR start event
|
||||
pendingCount := 0
|
||||
for _, app := range plan.Apps {
|
||||
if app.Status == "pending" {
|
||||
pendingCount++
|
||||
}
|
||||
}
|
||||
if s.notifier != nil {
|
||||
s.notifier.NotifyDRStarted(pendingCount)
|
||||
}
|
||||
|
||||
successCount, failCount := 0, 0
|
||||
for i := range plan.Apps {
|
||||
app := &plan.Apps[i]
|
||||
if app.Status != "pending" {
|
||||
@@ -126,15 +138,22 @@ func (s *Server) executeAllRestores() {
|
||||
if err != nil {
|
||||
plan.UpdateApp(app.Name, "failed", err.Error())
|
||||
s.logger.Printf("[ERROR] Restore failed for %s: %v", app.Name, err)
|
||||
failCount++
|
||||
} else {
|
||||
plan.UpdateApp(app.Name, "done", "")
|
||||
s.logger.Printf("[INFO] Restore completed for %s", app.Name)
|
||||
successCount++
|
||||
}
|
||||
}
|
||||
|
||||
plan.SetStatus("done")
|
||||
s.logger.Println("[INFO] All app restores completed")
|
||||
|
||||
// Push DR completion event
|
||||
if s.notifier != nil {
|
||||
s.notifier.NotifyDRCompleted(successCount, failCount)
|
||||
}
|
||||
|
||||
// Re-scan stacks so dashboard picks up restored apps
|
||||
if s.stackMgr != nil {
|
||||
if err := s.stackMgr.ScanStacks(); err != nil {
|
||||
|
||||
@@ -411,21 +411,36 @@ func (s *Server) monitoringHandler(w http.ResponseWriter, _ *http.Request) {
|
||||
data["SystemInfo"] = system.GetInfo(s.primaryHDDPath(), s.cpuCollector)
|
||||
data["StorageBars"] = s.buildStorageBars()
|
||||
|
||||
// On monitoring page, exclude the "pings-missing" alert since the detailed table is visible
|
||||
if s.alertManager != nil {
|
||||
data["Alerts"] = s.alertManager.GetAlerts("pings-missing")
|
||||
data["Alerts"] = s.alertManager.GetAlerts()
|
||||
data["DiskWarnings"] = s.alertManager.GetInlineAlerts("monitoring")
|
||||
}
|
||||
|
||||
// Ping status section
|
||||
// Hub connection status section
|
||||
data["HubEnabled"] = s.cfg.Hub.Enabled && s.cfg.Hub.URL != ""
|
||||
data["HubURL"] = s.cfg.Hub.URL
|
||||
data["CustomerID"] = s.cfg.Customer.ID
|
||||
|
||||
if s.hubPushStatusFn != nil {
|
||||
ps := s.hubPushStatusFn()
|
||||
data["HubLastAttempt"] = ps.LastAttempt
|
||||
data["HubLastSuccess"] = ps.LastSuccess
|
||||
data["HubLastError"] = ps.LastError
|
||||
data["HubConsecutiveFailures"] = ps.Consecutive
|
||||
// Connected if the last successful push happened within the last 30 minutes (fixed window, not derived from the push interval)
|
||||
connected := !ps.LastSuccess.IsZero() && time.Since(ps.LastSuccess) < 30*time.Minute
|
||||
data["HubConnected"] = connected
|
||||
}
|
||||
|
||||
// Legacy ping status section (still shown for backward compat during transition)
|
||||
data["MonitoringEnabled"] = s.cfg.Monitoring.Enabled
|
||||
if s.cfg.Monitoring.Enabled {
|
||||
pings := []map[string]interface{}{
|
||||
{"Label": "Életjel (Heartbeat)", "Icon": "💓", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.Heartbeat), "Schedule": "5 percenként"},
|
||||
{"Label": "Rendszer állapot", "Icon": "🖥️", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.SystemHealth), "Schedule": "5 percenként"},
|
||||
{"Label": "Adatbázis mentés", "Icon": "🗄️", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.DBDump), "Schedule": "Naponta " + s.cfg.Backup.DBDumpSchedule},
|
||||
{"Label": "Biztonsági mentés", "Icon": "💾", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.Backup), "Schedule": "Naponta " + s.cfg.Backup.ResticSchedule},
|
||||
{"Label": "Mentés integritás", "Icon": "🔍", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.BackupIntegrity), "Schedule": "Hetente (vasárnap)"},
|
||||
{"Label": "Eletjel (Heartbeat)", "Icon": "heartbeat", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.Heartbeat), "Schedule": "5 percenkent"},
|
||||
{"Label": "Rendszer allapot", "Icon": "system", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.SystemHealth), "Schedule": "5 percenkent"},
|
||||
{"Label": "Adatbazis mentes", "Icon": "db", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.DBDump), "Schedule": "Naponta " + s.cfg.Backup.DBDumpSchedule},
|
||||
{"Label": "Biztonsagi mentes", "Icon": "backup", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.Backup), "Schedule": "Naponta " + s.cfg.Backup.ResticSchedule},
|
||||
{"Label": "Mentes integritas", "Icon": "integrity", "Configured": isPingConfigured(s.cfg.Monitoring.PingUUIDs.BackupIntegrity), "Schedule": "Hetente (vasarnap)"},
|
||||
}
|
||||
allConfigured := true
|
||||
for _, p := range pings {
|
||||
@@ -1076,11 +1091,24 @@ func (s *Server) settingsNotificationsHandler(w http.ResponseWriter, r *http.Req
|
||||
|
||||
// Collect enabled events from checkboxes
|
||||
var enabledEvents []string
|
||||
for _, evt := range []string{"disk_warning", "backup_failed", "update_available", "security_update"} {
|
||||
// Single-event checkboxes
|
||||
for _, evt := range []string{
|
||||
"backup_failed", "db_dump_failed", "backup_integrity_failed",
|
||||
"crossdrive_failed", "storage_disconnected",
|
||||
"node_down", "health_critical",
|
||||
"storage_reconnected", "health_recovered",
|
||||
} {
|
||||
if r.FormValue("event_"+evt) == "on" {
|
||||
enabledEvents = append(enabledEvents, evt)
|
||||
}
|
||||
}
|
||||
// Compound toggles: one checkbox → two event types
|
||||
if r.FormValue("event_disk_alerts") == "on" {
|
||||
enabledEvents = append(enabledEvents, "disk_warning", "disk_critical")
|
||||
}
|
||||
if r.FormValue("event_expected_missed") == "on" {
|
||||
enabledEvents = append(enabledEvents, "expected_backup_missed", "expected_dbdump_missed")
|
||||
}
|
||||
|
||||
prefs := &settings.NotificationPrefs{
|
||||
Email: email,
|
||||
@@ -1101,7 +1129,7 @@ func (s *Server) settingsNotificationsHandler(w http.ResponseWriter, r *http.Req
|
||||
// Sync preferences to hub
|
||||
data := s.settingsData()
|
||||
if s.notifier != nil && s.notifier.IsEnabled() {
|
||||
if err := s.notifier.SyncPreferences(email, enabledEvents); err != nil {
|
||||
if err := s.notifier.SyncPreferences(email, enabledEvents, cooldownHours); err != nil {
|
||||
s.logger.Printf("[WARN] Failed to sync preferences to hub: %v", err)
|
||||
data["NotificationSuccess"] = fmt.Sprintf("Értesítési beállítások mentve (helyi). A központi szinkronizálás sikertelen: %v", err)
|
||||
} else {
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
@@ -57,6 +58,9 @@ type Server struct {
|
||||
|
||||
// Storage watchdog (set after construction to break init ordering)
|
||||
storageWatchdog *monitor.StorageWatchdog
|
||||
|
||||
// Hub push status callback — set via SetHubPushStatus for monitoring page
|
||||
hubPushStatusFn func() HubPushStatusData
|
||||
}
|
||||
|
||||
func NewServer(cfg *config.Config, stackMgr *stacks.Manager, cpuCollector *system.CPUCollector, backupMgr *backup.Manager, crossDrive *backup.CrossDriveRunner, sched *scheduler.Scheduler, sett *settings.Settings, alertMgr *AlertManager, notif *notify.Notifier, updater *selfupdate.Updater, logger *log.Logger, version string) *Server {
|
||||
@@ -117,6 +121,19 @@ func (s *Server) SetDriveMigrator(dm *storage.DriveMigrator) {
|
||||
s.driveMigrator = dm
|
||||
}
|
||||
|
||||
// HubPushStatusData is a snapshot of the hub push state. It is returned by the
// hub push status callbacks and rendered on the monitoring page.
type HubPushStatusData struct {
	// LastAttempt and LastSuccess are exposed to the monitoring template as
	// HubLastAttempt and HubLastSuccess. A zero LastSuccess is treated as
	// "no successful push yet" by the connection checks.
	LastAttempt, LastSuccess time.Time

	// LastError is the text shown as the last hub error; an empty string
	// means there is no error to show.
	LastError string

	// Consecutive is exposed to the monitoring template as
	// HubConsecutiveFailures.
	Consecutive int
}
|
||||
|
||||
// SetHubPushStatus stores the callback that the monitoring page handler calls
// to obtain the current hub push status. The field is assigned without any
// locking.
// NOTE(review): presumably this is called once during startup, before the
// server begins handling requests — confirm.
func (s *Server) SetHubPushStatus(fn func() HubPushStatusData) {
	s.hubPushStatusFn = fn
}
|
||||
|
||||
// InRestoreMode returns true if the server is in DR restore mode.
|
||||
func (s *Server) InRestoreMode() bool {
|
||||
s.restoreMu.RLock()
|
||||
|
||||
@@ -86,33 +86,44 @@
|
||||
{{end}}
|
||||
</div>
|
||||
|
||||
<!-- Section 2: Remote Monitoring Status -->
|
||||
<!-- Section 2: Hub Connection Status -->
|
||||
<div class="monitor-card">
|
||||
<h3>Távoli monitoring</h3>
|
||||
{{if not .MonitoringEnabled}}
|
||||
<div class="monitoring-banner monitoring-banner-red">
|
||||
⚠️ A távoli monitoring ki van kapcsolva. Az üzemeltető nem kap értesítést hibák esetén.
|
||||
</div>
|
||||
{{else}}
|
||||
{{if .AllPingsConfigured}}
|
||||
<h3>Hub kapcsolat</h3>
|
||||
{{if .HubEnabled}}
|
||||
{{if .HubConnected}}
|
||||
<div class="monitoring-banner monitoring-banner-green">
|
||||
✅ Minden távoli monitoring aktív — az üzemeltető értesítést kap hibák esetén.
|
||||
Kapcsolódva — a központi rendszer aktívan figyeli a szervert.
|
||||
</div>
|
||||
{{else}}
|
||||
<div class="monitoring-banner monitoring-banner-yellow">
|
||||
⚠️ Egyes monitoring ellenőrzések nincsenek beállítva. Kérd az üzemeltetőt a konfiguráláshoz.
|
||||
<div class="monitoring-banner monitoring-banner-red">
|
||||
Nem elérhető — a központi rendszer nem kapott friss jelentést.
|
||||
</div>
|
||||
{{end}}
|
||||
<div class="sysinfo-grid" style="margin-top: 0.75rem">
|
||||
{{range .PingStatus}}
|
||||
<div class="sysinfo-row">
|
||||
<span class="sysinfo-label">{{.Icon}} {{.Label}}</span>
|
||||
<span class="sysinfo-value">
|
||||
{{if .Configured}}<span class="ping-status-ok">✅ Beállítva</span>{{else}}<span class="ping-status-warn">⚠️ Nincs beállítva</span>{{end}}
|
||||
<span class="ping-schedule">{{.Schedule}}</span>
|
||||
</span>
|
||||
<span class="sysinfo-label">Hub URL</span>
|
||||
<span class="sysinfo-value"><code>{{.HubURL}}</code></span>
|
||||
</div>
|
||||
<div class="sysinfo-row">
|
||||
<span class="sysinfo-label">Ügyfél azonosító</span>
|
||||
<span class="sysinfo-value"><code>{{.CustomerID}}</code></span>
|
||||
</div>
|
||||
{{if not .HubLastSuccess.IsZero}}
|
||||
<div class="sysinfo-row">
|
||||
<span class="sysinfo-label">Utolsó sikeres jelentés</span>
|
||||
<span class="sysinfo-value">{{.HubLastSuccess | timeAgo}}</span>
|
||||
</div>
|
||||
{{end}}
|
||||
{{if .HubLastError}}
|
||||
<div class="sysinfo-row">
|
||||
<span class="sysinfo-label">Utolsó hiba</span>
|
||||
<span class="sysinfo-value"><span class="text-error">{{.HubLastError}}</span></span>
|
||||
</div>
|
||||
{{end}}
|
||||
</div>
|
||||
{{else}}
|
||||
<div class="monitoring-banner monitoring-banner-yellow">
|
||||
A Hub kapcsolat nincs bekapcsolva — a központi monitoring nem aktív.
|
||||
</div>
|
||||
{{end}}
|
||||
</div>
|
||||
|
||||
@@ -413,23 +413,56 @@ function pollUntilBack() {
|
||||
placeholder="pelda@email.hu" class="form-control">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Az alábbi eseményekről kapjon értesítést:</label>
|
||||
<label>Hibák és figyelmeztetések:</label>
|
||||
<div class="checkbox-group">
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_disk_warning" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "disk_warning"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Lemez figyelmeztetés (80%+)</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_backup_failed" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "backup_failed"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Biztonsági mentés sikertelen</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_update_available" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "update_available"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Frissítés elérhető</span>
|
||||
<input type="checkbox" name="event_db_dump_failed" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "db_dump_failed"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Adatbázis mentés sikertelen</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_security_update" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "security_update"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Biztonsági frissítés</span>
|
||||
<input type="checkbox" name="event_backup_integrity_failed" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "backup_integrity_failed"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Mentés sérülés észlelve</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_crossdrive_failed" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "crossdrive_failed"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Másodlagos mentés sikertelen</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_disk_alerts" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "disk_warning"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Lemez figyelmeztetés (90%+)</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_storage_disconnected" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "storage_disconnected"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Meghajtó leválasztva</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_node_down" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "node_down"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Szerver nem elérhető</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_health_critical" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "health_critical"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Rendszer állapot kritikus</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_expected_missed" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "expected_backup_missed"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Elvárt mentés elmaradt</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Tájékoztató:</label>
|
||||
<div class="checkbox-group">
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_storage_reconnected" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "storage_reconnected"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Meghajtó újra csatlakoztatva</span>
|
||||
</label>
|
||||
<label class="toggle">
|
||||
<input type="checkbox" name="event_health_recovered" {{with .NotificationPrefs}}{{range .EnabledEvents}}{{if eq . "health_recovered"}}checked{{end}}{{end}}{{end}}>
|
||||
<span class="toggle-label">Rendszer állapot helyreállt</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Reference in New Issue
Block a user