feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)

Replace external Healthchecks.io with Hub-native monitoring. New events
table + /api/v1/event endpoint for structured events from controllers.
Staleness checker (60s) detects unresponsive nodes. Backup deadline
checker (daily 05:00) catches missed backups. Notification dispatcher
sends operator (English) + customer (Hungarian) emails via Resend with
per-event cooldowns. Event timeline on customer page, dashboard badges.
Config form deprecates Monitoring UUIDs section.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-20 18:53:24 +01:00
parent b4cb92e09f
commit 3217cb4751
16 changed files with 1319 additions and 64 deletions
+12 -2
View File
@@ -223,12 +223,14 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
}
}
// History, notifications, infra backup
// History, notifications, events, infra backup
var history []store.CustomerSummary
var notifPrefs *store.NotificationPrefs
var recentNotifs []store.NotificationLogEntry
var infraMeta *store.InfraBackupMeta
var infraBackupAge string
var events []store.Event
var eventCounts map[string]int
if customer != nil {
history, _ = s.store.GetCustomerHistory(customerID, 24*time.Hour)
@@ -238,6 +240,8 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
if infraMeta != nil {
infraBackupAge = timeAgo(infraMeta.UpdatedAt)
}
// Load the data behind the Events section: a list of recent events (the 50
// is passed as the second argument, presumably a row limit) and per-severity
// counts using a cutoff of 24 hours before now.
// NOTE(review): both errors are discarded, so a store failure is not
// reported anywhere; presumably the page then renders as if there were no
// events — confirm that best-effort behavior is intended.
events, _ = s.store.GetRecentEvents(customerID, 50)
eventCounts, _ = s.store.CountEventsBySeverity(customerID, time.Now().Add(-24*time.Hour))
}
type pageData struct {
@@ -270,6 +274,9 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
RecentNotifications []store.NotificationLogEntry
History []store.CustomerSummary
Events []store.Event
EventCounts map[string]int // severity → count (last 24h)
Flash string
ActiveNav string
}
@@ -304,6 +311,9 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
RecentNotifications: recentNotifs,
History: history,
Events: events,
EventCounts: eventCounts,
Flash: r.URL.Query().Get("flash"),
ActiveNav: "configs",
}
@@ -697,7 +707,7 @@ func buildConfigJSON(r *http.Request) string {
overrides["git"] = git
}
// Monitoring UUIDs
// Monitoring UUIDs (legacy — only written if user explicitly provides values)
uuids := make(map[string]interface{})
for _, key := range []string{"heartbeat", "system_health", "db_dump", "backup", "backup_integrity"} {
if v := strings.TrimSpace(r.FormValue("uuid_" + key)); v != "" {
+6 -6
View File
@@ -80,12 +80,12 @@ backup:
monitoring:
enabled: true
healthchecks_base: "https://status.felhom.eu"
ping_uuids:
heartbeat: "" # Every 5 min — controller process alive
system_health: "" # Every 5 min — comprehensive system check
db_dump: "" # Daily — after database dumps
backup: "" # Daily — after restic snapshot
backup_integrity: "" # Weekly (Sunday) — restic check
# ping_uuids: (deprecated — monitoring is now handled by the Hub event system)
# heartbeat: ""
# system_health: ""
# db_dump: ""
# backup: ""
# backup_integrity: ""
system_health_interval: "5m"
health_check_schedule: "06:00"
thresholds:
+15
View File
@@ -40,6 +40,13 @@ func New(store *store.Store, passwordHash, apiKey string, staleThreshold time.Du
b, _ := json.Marshal(v)
return template.JS(b)
},
"add": func(a, b int) int { return a + b },
"mapGet": func(m map[string]int, key string) int {
if m == nil {
return 0
}
return m[key]
},
}
tmpl := template.Must(template.New("").Funcs(funcMap).ParseFS(templateFS, "templates/*.html"))
@@ -232,6 +239,8 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
store.CustomerSummary
OverallStatus string // "ok", "warn", "down", "pending"
BackupAge string
EventErrors int
EventWarnings int
}
// Build map of report customers keyed by ID
@@ -266,6 +275,12 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
dc.BackupAge = ""
}
// Event counts (last 24h): copy the "error" and "warning" tallies onto the
// dashboard row. When the lookup returns an error it is dropped and the two
// fields keep their previous values (presumably zero), so a store failure
// looks the same as "no events".
// NOTE(review): this appears to run once per customer row, i.e. one store
// call per row, each with its own time.Now() cutoff — confirm that is
// acceptable as the customer list grows.
if counts, err := s.store.CountEventsBySeverity(c.CustomerID, time.Now().Add(-24*time.Hour)); err == nil {
dc.EventErrors = counts["error"]
dc.EventWarnings = counts["warning"]
}
data = append(data, dc)
}
+3 -2
View File
@@ -96,8 +96,9 @@
</div>
</details>
<details class="card" {{if index .Overrides "monitoring"}}open{{end}}>
<summary><h2 style="display:inline">Monitoring UUIDs</h2></summary>
<details class="card">
<summary><h2 style="display:inline">Monitoring UUIDs</h2> <span class="severity-badge severity-warning" style="font-size: 0.7em; vertical-align: middle;">Legacy</span></summary>
<p class="text-muted" style="margin: 0.5rem 0;">Healthchecks ping UUIDs are deprecated. Monitoring is now handled natively by the Hub event system. These fields are kept for backward compatibility with older controllers.</p>
<div class="form-grid" style="margin-top: 1rem;">
{{$uuids := ""}}
{{with .Overrides}}{{with index . "monitoring"}}{{with index . "ping_uuids"}}{{$uuids = .}}{{end}}{{end}}{{end}}
@@ -403,6 +403,61 @@
{{end}}
</section>
<!-- Events -->
<section class="card">
<h2>Events
{{if .EventCounts}}
{{with mapGet .EventCounts "error"}}<span class="severity-badge severity-error">{{.}} error{{if gt . 1}}s{{end}}</span>{{end}}
{{with mapGet .EventCounts "warning"}}<span class="severity-badge severity-warning">{{.}} warning{{if gt . 1}}s{{end}}</span>{{end}}
{{end}}
<span class="text-muted" style="font-size: 0.7em; font-weight: normal;"> (last 24h)</span>
</h2>
{{if .Events}}
<div style="margin-bottom: 0.5rem;">
<button class="btn btn-sm btn-outline event-filter active" data-filter="all">All</button>
<button class="btn btn-sm btn-outline event-filter" data-filter="error">Errors</button>
<button class="btn btn-sm btn-outline event-filter" data-filter="warning">Warnings</button>
<button class="btn btn-sm btn-outline event-filter" data-filter="info">Info</button>
</div>
<table class="history-table" id="events-table">
<thead>
<tr>
<th>Time</th>
<th>Severity</th>
<th>Type</th>
<th>Message</th>
<th>Source</th>
</tr>
</thead>
<tbody>
{{range .Events}}
<tr data-severity="{{.Severity}}">
<td title="{{.CreatedAt.Format "2006-01-02 15:04:05"}}">{{.CreatedAt.Format "Jan 02 15:04"}}</td>
<td><span class="severity-badge severity-{{.Severity}}">{{.Severity}}</span></td>
<td><code>{{.EventType}}</code></td>
<td>{{.Message}}</td>
<td>{{.Source}}</td>
</tr>
{{end}}
</tbody>
</table>
<script>
// Client-side severity filter for the events table: clicking a filter button
// marks it active and hides every table row whose data-severity differs from
// the button's data-filter value.
document.querySelectorAll('.event-filter').forEach(btn => {
btn.addEventListener('click', function() {
// Only one filter button carries the "active" class at a time.
document.querySelectorAll('.event-filter').forEach(b => b.classList.remove('active'));
// `this` is the clicked button; the handler is a regular function (not an
// arrow function) so that this binding is available.
this.classList.add('active');
const filter = this.dataset.filter;
document.querySelectorAll('#events-table tbody tr').forEach(row => {
// An empty display value restores the row's default rendering; the "all"
// filter matches every row regardless of severity.
row.style.display = (filter === 'all' || row.dataset.severity === filter) ? '' : 'none';
});
});
});
</script>
{{else}}
<p class="text-muted">No events recorded yet.</p>
{{end}}
</section>
<!-- Notifications -->
<section class="card">
<h2>Notifications</h2>
@@ -424,6 +479,7 @@
<thead>
<tr>
<th>Time</th>
<th>Channel</th>
<th>Event</th>
<th>Status</th>
<th>Message</th>
@@ -433,6 +489,7 @@
{{range .RecentNotifications}}
<tr>
<td>{{.CreatedAt.Format "Jan 02 15:04"}}</td>
<td><span class="status-badge status-badge-{{.Channel}}">{{.Channel}}</span></td>
<td>{{.EventType}}</td>
<td><span class="status-badge status-badge-{{.Status}}">{{.Status}}</span></td>
<td>{{.Message}}</td>
@@ -28,6 +28,7 @@
<tr>
<th>Customer</th>
<th>Status</th>
<th>Events</th>
<th>Last Seen</th>
<th>CPU</th>
<th>Memory</th>
@@ -49,6 +50,7 @@
{{if eq .OverallStatus "ok"}}OK{{else if eq .OverallStatus "warn"}}WARN{{else if eq .OverallStatus "disabled"}}PAUSED{{else if eq .OverallStatus "pending"}}PENDING{{else}}DOWN{{end}}
</span>
</td>
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{if gt (add .EventErrors .EventWarnings) 0}}{{if gt .EventErrors 0}}<span class="severity-badge severity-error">{{.EventErrors}}</span>{{end}}{{if gt .EventWarnings 0}}<span class="severity-badge severity-warning">{{.EventWarnings}}</span>{{end}}{{else}}<span class="text-muted"></span>{{end}}{{end}}</td>
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{timeAgo .ReceivedAt}}{{end}}</td>
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{formatFloat .CPUPercent}}%{{end}}</td>
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{formatFloat .MemoryPercent}}%{{end}}</td>
+49
View File
@@ -564,6 +564,55 @@ code {
color: var(--text-muted);
}
/* Severity badges: .severity-badge is the shared pill shape. The templates
   pair it with a colour modifier, either written out (severity-error,
   severity-warning) or built from the event's own value as
   severity-<severity>. Only error, warning and info have colours below, so
   any other severity value renders as an uncoloured pill. */
.severity-badge {
display: inline-block;
padding: 0.15em 0.5em;
border-radius: 4px;
font-size: 0.8em;
font-weight: 600;
line-height: 1.4;
}
/* Each modifier uses a 15%-opacity tint of its text colour as background. */
.severity-error {
background: rgba(239, 68, 68, 0.15);
color: #ef4444;
}
.severity-warning {
background: rgba(245, 158, 11, 0.15);
color: #f59e0b;
}
.severity-info {
background: rgba(59, 130, 246, 0.15);
color: #3b82f6;
}
/* Event filter buttons: the All / Errors / Warnings / Info buttons above the
   events table. */
.event-filter {
font-size: 0.8em;
padding: 0.2em 0.6em;
cursor: pointer;
}
/* .active marks the currently selected filter; the events section's inline
   script moves this class to whichever button was clicked. */
.event-filter.active {
background: var(--accent);
color: #fff;
border-color: var(--accent);
}
/* Notification channel badges: the notifications table builds the class as
   status-badge-<channel>; colours are defined for the operator and customer
   channels. */
.status-badge-operator {
background: rgba(139, 92, 246, 0.15);
color: #8b5cf6;
}
.status-badge-customer {
background: rgba(59, 130, 246, 0.15);
color: #3b82f6;
}
/* Responsive */
@media (max-width: 768px) {
.container { padding: 1rem; }