feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)
Replace external Healthchecks.io with Hub-native monitoring. New events table + /api/v1/event endpoint for structured events from controllers. Staleness checker (60s) detects unresponsive nodes. Backup deadline checker (daily 05:00) catches missed backups. Notification dispatcher sends operator (English) + customer (Hungarian) emails via Resend with per-event cooldowns. Event timeline on customer page, dashboard badges. Config form deprecates Monitoring UUIDs section. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -223,12 +223,14 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
|
||||
}
|
||||
}
|
||||
|
||||
// History, notifications, infra backup
|
||||
// History, notifications, events, infra backup
|
||||
var history []store.CustomerSummary
|
||||
var notifPrefs *store.NotificationPrefs
|
||||
var recentNotifs []store.NotificationLogEntry
|
||||
var infraMeta *store.InfraBackupMeta
|
||||
var infraBackupAge string
|
||||
var events []store.Event
|
||||
var eventCounts map[string]int
|
||||
|
||||
if customer != nil {
|
||||
history, _ = s.store.GetCustomerHistory(customerID, 24*time.Hour)
|
||||
@@ -238,6 +240,8 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
|
||||
if infraMeta != nil {
|
||||
infraBackupAge = timeAgo(infraMeta.UpdatedAt)
|
||||
}
|
||||
events, _ = s.store.GetRecentEvents(customerID, 50)
|
||||
eventCounts, _ = s.store.CountEventsBySeverity(customerID, time.Now().Add(-24*time.Hour))
|
||||
}
|
||||
|
||||
type pageData struct {
|
||||
@@ -270,6 +274,9 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
|
||||
RecentNotifications []store.NotificationLogEntry
|
||||
History []store.CustomerSummary
|
||||
|
||||
Events []store.Event
|
||||
EventCounts map[string]int // severity → count (last 24h)
|
||||
|
||||
Flash string
|
||||
ActiveNav string
|
||||
}
|
||||
@@ -304,6 +311,9 @@ func (s *Server) handleCustomerUnified(w http.ResponseWriter, r *http.Request, c
|
||||
RecentNotifications: recentNotifs,
|
||||
History: history,
|
||||
|
||||
Events: events,
|
||||
EventCounts: eventCounts,
|
||||
|
||||
Flash: r.URL.Query().Get("flash"),
|
||||
ActiveNav: "configs",
|
||||
}
|
||||
@@ -697,7 +707,7 @@ func buildConfigJSON(r *http.Request) string {
|
||||
overrides["git"] = git
|
||||
}
|
||||
|
||||
// Monitoring UUIDs
|
||||
// Monitoring UUIDs (legacy — only written if user explicitly provides values)
|
||||
uuids := make(map[string]interface{})
|
||||
for _, key := range []string{"heartbeat", "system_health", "db_dump", "backup", "backup_integrity"} {
|
||||
if v := strings.TrimSpace(r.FormValue("uuid_" + key)); v != "" {
|
||||
|
||||
@@ -80,12 +80,12 @@ backup:
|
||||
monitoring:
|
||||
enabled: true
|
||||
healthchecks_base: "https://status.felhom.eu"
|
||||
ping_uuids:
|
||||
heartbeat: "" # Every 5 min — controller process alive
|
||||
system_health: "" # Every 5 min — comprehensive system check
|
||||
db_dump: "" # Daily — after database dumps
|
||||
backup: "" # Daily — after restic snapshot
|
||||
backup_integrity: "" # Weekly (Sunday) — restic check
|
||||
# ping_uuids: (deprecated — monitoring is now handled by the Hub event system)
|
||||
# heartbeat: ""
|
||||
# system_health: ""
|
||||
# db_dump: ""
|
||||
# backup: ""
|
||||
# backup_integrity: ""
|
||||
system_health_interval: "5m"
|
||||
health_check_schedule: "06:00"
|
||||
thresholds:
|
||||
|
||||
@@ -40,6 +40,13 @@ func New(store *store.Store, passwordHash, apiKey string, staleThreshold time.Du
|
||||
b, _ := json.Marshal(v)
|
||||
return template.JS(b)
|
||||
},
|
||||
"add": func(a, b int) int { return a + b },
|
||||
// mapGet is a template helper: it returns m[key], or 0 when m is nil.
// A key that is absent from a non-nil map also yields 0 (the int zero value),
// so templates can wrap the result in {{with}} to skip zero counts.
"mapGet": func(m map[string]int, key string) int {
|
||||
if m == nil {
|
||||
return 0
|
||||
}
|
||||
return m[key]
|
||||
},
|
||||
}
|
||||
|
||||
tmpl := template.Must(template.New("").Funcs(funcMap).ParseFS(templateFS, "templates/*.html"))
|
||||
@@ -232,6 +239,8 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
||||
store.CustomerSummary
|
||||
OverallStatus string // "ok", "warn", "down", "pending"
|
||||
BackupAge string
|
||||
EventErrors int
|
||||
EventWarnings int
|
||||
}
|
||||
|
||||
// Build map of report customers keyed by ID
|
||||
@@ -266,6 +275,12 @@ func (s *Server) handleDashboard(w http.ResponseWriter, r *http.Request) {
|
||||
dc.BackupAge = "–"
|
||||
}
|
||||
|
||||
// Event counts (last 24h)
|
||||
if counts, err := s.store.CountEventsBySeverity(c.CustomerID, time.Now().Add(-24*time.Hour)); err == nil {
|
||||
dc.EventErrors = counts["error"]
|
||||
dc.EventWarnings = counts["warning"]
|
||||
}
|
||||
|
||||
data = append(data, dc)
|
||||
}
|
||||
|
||||
|
||||
@@ -96,8 +96,9 @@
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<details class="card" {{if index .Overrides "monitoring"}}open{{end}}>
|
||||
<summary><h2 style="display:inline">Monitoring UUIDs</h2></summary>
|
||||
<details class="card">
|
||||
<summary><h2 style="display:inline">Monitoring UUIDs</h2> <span class="severity-badge severity-warning" style="font-size: 0.7em; vertical-align: middle;">Legacy</span></summary>
|
||||
<p class="text-muted" style="margin: 0.5rem 0;">Healthchecks ping UUIDs are deprecated. Monitoring is now handled natively by the Hub event system. These fields are kept for backward compatibility with older controllers.</p>
|
||||
<div class="form-grid" style="margin-top: 1rem;">
|
||||
{{$uuids := ""}}
|
||||
{{with .Overrides}}{{with index . "monitoring"}}{{with index . "ping_uuids"}}{{$uuids = .}}{{end}}{{end}}{{end}}
|
||||
|
||||
@@ -403,6 +403,61 @@
|
||||
{{end}}
|
||||
</section>
|
||||
|
||||
<!-- Events -->
|
||||
<section class="card">
|
||||
<h2>Events
|
||||
{{if .EventCounts}}
|
||||
{{with mapGet .EventCounts "error"}}<span class="severity-badge severity-error">{{.}} error{{if gt . 1}}s{{end}}</span>{{end}}
|
||||
{{with mapGet .EventCounts "warning"}}<span class="severity-badge severity-warning">{{.}} warning{{if gt . 1}}s{{end}}</span>{{end}}
|
||||
{{end}}
|
||||
<span class="text-muted" style="font-size: 0.7em; font-weight: normal;"> (last 24h)</span>
|
||||
</h2>
|
||||
{{if .Events}}
|
||||
<div style="margin-bottom: 0.5rem;">
|
||||
<button class="btn btn-sm btn-outline event-filter active" data-filter="all">All</button>
|
||||
<button class="btn btn-sm btn-outline event-filter" data-filter="error">Errors</button>
|
||||
<button class="btn btn-sm btn-outline event-filter" data-filter="warning">Warnings</button>
|
||||
<button class="btn btn-sm btn-outline event-filter" data-filter="info">Info</button>
|
||||
</div>
|
||||
<table class="history-table" id="events-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Time</th>
|
||||
<th>Severity</th>
|
||||
<th>Type</th>
|
||||
<th>Message</th>
|
||||
<th>Source</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{{range .Events}}
|
||||
<tr data-severity="{{.Severity}}">
|
||||
<td title="{{.CreatedAt.Format "2006-01-02 15:04:05"}}">{{.CreatedAt.Format "Jan 02 15:04"}}</td>
|
||||
<td><span class="severity-badge severity-{{.Severity}}">{{.Severity}}</span></td>
|
||||
<td><code>{{.EventType}}</code></td>
|
||||
<td>{{.Message}}</td>
|
||||
<td>{{.Source}}</td>
|
||||
</tr>
|
||||
{{end}}
|
||||
</tbody>
|
||||
</table>
|
||||
<script>
|
||||
// Wire up the severity filter buttons: clicking one marks it active and
// shows only the event rows whose severity matches the button's filter.
document.querySelectorAll('.event-filter').forEach(btn => {
|
||||
// A regular function (not an arrow) so that `this` is the clicked button.
btn.addEventListener('click', function() {
|
||||
// Clear the active marker from every filter button, then set it on this one.
document.querySelectorAll('.event-filter').forEach(b => b.classList.remove('active'));
|
||||
this.classList.add('active');
|
||||
// The filter value comes from the button's data-filter attribute.
const filter = this.dataset.filter;
|
||||
document.querySelectorAll('#events-table tbody tr').forEach(row => {
|
||||
// 'all' shows every row; otherwise only rows whose data-severity equals the filter.
row.style.display = (filter === 'all' || row.dataset.severity === filter) ? '' : 'none';
|
||||
});
|
||||
});
|
||||
});
|
||||
</script>
|
||||
{{else}}
|
||||
<p class="text-muted">No events recorded yet.</p>
|
||||
{{end}}
|
||||
</section>
|
||||
|
||||
<!-- Notifications -->
|
||||
<section class="card">
|
||||
<h2>Notifications</h2>
|
||||
@@ -424,6 +479,7 @@
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Time</th>
|
||||
<th>Channel</th>
|
||||
<th>Event</th>
|
||||
<th>Status</th>
|
||||
<th>Message</th>
|
||||
@@ -433,6 +489,7 @@
|
||||
{{range .RecentNotifications}}
|
||||
<tr>
|
||||
<td>{{.CreatedAt.Format "Jan 02 15:04"}}</td>
|
||||
<td><span class="status-badge status-badge-{{.Channel}}">{{.Channel}}</span></td>
|
||||
<td>{{.EventType}}</td>
|
||||
<td><span class="status-badge status-badge-{{.Status}}">{{.Status}}</span></td>
|
||||
<td>{{.Message}}</td>
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
<tr>
|
||||
<th>Customer</th>
|
||||
<th>Status</th>
|
||||
<th>Events</th>
|
||||
<th>Last Seen</th>
|
||||
<th>CPU</th>
|
||||
<th>Memory</th>
|
||||
@@ -49,6 +50,7 @@
|
||||
{{if eq .OverallStatus "ok"}}OK{{else if eq .OverallStatus "warn"}}WARN{{else if eq .OverallStatus "disabled"}}PAUSED{{else if eq .OverallStatus "pending"}}PENDING{{else}}DOWN{{end}}
|
||||
</span>
|
||||
</td>
|
||||
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{if gt (add .EventErrors .EventWarnings) 0}}{{if gt .EventErrors 0}}<span class="severity-badge severity-error">{{.EventErrors}}</span>{{end}}{{if gt .EventWarnings 0}}<span class="severity-badge severity-warning">{{.EventWarnings}}</span>{{end}}{{else}}<span class="text-muted">—</span>{{end}}{{end}}</td>
|
||||
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{timeAgo .ReceivedAt}}{{end}}</td>
|
||||
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{formatFloat .CPUPercent}}%{{end}}</td>
|
||||
<td>{{if eq .OverallStatus "pending"}}—{{else}}{{formatFloat .MemoryPercent}}%{{end}}</td>
|
||||
|
||||
@@ -564,6 +564,55 @@ code {
|
||||
color: var(--text-muted);
|
||||
}
|
||||
|
||||
/* Severity badges */
|
||||
.severity-badge {
|
||||
display: inline-block;
|
||||
padding: 0.15em 0.5em;
|
||||
border-radius: 4px;
|
||||
font-size: 0.8em;
|
||||
font-weight: 600;
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
.severity-error {
|
||||
background: rgba(239, 68, 68, 0.15);
|
||||
color: #ef4444;
|
||||
}
|
||||
|
||||
.severity-warning {
|
||||
background: rgba(245, 158, 11, 0.15);
|
||||
color: #f59e0b;
|
||||
}
|
||||
|
||||
.severity-info {
|
||||
background: rgba(59, 130, 246, 0.15);
|
||||
color: #3b82f6;
|
||||
}
|
||||
|
||||
/* Event filter buttons */
|
||||
.event-filter {
|
||||
font-size: 0.8em;
|
||||
padding: 0.2em 0.6em;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.event-filter.active {
|
||||
background: var(--accent);
|
||||
color: #fff;
|
||||
border-color: var(--accent);
|
||||
}
|
||||
|
||||
/* Notification channel badges */
|
||||
.status-badge-operator {
|
||||
background: rgba(139, 92, 246, 0.15);
|
||||
color: #8b5cf6;
|
||||
}
|
||||
|
||||
.status-badge-customer {
|
||||
background: rgba(59, 130, 246, 0.15);
|
||||
color: #3b82f6;
|
||||
}
|
||||
|
||||
/* Responsive */
|
||||
@media (max-width: 768px) {
|
||||
.container { padding: 1rem; }
|
||||
|
||||
Reference in New Issue
Block a user