feat: Hub monitoring takeover — event system, dead man's switch, notifications (v0.3.0)

Replace external Healthchecks.io with Hub-native monitoring. New events
table + /api/v1/event endpoint for structured events from controllers.
Staleness checker (60s) detects unresponsive nodes. Backup deadline
checker (daily 05:00) catches missed backups. Notification dispatcher
sends operator (English) + customer (Hungarian) emails via Resend with
per-event cooldowns. Event timeline on customer page, dashboard badges.
Config form deprecates Monitoring UUIDs section.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-20 18:53:24 +01:00
parent b4cb92e09f
commit 3217cb4751
16 changed files with 1319 additions and 64 deletions
+225 -12
View File
@@ -121,6 +121,33 @@ func (s *Store) migrate() error {
// v0.2.1: add status column to customer_configs (idempotent)
s.db.Exec("ALTER TABLE customer_configs ADD COLUMN status TEXT NOT NULL DEFAULT 'active'")
// v0.3.0: events table for hub-native monitoring
_, err = s.db.Exec(`
CREATE TABLE IF NOT EXISTS events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
customer_id TEXT NOT NULL,
event_type TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'info',
message TEXT NOT NULL DEFAULT '',
details_json TEXT NOT NULL DEFAULT '{}',
source TEXT NOT NULL DEFAULT 'controller',
created_at DATETIME NOT NULL DEFAULT (datetime('now'))
);
CREATE INDEX IF NOT EXISTS idx_events_customer_created
ON events(customer_id, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_events_type
ON events(event_type, created_at DESC);
`)
if err != nil {
return err
}
// v0.3.0: add cooldown_hours to customer_notifications (idempotent)
s.db.Exec("ALTER TABLE customer_notifications ADD COLUMN cooldown_hours INTEGER DEFAULT 6")
// v0.3.0: add channel column to notification_log (idempotent)
s.db.Exec("ALTER TABLE notification_log ADD COLUMN channel TEXT NOT NULL DEFAULT 'customer'")
return nil
}
@@ -129,15 +156,17 @@ type NotificationPrefs struct {
CustomerID string
Email string
EnabledEvents []string
CooldownHours int
}
// GetNotificationPrefs returns notification preferences for a customer.
func (s *Store) GetNotificationPrefs(customerID string) (*NotificationPrefs, error) {
var email, eventsJSON string
var cooldownHours int
err := s.db.QueryRow(
"SELECT email, enabled_events FROM customer_notifications WHERE customer_id = ?",
"SELECT email, enabled_events, COALESCE(cooldown_hours, 6) FROM customer_notifications WHERE customer_id = ?",
customerID,
).Scan(&email, &eventsJSON)
).Scan(&email, &eventsJSON, &cooldownHours)
if err != nil {
if err == sql.ErrNoRows {
return nil, nil
@@ -150,34 +179,46 @@ func (s *Store) GetNotificationPrefs(customerID string) (*NotificationPrefs, err
s.logger.Printf("[WARN] Corrupt enabled_events JSON for %s: %v", customerID, err)
}
if cooldownHours <= 0 {
cooldownHours = 6
}
return &NotificationPrefs{
CustomerID: customerID,
Email: email,
EnabledEvents: events,
CooldownHours: cooldownHours,
}, nil
}
// SaveNotificationPrefs creates or updates notification preferences for a customer.
func (s *Store) SaveNotificationPrefs(customerID, email string, enabledEvents []string) error {
func (s *Store) SaveNotificationPrefs(customerID, email string, enabledEvents []string, cooldownHours int) error {
eventsJSON, _ := json.Marshal(enabledEvents)
if cooldownHours <= 0 {
cooldownHours = 6
}
_, err := s.db.Exec(`
INSERT INTO customer_notifications (customer_id, email, enabled_events, updated_at)
VALUES (?, ?, ?, datetime('now'))
INSERT INTO customer_notifications (customer_id, email, enabled_events, cooldown_hours, updated_at)
VALUES (?, ?, ?, ?, datetime('now'))
ON CONFLICT(customer_id) DO UPDATE SET
email = excluded.email,
enabled_events = excluded.enabled_events,
cooldown_hours = excluded.cooldown_hours,
updated_at = datetime('now')`,
customerID, email, string(eventsJSON),
customerID, email, string(eventsJSON), cooldownHours,
)
return err
}
// LogNotification records a notification attempt.
func (s *Store) LogNotification(customerID, eventType, severity, message, status, errorMsg string) error {
func (s *Store) LogNotification(customerID, eventType, severity, message, status, errorMsg, channel string) error {
if channel == "" {
channel = "customer"
}
_, err := s.db.Exec(`
INSERT INTO notification_log (customer_id, event_type, severity, message, status, error_message)
VALUES (?, ?, ?, ?, ?, ?)`,
customerID, eventType, severity, message, status, errorMsg,
INSERT INTO notification_log (customer_id, event_type, severity, message, status, error_message, channel)
VALUES (?, ?, ?, ?, ?, ?, ?)`,
customerID, eventType, severity, message, status, errorMsg, channel,
)
return err
}
@@ -189,13 +230,14 @@ type NotificationLogEntry struct {
Message string
Status string // "sent", "skipped", "failed"
ErrorMessage string
Channel string // "operator" or "customer"
CreatedAt time.Time
}
// GetRecentNotifications returns the most recent notification log entries for a customer.
func (s *Store) GetRecentNotifications(customerID string, limit int) ([]NotificationLogEntry, error) {
rows, err := s.db.Query(`
SELECT event_type, severity, message, status, COALESCE(error_message, ''), created_at
SELECT event_type, severity, message, status, COALESCE(error_message, ''), COALESCE(channel, 'customer'), created_at
FROM notification_log
WHERE customer_id = ?
ORDER BY created_at DESC
@@ -209,7 +251,7 @@ func (s *Store) GetRecentNotifications(customerID string, limit int) ([]Notifica
for rows.Next() {
var e NotificationLogEntry
var createdAt, errorMsg string
if err := rows.Scan(&e.EventType, &e.Severity, &e.Message, &e.Status, &errorMsg, &createdAt); err != nil {
if err := rows.Scan(&e.EventType, &e.Severity, &e.Message, &e.Status, &errorMsg, &e.Channel, &createdAt); err != nil {
return nil, err
}
e.CreatedAt = parseSQLiteTime(createdAt)
@@ -658,6 +700,177 @@ func (s *Store) UpdateRetrievalPassword(customerID, newPassword string) error {
return err
}
// --- Event system ---
// Event is one row of the events table, decoded into Go types.
type Event struct {
	ID          int64     // value of the id column
	CustomerID  string    // customer the event belongs to
	EventType   string    // event type identifier
	Severity    string    // "info", "warning", or "error"
	Message     string    // free-form message text
	DetailsJSON string    // raw JSON, kept as the stored text
	Source      string    // "controller" or "hub"
	CreatedAt   time.Time // parsed from the created_at column
}
// SaveEvent inserts a new row into the events table and returns its ID.
//
// Empty optional fields are normalized before the insert so that stored rows
// match the column defaults declared for the events table: severity falls
// back to "info", detailsJSON to "{}", and source to "controller". The INSERT
// names each of these columns explicitly, so the table's DEFAULT clauses would
// not apply on their own and an empty string would be stored as-is.
//
// created_at is not supplied here; it is left to the table default.
func (s *Store) SaveEvent(customerID, eventType, severity, message, detailsJSON, source string) (int64, error) {
	if severity == "" {
		severity = "info"
	}
	if detailsJSON == "" {
		detailsJSON = "{}"
	}
	if source == "" {
		source = "controller"
	}

	res, err := s.db.Exec(`
		INSERT INTO events (customer_id, event_type, severity, message, details_json, source)
		VALUES (?, ?, ?, ?, ?, ?)`,
		customerID, eventType, severity, message, detailsJSON, source,
	)
	if err != nil {
		return 0, err
	}
	return res.LastInsertId()
}
// GetRecentEvents returns up to limit events for a customer, newest first.
//
// created_at is stored with one-second resolution, so several events can
// share a timestamp. The id column is used as a secondary sort key so that
// such events come back in a stable order (highest id first) rather than an
// unspecified one.
//
// The returned slice is whatever scanEvents produces for the result set.
func (s *Store) GetRecentEvents(customerID string, limit int) ([]Event, error) {
	rows, err := s.db.Query(`
		SELECT id, customer_id, event_type, severity, message, details_json, source, created_at
		FROM events
		WHERE customer_id = ?
		ORDER BY created_at DESC, id DESC
		LIMIT ?`, customerID, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanEvents(rows)
}
// GetEventsByType returns a customer's events of one type created at or after
// since, newest first.
//
// since is converted to UTC and rendered as "YYYY-MM-DD HH:MM:SS" before being
// compared against created_at. Because that layout has one-second resolution,
// events can share a timestamp; id is used as a secondary sort key so the
// order of such events is stable (highest id first).
func (s *Store) GetEventsByType(customerID, eventType string, since time.Time) ([]Event, error) {
	sinceUTC := since.UTC().Format("2006-01-02 15:04:05")

	rows, err := s.db.Query(`
		SELECT id, customer_id, event_type, severity, message, details_json, source, created_at
		FROM events
		WHERE customer_id = ? AND event_type = ? AND created_at >= ?
		ORDER BY created_at DESC, id DESC`,
		customerID, eventType, sinceUTC)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanEvents(rows)
}
// GetLatestEventByType returns the most recent event of the given type for a
// customer, or (nil, nil) when no such event exists.
//
// created_at has one-second resolution, so with LIMIT 1 the row chosen among
// events sharing the newest timestamp would otherwise be unspecified. id is
// used as a secondary sort key so the highest id among them is returned.
func (s *Store) GetLatestEventByType(customerID, eventType string) (*Event, error) {
	var (
		e         Event
		createdAt string
	)
	err := s.db.QueryRow(`
		SELECT id, customer_id, event_type, severity, message, details_json, source, created_at
		FROM events
		WHERE customer_id = ? AND event_type = ?
		ORDER BY created_at DESC, id DESC
		LIMIT 1`, customerID, eventType,
	).Scan(&e.ID, &e.CustomerID, &e.EventType, &e.Severity, &e.Message, &e.DetailsJSON, &e.Source, &createdAt)
	if err == sql.ErrNoRows {
		// Absence of a matching event is reported as a nil event, not an error.
		return nil, nil
	}
	if err != nil {
		return nil, err
	}

	e.CreatedAt = parseSQLiteTime(createdAt)
	return &e, nil
}
// GetAllRecentEvents returns up to limit events across all customers, newest
// first.
//
// created_at has one-second resolution, so events (especially from different
// customers) can share a timestamp. id is used as a secondary sort key so the
// order of such events is stable (highest id first) rather than unspecified.
func (s *Store) GetAllRecentEvents(limit int) ([]Event, error) {
	rows, err := s.db.Query(`
		SELECT id, customer_id, event_type, severity, message, details_json, source, created_at
		FROM events
		ORDER BY created_at DESC, id DESC
		LIMIT ?`, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanEvents(rows)
}
// CountEventsBySeverity tallies a customer's events by severity, counting only
// events whose created_at is at or after since.
//
// The result maps each severity present in that window to its event count;
// severities with no events have no entry. The map is non-nil even when the
// window is empty.
func (s *Store) CountEventsBySeverity(customerID string, since time.Time) (map[string]int, error) {
	// The lower bound is sent as UTC text in "YYYY-MM-DD HH:MM:SS" layout.
	lowerBound := since.UTC().Format("2006-01-02 15:04:05")

	rows, err := s.db.Query(`
SELECT severity, COUNT(*) FROM events
WHERE customer_id = ? AND created_at >= ?
GROUP BY severity`,
		customerID, lowerBound)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	tally := map[string]int{}
	for rows.Next() {
		var (
			severity string
			n        int
		)
		if scanErr := rows.Scan(&severity, &n); scanErr != nil {
			return nil, scanErr
		}
		tally[severity] = n
	}
	return tally, rows.Err()
}
// PruneEvents deletes events whose created_at is more than maxDays days in
// the past and returns the number of rows removed.
//
// A non-positive maxDays is treated as "pruning disabled": nothing is deleted
// and (0, nil) is returned. Without this guard the cutoff would be the current
// time (or later) and the call would delete every event in the table.
func (s *Store) PruneEvents(maxDays int) (int64, error) {
	if maxDays <= 0 {
		return 0, nil
	}

	// Convert to UTC before subtracting days so the cutoff is exactly
	// maxDays*24h ago and is not shifted by local daylight-saving changes.
	cutoff := time.Now().UTC().AddDate(0, 0, -maxDays).Format("2006-01-02 15:04:05")

	res, err := s.db.Exec("DELETE FROM events WHERE created_at < ?", cutoff)
	if err != nil {
		return 0, err
	}
	return res.RowsAffected()
}
// GetActiveCustomerIDs lists the customer_id of every customer_configs row
// whose status is 'active'. The result is nil when there are no such rows.
func (s *Store) GetActiveCustomerIDs() ([]string, error) {
	const query = "SELECT customer_id FROM customer_configs WHERE status = 'active'"

	rows, err := s.db.Query(query)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var active []string
	for rows.Next() {
		var customerID string
		if scanErr := rows.Scan(&customerID); scanErr != nil {
			return nil, scanErr
		}
		active = append(active, customerID)
	}
	return active, rows.Err()
}
// Ping checks that the database answers a trivial query. It runs "SELECT 1"
// and returns whatever error scanning the single result produces.
func (s *Store) Ping() error {
	var one int
	row := s.db.QueryRow("SELECT 1")
	return row.Scan(&one)
}
// scanEvents drains rows into a slice of Event, in the order the rows arrive.
//
// Each row must supply eight columns in this order: id, customer_id,
// event_type, severity, message, details_json, source, created_at. created_at
// is read as text and converted with parseSQLiteTime.
//
// A scan failure aborts with (nil, err). Otherwise the collected events are
// returned together with rows.Err(). The slice is nil when there were no rows.
// The caller remains responsible for closing rows.
func scanEvents(rows *sql.Rows) ([]Event, error) {
	var out []Event
	for rows.Next() {
		var (
			ev      Event
			created string
		)
		err := rows.Scan(
			&ev.ID,
			&ev.CustomerID,
			&ev.EventType,
			&ev.Severity,
			&ev.Message,
			&ev.DetailsJSON,
			&ev.Source,
			&created,
		)
		if err != nil {
			return nil, err
		}
		ev.CreatedAt = parseSQLiteTime(created)
		out = append(out, ev)
	}
	return out, rows.Err()
}
// parseSQLiteTime tries multiple formats that modernc.org/sqlite may return.
func parseSQLiteTime(s string) time.Time {
formats := []string{