3217cb4751
Replace external Healthchecks.io with Hub-native monitoring. New events table + /api/v1/event endpoint for structured events from controllers. Staleness checker (60s) detects unresponsive nodes. Backup deadline checker (daily 05:00) catches missed backups. Notification dispatcher sends operator (English) + customer (Hungarian) emails via Resend with per-event cooldowns. Event timeline on customer page, dashboard badges. Config form deprecates Monitoring UUIDs section. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
87 lines
3.0 KiB
Go
87 lines
3.0 KiB
Go
package monitor
|
|
|
|
import (
|
|
"log"
|
|
"time"
|
|
|
|
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
|
|
)
|
|
|
|
// budapest is the Europe/Budapest time zone. It is resolved once during
// package initialisation and reused by the deadline checks in this package.
var budapest *time.Location

// init resolves the Europe/Budapest zone. If the lookup fails, it falls back
// to a fixed UTC+1 zone named "CET"; that fallback has a constant offset and
// therefore does not follow daylight-saving changes.
func init() {
	loc, err := time.LoadLocation("Europe/Budapest")
	if err != nil {
		loc = time.FixedZone("CET", 3600)
	}
	budapest = loc
}
|
|
|
|
// CheckBackupDeadlines checks whether active customers had their expected
|
|
// daily backups and DB dumps. Runs once daily at 05:00 Budapest time.
|
|
//
|
|
// For each active customer, it checks for backup_completed and db_dump_completed
|
|
// events since Budapest midnight. If neither success nor failure events exist,
|
|
// it inserts expected_backup_missed / expected_dbdump_missed events.
|
|
//
|
|
// Customers whose nodes are "down" (no report in >1h) are skipped — they
|
|
// already have staleness events.
|
|
func CheckBackupDeadlines(s *store.Store, staleness *StalenessChecker, onEvent EventNotifyFunc, logger *log.Logger) {
|
|
customerIDs, err := s.GetActiveCustomerIDs()
|
|
if err != nil {
|
|
logger.Printf("[WARN] Deadline check: failed to get active customers: %v", err)
|
|
return
|
|
}
|
|
|
|
// Budapest midnight today
|
|
now := time.Now().In(budapest)
|
|
midnightBudapest := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, budapest)
|
|
sinceUTC := midnightBudapest.UTC()
|
|
|
|
var backupMissed, dbdumpMissed, skipped int
|
|
|
|
for _, id := range customerIDs {
|
|
// Skip nodes that are down — they already have staleness events
|
|
if staleness != nil && staleness.GetState(id) == "down" {
|
|
skipped++
|
|
continue
|
|
}
|
|
|
|
// Check blocked
|
|
if s.IsCustomerBlocked(id) {
|
|
continue
|
|
}
|
|
|
|
// Check backup_completed / backup_failed since midnight
|
|
backupOK, _ := s.GetEventsByType(id, "backup_completed", sinceUTC)
|
|
backupFailed, _ := s.GetEventsByType(id, "backup_failed", sinceUTC)
|
|
if len(backupOK) == 0 && len(backupFailed) == 0 {
|
|
msg := "No backup completed or failed since midnight"
|
|
if _, err := s.SaveEvent(id, "expected_backup_missed", "error", msg, "{}", "hub"); err != nil {
|
|
logger.Printf("[WARN] Failed to save expected_backup_missed for %s: %v", id, err)
|
|
} else if onEvent != nil {
|
|
onEvent(id, "expected_backup_missed", "error", msg, "{}", "hub")
|
|
}
|
|
backupMissed++
|
|
}
|
|
|
|
// Check db_dump_completed / db_dump_failed since midnight
|
|
dumpOK, _ := s.GetEventsByType(id, "db_dump_completed", sinceUTC)
|
|
dumpFailed, _ := s.GetEventsByType(id, "db_dump_failed", sinceUTC)
|
|
if len(dumpOK) == 0 && len(dumpFailed) == 0 {
|
|
msg := "No DB dump completed or failed since midnight"
|
|
if _, err := s.SaveEvent(id, "expected_dbdump_missed", "error", msg, "{}", "hub"); err != nil {
|
|
logger.Printf("[WARN] Failed to save expected_dbdump_missed for %s: %v", id, err)
|
|
} else if onEvent != nil {
|
|
onEvent(id, "expected_dbdump_missed", "error", msg, "{}", "hub")
|
|
}
|
|
dbdumpMissed++
|
|
}
|
|
}
|
|
|
|
logger.Printf("[INFO] Deadline check: %d customers, %d backup missed, %d dbdump missed, %d skipped (down)",
|
|
len(customerIDs), backupMissed, dbdumpMissed, skipped)
|
|
}
|