feat(hub): host-domain ingest — tables + /host-report + per-host auth + host dead-man's-switch (v0.7.0, slice 3)

Purely additive; the controller path (reports/customer_configs/checkAuthCustomer/
existing checkers) is untouched. Cutover remains slice 10.

- store: new hosts/guests/host_reports tables (full schema incl. columns INERT
  until slice 10, so no later ALTER); GetHostByAPIKey/GetHost/ListHosts/UpsertHost/
  SaveHostReport/UpsertGuestFromReport (preserves inert cols)/GetHostStaleness/
  GuestID; Prune also prunes host_reports.
- api: checkAuthHost (sibling of checkAuthCustomer); POST /host-report (per-host
  Bearer, 4MiB, denorm + guest upsert, control envelope); POST /admin/hosts
  (PROVISIONAL global-key host mint); host_* event types registered.
- monitor: HostStalenessChecker sibling over host_reports (host_stale/down/
  recovered), wired on the existing 60s ticker; controller checkers unchanged.
- tests (hermetic): store intent/inert-column preservation, auth, ingest
  (envelope+denorm, mismatch/unknown/blocked/oversize), admin mint round-trip,
  host staleness transitions.

CHANGELOG v0.7.0. Contract matches the agent host-report spec field-for-field.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 16:36:16 +02:00
parent 0d832def7b
commit 7c0c75457f
12 changed files with 1204 additions and 38 deletions
+176
View File
@@ -0,0 +1,176 @@
package monitor
import (
"log"
"sync"
"time"
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
)
// HostStalenessChecker is the host-domain dead-man's-switch (v0.7.0, slice 3). It
// is a deliberate SIBLING of StalenessChecker, not a rename: during slices 3–9 the
// controller report stream (reports) and the agent host-report stream
// (host_reports) are both live, so both checkers run. It keys on host↔host_reports
// and emits host_stale / host_down / host_recovered. Merging is a slice-10 job.
//
// Events are attributed to the host's CUSTOMER (SaveEvent + onEvent take the
// customer_id) so the existing per-customer notification/event UX picks them up
// unchanged.
//
// Check and GetState take mu before touching the maps below; the constructor
// fills them before the checker is returned to the caller.
type HostStalenessChecker struct {
	store         *store.Store         // source of host staleness rows, blocked-customer lookups and event persistence
	threshold     time.Duration        // "stale" after this (default 30m — same as the controller checker)
	downAfter     time.Duration        // "down" after this (2x threshold)
	logger        *log.Logger          // receives [INFO]/[WARN] lines from seeding, Check and event emission
	onEvent       EventNotifyFunc      // optional callback invoked after an event has been saved; may be nil
	mu            sync.Mutex           // serializes Check and GetState
	states        map[string]string    // hostID → "ok" | "stale" | "down"
	customerOf    map[string]string    // hostID → customerID (for event attribution)
	downtimeStart map[string]time.Time // hostID → when it first became unreachable
}
// NewHostStalenessChecker creates the checker and seeds per-host state from the
// current host-report recency returned by the store. No events are generated
// during initialization.
//
// threshold is the report age after which a host counts as "stale"; a host is
// "down" once its age exceeds twice that. onEvent may be nil. logger must be
// non-nil: it is used both here and by later checks.
//
// If the store cannot be read, the failure is logged and an unseeded checker is
// returned. Hosts whose customer is blocked are not seeded.
//
// Hosts that are already stale or down at startup get an estimated outage start
// (last report + threshold) so that a later host_recovered event reports how
// long the host was actually unreachable, rather than the age of the fresh
// report that ended the outage.
func NewHostStalenessChecker(s *store.Store, threshold time.Duration, onEvent EventNotifyFunc, logger *log.Logger) *HostStalenessChecker {
	sc := &HostStalenessChecker{
		store:         s,
		threshold:     threshold,
		downAfter:     2 * threshold,
		logger:        logger,
		onEvent:       onEvent,
		states:        make(map[string]string),
		customerOf:    make(map[string]string),
		downtimeStart: make(map[string]time.Time),
	}

	rows, err := s.GetHostStaleness()
	if err != nil {
		logger.Printf("[WARN] Host staleness checker: failed to seed states: %v", err)
		return sc
	}

	var okCount, staleCount, downCount int
	for _, row := range rows {
		if s.IsCustomerBlocked(row.CustomerID) {
			continue
		}
		sc.customerOf[row.HostID] = row.CustomerID

		age := time.Since(row.LastReportAt)
		state := "ok"
		switch {
		case age > sc.downAfter:
			state = "down"
			downCount++
		case age > sc.threshold:
			state = "stale"
			staleCount++
		default:
			okCount++
		}
		sc.states[row.HostID] = state

		// The host is already in an outage: remember when it crossed the stale
		// threshold. A zero timestamp carries no usable information, so it is
		// left unrecorded and recovery falls back to the report age.
		if state != "ok" && !row.LastReportAt.IsZero() {
			sc.downtimeStart[row.HostID] = row.LastReportAt.Add(sc.threshold)
		}
	}

	logger.Printf("[INFO] Host staleness checker initialized: %d ok, %d stale, %d down", okCount, staleCount, downCount)
	return sc
}
// Check evaluates all hosts and emits events on state transitions. Call every 60s.
//
// For each host row from the store it derives "ok" / "stale" / "down" from the
// age of the last report and compares it with the remembered state:
//   - the first observation of a host only records the state (no event);
//   - an unchanged state does nothing;
//   - any other change is handed to emitTransition.
//
// Hosts of blocked customers are forgotten entirely, so nothing fires while the
// customer is blocked and the first check after unblocking counts as a first
// observation. Hosts no longer returned by the store are dropped from all maps.
//
// The outage start is recorded whenever a host enters a non-ok state from "ok"
// or from "unknown" (not only on ok→stale), estimated as last report +
// threshold, so host_recovered always reports the outage length.
//
// The mutex is held for the whole pass, including event persistence and the
// onEvent callback; the callback must therefore not call back into Check or
// GetState on the same checker.
func (sc *HostStalenessChecker) Check() {
	rows, err := sc.store.GetHostStaleness()
	if err != nil {
		sc.logger.Printf("[WARN] Host staleness check failed: %v", err)
		return
	}

	sc.mu.Lock()
	defer sc.mu.Unlock()

	seen := make(map[string]bool, len(rows))
	for _, row := range rows {
		seen[row.HostID] = true

		if sc.store.IsCustomerBlocked(row.CustomerID) {
			// Drop the outage bookkeeping together with the state; otherwise a
			// pre-block outage start would leak into a post-unblock recovery.
			delete(sc.states, row.HostID)
			delete(sc.downtimeStart, row.HostID)
			continue
		}
		sc.customerOf[row.HostID] = row.CustomerID

		age := time.Since(row.LastReportAt)
		var newState string
		switch {
		case age > sc.downAfter:
			newState = "down"
		case age > sc.threshold:
			newState = "stale"
		default:
			newState = "ok"
		}

		oldState := sc.states[row.HostID]
		if oldState == newState {
			continue
		}
		sc.states[row.HostID] = newState

		// Entering an outage (from ok, or first seen already stale/down):
		// remember when the host crossed the stale threshold. A zero timestamp
		// is left unrecorded so recovery falls back to the report age.
		if newState != "ok" && (oldState == "" || oldState == "ok") && !row.LastReportAt.IsZero() {
			sc.downtimeStart[row.HostID] = row.LastReportAt.Add(sc.threshold)
		}

		if oldState == "" {
			continue // first observation — no event
		}

		// For stale/down the reported duration is the report age; for a
		// recovery it is the length of the outage that just ended.
		downtimeDur := age
		if newState == "ok" {
			if t, ok := sc.downtimeStart[row.HostID]; ok {
				downtimeDur = time.Since(t)
			}
			delete(sc.downtimeStart, row.HostID)
		}

		sc.emitTransition(row.HostID, row.CustomerID, oldState, newState, downtimeDur)
	}

	// Forget hosts that disappeared from the store. customerOf is a superset of
	// the other two maps' keys, so ranging over it prunes all three.
	for id := range sc.customerOf {
		if !seen[id] {
			delete(sc.states, id)
			delete(sc.downtimeStart, id)
			delete(sc.customerOf, id)
		}
	}
}
// GetState reports the last recorded staleness state of hostID: "ok", "stale"
// or "down". A host the checker has no state for yields "unknown".
func (sc *HostStalenessChecker) GetState(hostID string) string {
	sc.mu.Lock()
	state, known := sc.states[hostID]
	sc.mu.Unlock()

	if !known || state == "" {
		return "unknown"
	}
	return state
}
// emitTransition turns a state change into a host_* event, persists it under
// the host's customer and then notifies onEvent (if set).
//
// Mapping:
//   - anything → stale:   host_stale     (warning)
//   - anything → down:    host_down      (error)
//   - stale/down → ok:    host_recovered (info)
//
// Any other combination is ignored. age is the duration rendered into the
// message. If the event cannot be saved, the failure is logged and onEvent is
// not called.
func (sc *HostStalenessChecker) emitTransition(hostID, customerID, oldState, newState string, age time.Duration) {
	const (
		details = "{}"
		source  = "hub"
	)

	var eventType, severity, message string
	switch newState {
	case "stale":
		eventType, severity = "host_stale", "warning"
		message = "Host " + hostID + ": no report for " + formatDuration(age)
	case "down":
		eventType, severity = "host_down", "error"
		message = "Host " + hostID + ": no report for " + formatDuration(age)
	case "ok":
		if oldState != "stale" && oldState != "down" {
			return
		}
		eventType, severity = "host_recovered", "info"
		message = "Host " + hostID + ": reports resumed (was " + oldState + " for " + formatDuration(age) + ")"
	default:
		return
	}

	sc.logger.Printf("[INFO] Host staleness: %s %s → %s (%s)", hostID, oldState, newState, eventType)

	_, err := sc.store.SaveEvent(customerID, eventType, severity, message, details, source)
	if err != nil {
		sc.logger.Printf("[WARN] Failed to save host staleness event for %s: %v", hostID, err)
		return
	}

	if sc.onEvent == nil {
		return
	}
	sc.onEvent(customerID, eventType, severity, message, details, source)
}
@@ -0,0 +1,88 @@
package monitor
import (
"database/sql"
"fmt"
"io"
"log"
"path/filepath"
"testing"
"time"
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
_ "modernc.org/sqlite"
)
// backdate rewrites last_report_at of the given host in the hosts table to
// minutesAgo minutes before the database's current time, so a test can simulate
// elapsed time without sleeping. It runs on the caller-supplied connection and
// fails the test immediately if the UPDATE returns an error.
func backdate(t *testing.T, db *sql.DB, hostID string, minutesAgo int) {
	t.Helper()

	const stmt = `UPDATE hosts SET last_report_at = datetime('now', ?) WHERE host_id = ?`
	offset := fmt.Sprintf("-%d minutes", minutesAgo)

	_, err := db.Exec(stmt, offset, hostID)
	if err != nil {
		t.Fatal(err)
	}
}
// TestHostStalenessChecker walks one host through the full state machine:
// seeded stale (no event) → unchanged (no event) → recovered → stale → down,
// checking both the emitted event type and the state reported by GetState.
//
// Time is simulated by rewriting last_report_at through a second connection to
// the same SQLite file; the checker itself reads through the store.
func TestHostStalenessChecker(t *testing.T) {
	path := filepath.Join(t.TempDir(), "test.db")
	st, err := store.New(path, log.New(io.Discard, "", 0))
	if err != nil {
		t.Fatal(err)
	}
	defer st.Close()

	// A failure here (e.g. driver not registered) would otherwise surface later
	// as a confusing error from backdate.
	db, err := sql.Open("sqlite", path)
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	// Fixture: one customer, one host, one report (which sets last_report_at).
	st.SaveCustomerConfig(&store.CustomerConfig{CustomerID: "c1", APIKey: "ck", RetrievalPassword: "p"})
	st.UpsertHost(&store.Host{HostID: "h1", CustomerID: "c1", APIKey: "k1"})
	st.SaveHostReport("h1", "c1", []byte(`{}`), store.HostReportDenorm{}) // sets last_report_at

	var events []string
	onEvent := func(customerID, eventType, severity, message, detailsJSON, source string) {
		events = append(events, eventType)
	}

	// Seed already-stale (40m) → state stale, but NO event on init.
	backdate(t, db, "h1", 40)
	sc := NewHostStalenessChecker(st, 30*time.Minute, onEvent, log.New(io.Discard, "", 0))
	if len(events) != 0 {
		t.Fatalf("seed must not emit events, got %v", events)
	}
	if sc.GetState("h1") != "stale" {
		t.Fatalf("seeded state = %q, want stale", sc.GetState("h1"))
	}

	// Same age → no transition.
	sc.Check()
	if len(events) != 0 {
		t.Fatalf("no transition expected, got %v", events)
	}

	// Fresh report → host_recovered.
	backdate(t, db, "h1", 2)
	sc.Check()
	if last(events) != "host_recovered" {
		t.Fatalf("events = %v, want last host_recovered", events)
	}
	if got := sc.GetState("h1"); got != "ok" {
		t.Fatalf("state after recovery = %q, want ok", got)
	}

	// Aged to stale → host_stale.
	backdate(t, db, "h1", 40)
	sc.Check()
	if last(events) != "host_stale" {
		t.Fatalf("events = %v, want last host_stale", events)
	}
	if got := sc.GetState("h1"); got != "stale" {
		t.Fatalf("state after aging = %q, want stale", got)
	}

	// Aged past 2× → host_down.
	backdate(t, db, "h1", 130)
	sc.Check()
	if last(events) != "host_down" {
		t.Fatalf("events = %v, want last host_down", events)
	}
	if got := sc.GetState("h1"); got != "down" {
		t.Fatalf("state after 2x threshold = %q, want down", got)
	}
}
// last returns the final element of s, or the empty string when s has no
// elements (nil or empty).
func last(s []string) string {
	if n := len(s); n > 0 {
		return s[n-1]
	}
	return ""
}