feat(hub): host-domain ingest — tables + /host-report + per-host auth + host dead-man's-switch (v0.7.0, slice 3)

Purely additive; the controller path (reports/customer_configs/checkAuthCustomer/
existing checkers) is untouched. Cutover remains slice 10.

- store: new hosts/guests/host_reports tables (full schema incl. columns INERT
  until slice 10, so no later ALTER); GetHostByAPIKey/GetHost/ListHosts/UpsertHost/
  SaveHostReport/UpsertGuestFromReport (preserves inert cols)/GetHostStaleness/
  GuestID; Prune also prunes host_reports.
- api: checkAuthHost (sibling of checkAuthCustomer); POST /host-report (per-host
  Bearer, 4MiB, denorm + guest upsert, control envelope); POST /admin/hosts
  (PROVISIONAL global-key host mint); host_* event types registered.
- monitor: HostStalenessChecker sibling over host_reports (host_stale/down/
  recovered), wired on the existing 60s ticker; controller checkers unchanged.
- tests (hermetic): store intent/inert-column preservation, auth, ingest
  (envelope+denorm, mismatch/unknown/blocked/oversize), admin mint round-trip,
  host staleness transitions.

CHANGELOG v0.7.0. Contract matches the agent host-report spec field-for-field.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 16:36:16 +02:00
parent 0d832def7b
commit 7c0c75457f
12 changed files with 1204 additions and 38 deletions
+229 -9
View File
@@ -89,6 +89,30 @@ func (h *Handler) checkAuthCustomer(r *http.Request) (customerID string, isGloba
return cfg.CustomerID, false, true
}
// checkAuthHost resolves a Bearer token to a HOST identity (the agent's auth
// path). It is a sibling of checkAuthCustomer — the controller path is unchanged.
// - global key -> ("", "", true, true) caller trusts body.host_id
// - per-host key -> (hostID, customerID, false, true)
// - failure -> ("", "", false, false)
func (h *Handler) checkAuthHost(r *http.Request) (hostID, customerID string, isGlobal, ok bool) {
auth := r.Header.Get("Authorization")
if !strings.HasPrefix(auth, "Bearer ") {
return "", "", false, false
}
token := strings.TrimPrefix(auth, "Bearer ")
// Global key first (same constant-time compare as checkAuthCustomer).
if h.apiKey != "" && subtle.ConstantTimeCompare([]byte(token), []byte(h.apiKey)) == 1 {
return "", "", true, true
}
host, err := h.store.GetHostByAPIKey(token)
if err != nil || host == nil {
return "", "", false, false
}
return host.HostID, host.CustomerID, false, true
}
// ServeHTTP routes API requests.
func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
path := strings.TrimPrefix(r.URL.Path, "/api/v1")
@@ -96,6 +120,10 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodPost && path == "/report":
h.handleReport(w, r)
case r.Method == http.MethodPost && path == "/host-report":
h.handleHostReport(w, r)
case r.Method == http.MethodPost && path == "/admin/hosts":
h.handleAdminCreateHost(w, r)
case r.Method == http.MethodPost && path == "/event":
h.handleEvent(w, r)
case r.Method == http.MethodPost && path == "/notify":
@@ -194,6 +222,194 @@ func (h *Handler) handleReport(w http.ResponseWriter, r *http.Request) {
json.NewEncoder(w).Encode(resp)
}
// defaultHostPollSeconds is the cadence the hub hands every agent this slice (no
// per-host override UI yet — that is a later slice).
const defaultHostPollSeconds = 900
// hostReportPayload is the subset of the agent host-report (slice-3 contract,
// §3 / agent spec §4) the hub needs for denorm + guest reality. Unknown fields
// (storage_targets/backups/restore_tests/pbs_snapshots/audit_tail) are ignored,
// so an empty or absent collection is accepted without error.
type hostReportPayload struct {
HostID string `json:"host_id"`
AgentVersion string `json:"agent_version"`
Host struct {
CPUPercent float64 `json:"cpu_percent"`
MemoryPercent float64 `json:"memory_percent"`
DiskPercent float64 `json:"disk_percent"`
} `json:"host"`
Guests []struct {
VMID int `json:"vmid"`
Name string `json:"name"`
Status string `json:"status"`
ControllerVersion string `json:"controller_version"`
} `json:"guests"`
Cloudflared struct {
Status string `json:"status"`
} `json:"cloudflared"`
}
// handleHostReport ingests the agent's host-report (the heartbeat) and returns the
// control envelope (agent spec §5).
func (h *Handler) handleHostReport(w http.ResponseWriter, r *http.Request) {
hostID, custID, isGlobal, ok := h.checkAuthHost(r)
if !ok {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
return
}
// 4 MiB: host reports carry the full guest list + future storage/backup arrays;
// the controller path's 1 MiB is too tight here.
body, err := io.ReadAll(io.LimitReader(r.Body, 4<<20))
if err != nil {
http.Error(w, "Bad request", http.StatusBadRequest)
return
}
var rep hostReportPayload
if err := json.Unmarshal(body, &rep); err != nil || rep.HostID == "" {
http.Error(w, "Invalid payload: host_id required", http.StatusBadRequest)
return
}
if isGlobal {
// Global-key bootstrap: trust body.host_id but require the host to exist
// (it must be minted first) and resolve its customer from the row.
host, err := h.store.GetHost(rep.HostID)
if err != nil {
h.logger.Printf("[ERROR] host lookup failed for %s: %v", rep.HostID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
if host == nil {
http.Error(w, "Unknown host_id (mint via /admin/hosts first)", http.StatusBadRequest)
return
}
hostID, custID = rep.HostID, host.CustomerID
} else if rep.HostID != hostID {
http.Error(w, "Forbidden: host_id mismatch", http.StatusForbidden)
return
}
running := 0
for _, g := range rep.Guests {
if g.Status == "running" {
running++
}
}
denorm := store.HostReportDenorm{
AgentVersion: rep.AgentVersion,
CPUPercent: rep.Host.CPUPercent,
MemoryPercent: rep.Host.MemoryPercent,
DiskPercent: rep.Host.DiskPercent,
GuestTotal: len(rep.Guests),
GuestRunning: running,
CloudflaredStatus: rep.Cloudflared.Status,
}
if err := h.store.SaveHostReport(hostID, custID, body, denorm); err != nil {
h.logger.Printf("[ERROR] Failed to save host-report from %s: %v", hostID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
for _, g := range rep.Guests {
status := g.Status
if status == "" {
status = "unknown"
}
guest := &store.Guest{
GuestID: store.GuestID(hostID, g.VMID),
CustomerID: custID,
HostID: hostID,
VMID: g.VMID,
DisplayName: g.Name,
Status: status,
ControllerVersion: g.ControllerVersion,
}
if err := h.store.UpsertGuestFromReport(guest); err != nil {
// A guest upsert failure must not drop the whole report (liveness).
h.logger.Printf("[WARN] Failed to upsert guest %s: %v", guest.GuestID, err)
}
}
h.logger.Printf("[INFO] host-report from %s (%d guests, %d bytes)", hostID, len(rep.Guests), len(body))
blocked := false
if cc, err := h.store.GetCustomerConfig(custID); err == nil && cc != nil && cc.Status == "blocked" {
blocked = true
}
resp := map[string]interface{}{
"status": "ok",
"poll_interval_seconds": defaultHostPollSeconds,
"blocked": blocked,
"desired_generation": 0, // reserved (slice 4)
"has_signed_ops": false, // reserved (slice 4)
}
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(resp)
}
// handleAdminCreateHost mints a host identity (host_id + per-host api_key).
//
// PROVISIONAL (slice-3 bootstrap): global-key only, so the demo agent can
// authenticate before enrollment (slices 78) exists. Enrollment will mint host
// identity + pin signing keys; this endpoint should be removed/locked down then
// (tracked under doc 05 §11 auth-tightening at cutover).
func (h *Handler) handleAdminCreateHost(w http.ResponseWriter, r *http.Request) {
_, _, isGlobal, ok := h.checkAuthHost(r)
if !ok || !isGlobal {
http.Error(w, "Forbidden: global key required", http.StatusForbidden)
return
}
body, err := io.ReadAll(io.LimitReader(r.Body, 1<<20))
if err != nil {
http.Error(w, "Bad request", http.StatusBadRequest)
return
}
var req struct {
CustomerID string `json:"customer_id"`
HostID string `json:"host_id"`
DisplayName string `json:"display_name"`
}
if err := json.Unmarshal(body, &req); err != nil || req.CustomerID == "" {
http.Error(w, "Invalid payload: customer_id required", http.StatusBadRequest)
return
}
cc, err := h.store.GetCustomerConfig(req.CustomerID)
if err != nil {
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
if cc == nil {
http.Error(w, "Unknown customer_id", http.StatusBadRequest)
return
}
hostID := req.HostID
if hostID == "" {
sfx, err := configgen.RandomHex(3) // 6 hex chars — human-legible for the demo
if err != nil {
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
hostID = req.CustomerID + "-" + sfx
}
apiKey, err := configgen.RandomHex(32)
if err != nil {
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
if err := h.store.UpsertHost(&store.Host{HostID: hostID, CustomerID: req.CustomerID, APIKey: apiKey}); err != nil {
h.logger.Printf("[ERROR] Failed to mint host for %s: %v", req.CustomerID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
h.logger.Printf("[INFO] provisional host mint: %s (customer %s)", hostID, req.CustomerID)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusCreated)
json.NewEncoder(w).Encode(map[string]string{"host_id": hostID, "api_key": apiKey})
}
// allowedEventTypes lists all valid event_type values the Hub accepts.
var allowedEventTypes = map[string]bool{
// Controller-pushed events
@@ -219,11 +435,15 @@ var allowedEventTypes = map[string]bool{
"disaster_recovery_started": true,
"disaster_recovery_completed": true,
// Hub-generated events
"node_stale": true,
"node_down": true,
"node_recovered": true,
"expected_backup_missed": true,
"expected_dbdump_missed": true,
"node_stale": true,
"node_down": true,
"node_recovered": true,
// Hub-generated host-domain events (v0.7.0, slice 3)
"host_stale": true,
"host_down": true,
"host_recovered": true,
"expected_backup_missed": true,
"expected_dbdump_missed": true,
// Special
"test": true,
}
@@ -686,10 +906,10 @@ func (h *Handler) handleRecovery(w http.ResponseWriter, r *http.Request, custome
}
resp := struct {
CustomerID string `json:"customer_id"`
ConfigYAML string `json:"config_yaml"`
InfraBackup json.RawMessage `json:"infra_backup"`
HasInfraBackup bool `json:"has_infra_backup"`
CustomerID string `json:"customer_id"`
ConfigYAML string `json:"config_yaml"`
InfraBackup json.RawMessage `json:"infra_backup"`
HasInfraBackup bool `json:"has_infra_backup"`
BackupVersions []store.InfraBackupVersion `json:"backup_versions,omitempty"`
}{
CustomerID: customerID,