feat(hub): host-domain ingest — tables + /host-report + per-host auth + host dead-man's-switch (v0.7.0, slice 3)

Purely additive; the controller path (reports/customer_configs/checkAuthCustomer/
existing checkers) is untouched. Cutover remains slice 10.

- store: new hosts/guests/host_reports tables (full schema incl. columns INERT
  until slice 10, so no later ALTER); GetHostByAPIKey/GetHost/ListHosts/UpsertHost/
  SaveHostReport/UpsertGuestFromReport (preserves inert cols)/GetHostStaleness/
  GuestID; Prune also prunes host_reports.
- api: checkAuthHost (sibling of checkAuthCustomer); POST /host-report (per-host
  Bearer, 4MiB, denorm + guest upsert, control envelope); POST /admin/hosts
  (PROVISIONAL global-key host mint); host_* event types registered.
- monitor: HostStalenessChecker sibling over host_reports (host_stale/down/
  recovered), wired on the existing 60s ticker; controller checkers unchanged.
- tests (hermetic): store intent/inert-column preservation, auth, ingest
  (envelope+denorm, mismatch/unknown/blocked/oversize), admin mint round-trip,
  host staleness transitions.

CHANGELOG v0.7.0. Contract matches the agent host-report spec field-for-field.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 16:36:16 +02:00
parent 0d832def7b
commit 7c0c75457f
12 changed files with 1204 additions and 38 deletions
+229 -9
View File
@@ -89,6 +89,30 @@ func (h *Handler) checkAuthCustomer(r *http.Request) (customerID string, isGloba
return cfg.CustomerID, false, true
}
// checkAuthHost resolves a Bearer token to a HOST identity (the agent's auth
// path). It is a sibling of checkAuthCustomer — the controller path is unchanged.
// - global key -> ("", "", true, true) caller trusts body.host_id
// - per-host key -> (hostID, customerID, false, true)
// - failure -> ("", "", false, false)
func (h *Handler) checkAuthHost(r *http.Request) (hostID, customerID string, isGlobal, ok bool) {
auth := r.Header.Get("Authorization")
if !strings.HasPrefix(auth, "Bearer ") {
return "", "", false, false
}
token := strings.TrimPrefix(auth, "Bearer ")
// Global key first (same constant-time compare as checkAuthCustomer).
if h.apiKey != "" && subtle.ConstantTimeCompare([]byte(token), []byte(h.apiKey)) == 1 {
return "", "", true, true
}
host, err := h.store.GetHostByAPIKey(token)
if err != nil || host == nil {
return "", "", false, false
}
return host.HostID, host.CustomerID, false, true
}
// ServeHTTP routes API requests.
func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
path := strings.TrimPrefix(r.URL.Path, "/api/v1")
@@ -96,6 +120,10 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
switch {
case r.Method == http.MethodPost && path == "/report":
h.handleReport(w, r)
case r.Method == http.MethodPost && path == "/host-report":
h.handleHostReport(w, r)
case r.Method == http.MethodPost && path == "/admin/hosts":
h.handleAdminCreateHost(w, r)
case r.Method == http.MethodPost && path == "/event":
h.handleEvent(w, r)
case r.Method == http.MethodPost && path == "/notify":
@@ -194,6 +222,194 @@ func (h *Handler) handleReport(w http.ResponseWriter, r *http.Request) {
json.NewEncoder(w).Encode(resp)
}
// defaultHostPollSeconds is the cadence the hub hands every agent this slice (no
// per-host override UI yet — that is a later slice).
const defaultHostPollSeconds = 900
// hostReportPayload is the subset of the agent host-report (slice-3 contract,
// §3 / agent spec §4) the hub needs for denorm + guest reality. Unknown fields
// (storage_targets/backups/restore_tests/pbs_snapshots/audit_tail) are ignored,
// so an empty or absent collection is accepted without error.
type hostReportPayload struct {
HostID string `json:"host_id"`
AgentVersion string `json:"agent_version"`
Host struct {
CPUPercent float64 `json:"cpu_percent"`
MemoryPercent float64 `json:"memory_percent"`
DiskPercent float64 `json:"disk_percent"`
} `json:"host"`
Guests []struct {
VMID int `json:"vmid"`
Name string `json:"name"`
Status string `json:"status"`
ControllerVersion string `json:"controller_version"`
} `json:"guests"`
Cloudflared struct {
Status string `json:"status"`
} `json:"cloudflared"`
}
// handleHostReport ingests the agent's host-report (the heartbeat) and returns the
// control envelope (agent spec §5).
func (h *Handler) handleHostReport(w http.ResponseWriter, r *http.Request) {
hostID, custID, isGlobal, ok := h.checkAuthHost(r)
if !ok {
http.Error(w, "Unauthorized", http.StatusUnauthorized)
return
}
// 4 MiB: host reports carry the full guest list + future storage/backup arrays;
// the controller path's 1 MiB is too tight here.
body, err := io.ReadAll(io.LimitReader(r.Body, 4<<20))
if err != nil {
http.Error(w, "Bad request", http.StatusBadRequest)
return
}
var rep hostReportPayload
if err := json.Unmarshal(body, &rep); err != nil || rep.HostID == "" {
http.Error(w, "Invalid payload: host_id required", http.StatusBadRequest)
return
}
if isGlobal {
// Global-key bootstrap: trust body.host_id but require the host to exist
// (it must be minted first) and resolve its customer from the row.
host, err := h.store.GetHost(rep.HostID)
if err != nil {
h.logger.Printf("[ERROR] host lookup failed for %s: %v", rep.HostID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
if host == nil {
http.Error(w, "Unknown host_id (mint via /admin/hosts first)", http.StatusBadRequest)
return
}
hostID, custID = rep.HostID, host.CustomerID
} else if rep.HostID != hostID {
http.Error(w, "Forbidden: host_id mismatch", http.StatusForbidden)
return
}
running := 0
for _, g := range rep.Guests {
if g.Status == "running" {
running++
}
}
denorm := store.HostReportDenorm{
AgentVersion: rep.AgentVersion,
CPUPercent: rep.Host.CPUPercent,
MemoryPercent: rep.Host.MemoryPercent,
DiskPercent: rep.Host.DiskPercent,
GuestTotal: len(rep.Guests),
GuestRunning: running,
CloudflaredStatus: rep.Cloudflared.Status,
}
if err := h.store.SaveHostReport(hostID, custID, body, denorm); err != nil {
h.logger.Printf("[ERROR] Failed to save host-report from %s: %v", hostID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
for _, g := range rep.Guests {
status := g.Status
if status == "" {
status = "unknown"
}
guest := &store.Guest{
GuestID: store.GuestID(hostID, g.VMID),
CustomerID: custID,
HostID: hostID,
VMID: g.VMID,
DisplayName: g.Name,
Status: status,
ControllerVersion: g.ControllerVersion,
}
if err := h.store.UpsertGuestFromReport(guest); err != nil {
// A guest upsert failure must not drop the whole report (liveness).
h.logger.Printf("[WARN] Failed to upsert guest %s: %v", guest.GuestID, err)
}
}
h.logger.Printf("[INFO] host-report from %s (%d guests, %d bytes)", hostID, len(rep.Guests), len(body))
blocked := false
if cc, err := h.store.GetCustomerConfig(custID); err == nil && cc != nil && cc.Status == "blocked" {
blocked = true
}
resp := map[string]interface{}{
"status": "ok",
"poll_interval_seconds": defaultHostPollSeconds,
"blocked": blocked,
"desired_generation": 0, // reserved (slice 4)
"has_signed_ops": false, // reserved (slice 4)
}
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusOK)
json.NewEncoder(w).Encode(resp)
}
// handleAdminCreateHost mints a host identity (host_id + per-host api_key).
//
// PROVISIONAL (slice-3 bootstrap): global-key only, so the demo agent can
// authenticate before enrollment (slices 78) exists. Enrollment will mint host
// identity + pin signing keys; this endpoint should be removed/locked down then
// (tracked under doc 05 §11 auth-tightening at cutover).
func (h *Handler) handleAdminCreateHost(w http.ResponseWriter, r *http.Request) {
_, _, isGlobal, ok := h.checkAuthHost(r)
if !ok || !isGlobal {
http.Error(w, "Forbidden: global key required", http.StatusForbidden)
return
}
body, err := io.ReadAll(io.LimitReader(r.Body, 1<<20))
if err != nil {
http.Error(w, "Bad request", http.StatusBadRequest)
return
}
var req struct {
CustomerID string `json:"customer_id"`
HostID string `json:"host_id"`
DisplayName string `json:"display_name"`
}
if err := json.Unmarshal(body, &req); err != nil || req.CustomerID == "" {
http.Error(w, "Invalid payload: customer_id required", http.StatusBadRequest)
return
}
cc, err := h.store.GetCustomerConfig(req.CustomerID)
if err != nil {
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
if cc == nil {
http.Error(w, "Unknown customer_id", http.StatusBadRequest)
return
}
hostID := req.HostID
if hostID == "" {
sfx, err := configgen.RandomHex(3) // 6 hex chars — human-legible for the demo
if err != nil {
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
hostID = req.CustomerID + "-" + sfx
}
apiKey, err := configgen.RandomHex(32)
if err != nil {
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
if err := h.store.UpsertHost(&store.Host{HostID: hostID, CustomerID: req.CustomerID, APIKey: apiKey}); err != nil {
h.logger.Printf("[ERROR] Failed to mint host for %s: %v", req.CustomerID, err)
http.Error(w, "Internal error", http.StatusInternalServerError)
return
}
h.logger.Printf("[INFO] provisional host mint: %s (customer %s)", hostID, req.CustomerID)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(http.StatusCreated)
json.NewEncoder(w).Encode(map[string]string{"host_id": hostID, "api_key": apiKey})
}
// allowedEventTypes lists all valid event_type values the Hub accepts.
var allowedEventTypes = map[string]bool{
// Controller-pushed events
@@ -219,11 +435,15 @@ var allowedEventTypes = map[string]bool{
"disaster_recovery_started": true,
"disaster_recovery_completed": true,
// Hub-generated events
"node_stale": true,
"node_down": true,
"node_recovered": true,
"expected_backup_missed": true,
"expected_dbdump_missed": true,
"node_stale": true,
"node_down": true,
"node_recovered": true,
// Hub-generated host-domain events (v0.7.0, slice 3)
"host_stale": true,
"host_down": true,
"host_recovered": true,
"expected_backup_missed": true,
"expected_dbdump_missed": true,
// Special
"test": true,
}
@@ -686,10 +906,10 @@ func (h *Handler) handleRecovery(w http.ResponseWriter, r *http.Request, custome
}
resp := struct {
CustomerID string `json:"customer_id"`
ConfigYAML string `json:"config_yaml"`
InfraBackup json.RawMessage `json:"infra_backup"`
HasInfraBackup bool `json:"has_infra_backup"`
CustomerID string `json:"customer_id"`
ConfigYAML string `json:"config_yaml"`
InfraBackup json.RawMessage `json:"infra_backup"`
HasInfraBackup bool `json:"has_infra_backup"`
BackupVersions []store.InfraBackupVersion `json:"backup_versions,omitempty"`
}{
CustomerID: customerID,
+190
View File
@@ -0,0 +1,190 @@
package api
import (
"database/sql"
"encoding/json"
"io"
"log"
"net/http"
"net/http/httptest"
"path/filepath"
"strings"
"testing"
"gitea.dooplex.hu/admin/felhom-hub/internal/store"
_ "modernc.org/sqlite"
)
const globalKey = "GLOBALKEY"
func newTestHandler(t *testing.T) (*Handler, *store.Store, string) {
t.Helper()
path := filepath.Join(t.TempDir(), "test.db")
st, err := store.New(path, log.New(io.Discard, "", 0))
if err != nil {
t.Fatalf("store.New: %v", err)
}
t.Cleanup(func() { st.Close() })
h := New(st, globalKey, "", "", nil, log.New(io.Discard, "", 0))
return h, st, path
}
func do(h *Handler, method, path, bearer, body string) *httptest.ResponseRecorder {
req := httptest.NewRequest(method, "/api/v1"+path, strings.NewReader(body))
if bearer != "" {
req.Header.Set("Authorization", "Bearer "+bearer)
}
rr := httptest.NewRecorder()
h.ServeHTTP(rr, req)
return rr
}
func TestCheckAuthHost(t *testing.T) {
h, st, _ := newTestHandler(t)
st.UpsertHost(&store.Host{HostID: "h1", CustomerID: "c1", APIKey: "HKEY"})
mk := func(bearer string) *http.Request {
req := httptest.NewRequest(http.MethodPost, "/api/v1/host-report", nil)
if bearer != "" {
req.Header.Set("Authorization", "Bearer "+bearer)
}
return req
}
if _, _, isGlobal, ok := h.checkAuthHost(mk(globalKey)); !ok || !isGlobal {
t.Error("global key should resolve isGlobal=true")
}
hostID, custID, isGlobal, ok := h.checkAuthHost(mk("HKEY"))
if !ok || isGlobal || hostID != "h1" || custID != "c1" {
t.Errorf("per-host key = %q/%q global=%v ok=%v", hostID, custID, isGlobal, ok)
}
if _, _, _, ok := h.checkAuthHost(mk("bogus")); ok {
t.Error("unknown key should fail")
}
}
func validReportBody(hostID string) string {
return `{"host_id":"` + hostID + `","agent_version":"0.3.0",` +
`"host":{"cpu_percent":3.2,"memory_percent":25,"disk_percent":19,"loadavg":["0.1"],"uptime_seconds":100},` +
`"guests":[{"vmid":100,"name":"acme","status":"running","controller_version":""},` +
`{"vmid":101,"name":"beta","status":"stopped"}],` +
`"storage_targets":[],"backups":[],"cloudflared":{"status":"active"},"audit_tail":[]}`
}
func TestHandleHostReport_ValidAndEnvelopeAndDenorm(t *testing.T) {
h, st, dbPath := newTestHandler(t)
st.SaveCustomerConfig(&store.CustomerConfig{CustomerID: "c1", APIKey: "ckey", RetrievalPassword: "p"})
st.UpsertHost(&store.Host{HostID: "h1", CustomerID: "c1", APIKey: "HKEY"})
rr := do(h, http.MethodPost, "/host-report", "HKEY", validReportBody("h1"))
if rr.Code != 200 {
t.Fatalf("status = %d, body=%s", rr.Code, rr.Body.String())
}
var env struct {
Status string `json:"status"`
PollIntervalSeconds int `json:"poll_interval_seconds"`
Blocked bool `json:"blocked"`
DesiredGeneration int `json:"desired_generation"`
HasSignedOps bool `json:"has_signed_ops"`
}
json.Unmarshal(rr.Body.Bytes(), &env)
if env.Status != "ok" || env.PollIntervalSeconds != 900 || env.Blocked || env.DesiredGeneration != 0 || env.HasSignedOps {
t.Errorf("envelope = %+v", env)
}
// Denorm: guest_running counts only "running" (1 of 2). Read via a 2nd connection.
db, _ := sql.Open("sqlite", dbPath)
defer db.Close()
var total, running int
var cf string
db.QueryRow(`SELECT guest_total, guest_running, cloudflared_status FROM host_reports WHERE host_id='h1' ORDER BY id DESC LIMIT 1`).
Scan(&total, &running, &cf)
if total != 2 || running != 1 || cf != "active" {
t.Errorf("denorm total=%d running=%d cloudflared=%q (want 2,1,active)", total, running, cf)
}
// Guests upserted.
var gname, gstatus string
if err := db.QueryRow(`SELECT display_name, status FROM guests WHERE guest_id='h1/100'`).Scan(&gname, &gstatus); err != nil {
t.Fatalf("guest h1/100 not upserted: %v", err)
}
if gname != "acme" || gstatus != "running" {
t.Errorf("guest = %q/%q", gname, gstatus)
}
}
func TestHandleHostReport_HostIDMismatch(t *testing.T) {
h, st, _ := newTestHandler(t)
st.UpsertHost(&store.Host{HostID: "h1", CustomerID: "c1", APIKey: "HKEY"})
rr := do(h, http.MethodPost, "/host-report", "HKEY", validReportBody("other-host"))
if rr.Code != http.StatusForbidden {
t.Errorf("status = %d, want 403", rr.Code)
}
}
func TestHandleHostReport_UnknownHostUnderGlobalKey(t *testing.T) {
h, _, _ := newTestHandler(t)
rr := do(h, http.MethodPost, "/host-report", globalKey, validReportBody("ghost"))
if rr.Code != http.StatusBadRequest {
t.Errorf("status = %d, want 400 (unknown host_id)", rr.Code)
}
}
func TestHandleHostReport_BlockedCustomer(t *testing.T) {
h, st, _ := newTestHandler(t)
st.SaveCustomerConfig(&store.CustomerConfig{CustomerID: "c1", APIKey: "ckey", RetrievalPassword: "p"})
st.SetCustomerConfigStatus("c1", "blocked")
st.UpsertHost(&store.Host{HostID: "h1", CustomerID: "c1", APIKey: "HKEY"})
rr := do(h, http.MethodPost, "/host-report", "HKEY", validReportBody("h1"))
if rr.Code != 200 {
t.Fatalf("status = %d", rr.Code)
}
var env struct {
Blocked bool `json:"blocked"`
}
json.Unmarshal(rr.Body.Bytes(), &env)
if !env.Blocked {
t.Error("blocked customer should yield blocked:true")
}
}
func TestHandleHostReport_OversizeRejected(t *testing.T) {
h, st, _ := newTestHandler(t)
st.UpsertHost(&store.Host{HostID: "h1", CustomerID: "c1", APIKey: "HKEY"})
big := `{"host_id":"h1","guests":[{"vmid":1,"name":"` + strings.Repeat("a", 5<<20) + `"}]}`
rr := do(h, http.MethodPost, "/host-report", "HKEY", big)
if rr.Code != http.StatusBadRequest {
t.Errorf("oversize body status = %d, want 400", rr.Code)
}
}
func TestAdminCreateHost(t *testing.T) {
h, st, _ := newTestHandler(t)
st.SaveCustomerConfig(&store.CustomerConfig{CustomerID: "c1", APIKey: "ckey", RetrievalPassword: "p"})
// non-global key (per-customer) → 403
if rr := do(h, http.MethodPost, "/admin/hosts", "ckey", `{"customer_id":"c1"}`); rr.Code != http.StatusForbidden {
t.Errorf("per-customer key status = %d, want 403", rr.Code)
}
// missing/unknown customer → 400
if rr := do(h, http.MethodPost, "/admin/hosts", globalKey, `{"customer_id":"nope"}`); rr.Code != http.StatusBadRequest {
t.Errorf("unknown customer status = %d, want 400", rr.Code)
}
// success → 201 + usable key (round-trip)
rr := do(h, http.MethodPost, "/admin/hosts", globalKey, `{"customer_id":"c1"}`)
if rr.Code != http.StatusCreated {
t.Fatalf("mint status = %d, body=%s", rr.Code, rr.Body.String())
}
var minted struct {
HostID string `json:"host_id"`
APIKey string `json:"api_key"`
}
json.Unmarshal(rr.Body.Bytes(), &minted)
if minted.HostID == "" || minted.APIKey == "" {
t.Fatalf("mint response = %+v", minted)
}
// the minted key authenticates a host-report
rr2 := do(h, http.MethodPost, "/host-report", minted.APIKey, validReportBody(minted.HostID))
if rr2.Code != 200 {
t.Errorf("round-trip host-report with minted key = %d, body=%s", rr2.Code, rr2.Body.String())
}
}