From 3457415117c43b9b4ec00887a293258873ba9fd7 Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Thu, 11 Jun 2026 09:48:38 +0200 Subject: [PATCH] =?UTF-8?q?slice=2010D=20(hub):=20DR=20capstone=20?= =?UTF-8?q?=E2=80=94=20recovery=20mode=20+=20re-enroll=20+=20directive=20s?= =?UTF-8?q?erving=20(hub=20v0.11.0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recovery-mode toggle (global key, bounded auto-expiry) gates re-enroll + restore-directive serving. Re-enroll rotates the agent<->hub credential to the new box (old key revoked); returns the opaque escrow blobs + non-secret directive. Store gains recovery_mode_until + identity_blob + directive_json. Hub holds no usable secret + no Cloudflare write-power (operator-side rotation). Doc 03 §9: slice 10 CLOSED. Co-Authored-By: Claude Opus 4.8 (1M context) --- REPORT.md | 62 +++---- documentation/architecture/03-host-agent.md | 27 ++- hub/CHANGELOG.md | 34 ++++ hub/internal/api/dr.go | 183 ++++++++++++++++++++ hub/internal/api/dr_test.go | 117 +++++++++++++ hub/internal/api/handler.go | 37 ++++ hub/internal/store/store.go | 107 +++++++++++- 7 files changed, 533 insertions(+), 34 deletions(-) create mode 100644 hub/internal/api/dr.go create mode 100644 hub/internal/api/dr_test.go diff --git a/REPORT.md b/REPORT.md index 5d06772..afe81a3 100644 --- a/REPORT.md +++ b/REPORT.md @@ -4,44 +4,46 @@ --- -# REPORT — Slice 10D core SPIKE: identity-escrow round-trip + tunnel re-establishment (2026-06-10) +# REPORT — Slice 10D (hub half): DR capstone — recovery mode + re-enroll + directive serving (hub v0.11.0) (2026-06-10) ## Type -SPIKE runbook (CC-executed on the demo). Validated the two unvalidated mechanisms under the 10D DR -capstone **before** speccing the orchestration. Deliverable: the redacted findings doc -[`documentation/tests/slice10d-identity-restore-spike-findings.md`](documentation/tests/slice10d-identity-restore-spike-findings.md). 
-Handled crown jewels (R + identity/tunnel tokens) — staged `0600`, by reference, **shredded** at teardown; no secret committed. +TASK (CC-implemented). The hub half of the slice-10 DR capstone (closes slice 10). Pairs with +`felhom-agent` v0.18.0 (identity escrow + restore-mode consumption). -## Results — GO to spec 10D +## What changed (hub) -**S1 — identity-escrow round-trip (age):** the identity bundle `{tunnel_token, pbs_token}` wraps under -an EFF-wordlist `R` via **age (scrypt + ChaCha20-Poly1305 AEAD)**, recovers **byte-identical** on a -secret-less fresh box given only blob + R, and a **wrong R fails closed** (no plaintext). Mirrors the -proven K-escrow → 10D reuses the 10C `Consume` shape for the identity bundle. +The hub ORCHESTRATES recovery but holds **no usable secret and no Cloudflare write-power** — a +compromised hub can at most hand out **opaque** blobs (they need `R`, which the hub never has) + rotate +its own per-host credential. It cannot hijack a customer's tunnel (the destructive rotation is the +operator's job). -**S2 — tunnel re-establishment:** running the recovered Cloudflare tunnel token's connector on a NEW -box → the customer's hostname routes to it **immediately, no DNS change** (the CNAME→tunnel is stable; -only the connector moves). With both connectors up, 14/14 requests served from NEW; stopping NEW fell -back to OLD (6/6) — **the old connector is a hot standby, superseded in routing but NOT auto-retired.** +### API +- **`PUT/DELETE /admin/hosts/{id}/recovery-mode`** (global key) — arm/disable recovery mode with a + bounded TTL (clamped [60s, 4h], default 30m → **auto-expires**). Directive + re-enroll are served + ONLY while active. +- **`POST /hosts/{id}/re-enroll`** — gated ONLY on recovery mode (the lost box has no old key). Rotates + the host's API key to the new box's key (**old box revoked**) + returns the directive + opaque blobs. +- **`GET /hosts/{id}/restore-directive`** (re-enrolled key, recovery-gated) — re-fetch. 
+- The slice-7 escrow upload now also accepts the **identity blob** + **non-secret directive** (additive). -**Load-bearing consequence for 10D:** routing failover is automatic, but the old box's connector + the -(same) tunnel token stay valid → **10D must rotate the tunnel/PBS tokens and/or delete the stale -connector after re-establishment** (host-LOSS security). That needs an **Account Cloudflare-Tunnel --scoped** hub credential (broader than the current WAF-only zone token) — feeds the design-review S4 -CF-token-placement decision. Also: a remotely-managed tunnel uses its **dashboard ingress** (cloudflared -ignores local config), so the new box must run the tunnel's expected origin (the restore orchestration -brings it up). +### Store +- `hosts.recovery_mode_until`; `host_escrow.identity_blob` + `directive_json`. Methods: + `SetRecoveryMode`/`ClearRecoveryMode`, `RotateHostAPIKey`, `SaveHostDRBundle`/`GetHostDRBundle`. -## Safety / teardown +## Tests (green) +- re-enroll refused without recovery mode (403); recovery-arm is global-key-only; re-enroll **rotates + + revokes** (old key→401, new key→200); directive served only in recovery mode + **expires**; clear + disables re-enroll. -Per operator instruction the test used a **new** `dr-spike.demo-felhom.eu` subdomain on the demo's own -(idle — guests down) tunnel; the live `*.demo-felhom.eu` wildcard + all other records were **untouched**, -the tunnel's remote config was **never modified** (the zone API token lacks `cfd_tunnel` permission), and -the throwaway subdomain + both connectors + all secrets were removed/shredded at teardown. The demo -returns to exactly its prior state. +## Docs +- Doc 03 §9 (10D done → **SLICE 10 CLOSED**) + the host-loss DR flow with the **operator-side rotation** + model (hub orchestrates + read-only verifies; the operator deletes the stale connector + rotates the + tunnel/PBS token from a trusted environment). 
-## Out of scope (→ 10D spec) +## Deferred (non-blocking, per the locked model) +- The Config DR/Recovery **web UI** (functional today via the recovery-mode admin API) + a small + operator rotation CLI. **No Cloudflare write-credential is in the hub by design.** -Recovery-mode toggle + re-enroll handshake + cred rotation; identity-escrow creation wired into -provisioning; the restore orchestration (consume → pull → `RestoreLXC` → bring up origin → re-establish). +## Pending +- Build + deploy hub v0.11.0 + agent v0.18.0; run the operator-in-the-loop DR drill (throwaway identity). diff --git a/documentation/architecture/03-host-agent.md b/documentation/architecture/03-host-agent.md index 3befb59..8d4465b 100644 --- a/documentation/architecture/03-host-agent.md +++ b/documentation/architecture/03-host-agent.md @@ -423,7 +423,7 @@ this path — bring up + reattach external storage and it is whole. This is full | **Hub desired-state serving** (the "Down" channel) — store + serve per-host desired-state, bump `desired_generation`, signed-jobs queue + `has_signed_ops`; agent activates the envelope + a hub-backed provider (benign reconciled, destructive gated pending) | **10A** | **implemented** (hub v0.9.0: `PUT /admin/hosts/{id}/desired-state` bumps the generation, `GET /hosts/{id}/desired-state` + `/jobs` self-scoped, `signed_jobs` queue; agent v0.15.0: `ControlEnvelope` fields live, `Client.FetchDesiredState`, `internal/desired` Syncer + `reconcile.CachingProvider` feeding the engine — an explicit guest `decommission` is the destructive delta, gated `pending_signature`). Serves to already-authenticated hosts only; desired-state stored opaquely (agent owns the schema). Cross-repo golden (envelope + desired-state) byte-identical. 
| | **Signed-op execution** (verify + run the gated destructive op) | **10B** | **implemented** (agent v0.16.0: `cmd/felhom-opsign` offline signing CLI + `internal/signedjobs` runner/WipeExecutor + `internal/storage` durable-device resolution; hub v0.10.0: `DELETE /hosts/{id}/jobs/{job_id}` completion). Verify → durable nonce-burn → execute → clear; pinned-key (multi-key rotation, trusted path), host + **durable-id** anti-retarget, 8C re-inspect. Closes the 8C data-bearing-wipe gap. Other destructive executors (guest_destroy, decommission, restore-overwrite → 10D) reuse the same gate+runner machinery. | | **PBS escrow consumption** (recover `K` on a new box) | **10C** | **implemented** (agent v0.17.0: `escrow.Consume` = Unwrap → fingerprint-gate → atomic install; spike-proven crypto + real-data restore productionized; `--selftest=escrow-consume`). Zero-knowledge holds (hub serves all but R). Spike findings: `documentation/tests/slice10-escrow-consumption-spike-findings.md`. The four inputs are sourced from the hub directive in 10D. | -| **Host/hardware loss** DR — re-enroll in "restore mode"; hub serves identity / PBS namespace / tunnel token / storage manifest / restore directive (the `restore_directive` field exists in 10A's desired-state, consumed here) | **10D** | deferred — the DR capstone; consumes 10A serving + 10C escrow consumption + re-enrollment authorization | +| **Host/hardware loss** DR — re-enroll in "restore mode"; hub serves identity / tunnel token / restore directive; consume + restore + re-establish under identity; **operator-side** cred rotation | **10D** | **implemented — SLICE 10 CLOSED** (agent v0.18.0: identity escrow via `age` + `Consume`/identity-consume + restore-mode orchestration; hub v0.11.0: recovery-mode toggle + auto-expiry + re-enroll credential rotation + directive serving). 
Locked rotation model: **hub holds no Cloudflare write-power**; the operator deletes the stale connector + rotates the tunnel/PBS token from a trusted environment. Both 10D mechanisms spike-validated. Deferred (non-blocking): the DR web-UI page + a small operator rotation CLI. | | Golden base refresh cadence + fleet versioning | post-launch | operational, non-blocking (§13) | **Host/hardware loss (design intent — slice 10).** Re-enroll the new host in **restore mode**; @@ -501,6 +501,31 @@ This doc hands the implementation three contracts it was waiting on: ## Changelog — design-review + Phase-3 fold-in (2026-06-08) +### Slice-10D implemented — DR capstone; SLICE 10 CLOSED (2026-06-10) +- The host/hardware-loss DR flow is wired end-to-end, grounded by both 10-series spikes. **Rotation + model (locked): the hub holds no Cloudflare write-power** — it orchestrates recovery (recovery-mode + toggle, directive serving, re-enroll + its OWN agent↔hub credential rotation) and at most read-only + *verifies* connector state; the **destructive tunnel/PBS rotation + stale-connector delete is the + operator's step from a trusted environment** (same spirit as 10B — the operator authorizes/executes + the dangerous op). A compromised hub can only hand out opaque blobs + rotate its own per-host cred. +- **10D.1 identity escrow:** `{tunnel_token, pbs_token}` wrapped under the SAME `R` via `age` (scrypt + + ChaCha20-Poly1305) — a second opaque blob; the K-escrow + 10C `Consume` are untouched. The hub + stores both ciphertext blobs + the **non-secret** directive (pbs repo/ns, expected key fingerprint, + tunnel id). **No usable secret in the hub.** +- **10D.2 recovery mode + re-enroll:** operator-armed **recovery-mode toggle** with bounded + **auto-expiry** gates directive serving + re-enroll. The re-enroll handshake rotates the agent↔hub + credential to the new box's key (**old box's hub access revoked**, hub-internal). 
Re-enroll auth = + recovery-mode toggle + **R** (zero-knowledge for data *and* identity) + **out-of-band phone + validation** (operator protocol) + auto-expiry + rotation. +- **10D.3 restore mode (agent):** receive directive (10A) → prompt for **R** by hand → `Consume` + (K-escrow → K installed, fingerprint-gated; identity-escrow → tunnel/pbs tokens) → restore guests + from PBS (restore-overwrite gated by **10B** on a non-blank target) → re-establish the tunnel (run + the recovered connector + reconstitute the dashboard-expected origin) → host routes as host X. The + destructive cred rotation is then the operator's step. +- §9 slice table: **10D done → SLICE 10 CLOSED**. Status: implemented (agent v0.18.0; hub v0.11.0). + Deferred (non-blocking): the hub Config DR/Recovery **web UI** (functional via the recovery-mode + admin API today) + a small operator rotation CLI (the rotation is a documented operator procedure). + ### Slice-10C implemented — escrow consumption (productionized) (2026-06-10) - §8a: escrow **consumption** is now a real, tested path (`escrow.Consume`): **Unwrap → fingerprint- gate → install**. The throwaway 10C spike harness is gone; the spike's findings are baked in (F-C2 diff --git a/hub/CHANGELOG.md b/hub/CHANGELOG.md index b425600..b7c992c 100644 --- a/hub/CHANGELOG.md +++ b/hub/CHANGELOG.md @@ -1,5 +1,39 @@ # Felhom Hub — Changelog +## v0.11.0 — slice 10D: DR capstone — recovery mode + re-enroll + directive serving (2026-06-10) + +The hub half of the slice-10 DR capstone (closes slice 10). The hub ORCHESTRATES recovery but holds +**no usable secret and no Cloudflare write-power**: the escrow blobs it serves are opaque (need `R`, +which the hub never has), and the destructive tunnel/PBS rotation is the **operator's** step from a +trusted environment. A compromised hub can at most hand out opaque blobs + rotate/revoke its own +per-host credential — it cannot hijack a customer's tunnel. 
+ +### Added +- **`PUT /admin/hosts/{id}/recovery-mode`** (global key) — arm recovery mode with a bounded TTL + (`ttl_seconds`, clamped [60s, 4h], default 30m → **auto-expires**); **`DELETE`** to disable. The + restore directive + re-enroll are served ONLY while recovery mode is active. +- **`POST /hosts/{id}/re-enroll`** — gated ONLY on recovery mode (the lost box has no old key; the + operator armed recovery mode after out-of-band validation). Rotates the host's API key to the new + box's key (**the old box's hub access is revoked instantly**) and returns the DR directive + the two + **opaque** escrow blobs. Without recovery mode → 403. Zero-knowledge: even a wrongful re-enroll in + the window leaks nothing recoverable (the blobs need `R`). +- **`GET /hosts/{id}/restore-directive`** (re-enrolled key, recovery-gated) — re-fetch the directive. +- **Store/escrow**: `hosts.recovery_mode_until` (additive); `host_escrow.identity_blob` + + `directive_json` (the age-wrapped identity blob + non-secret directive, stored alongside the + K-escrow). Methods: `SetRecoveryMode`/`ClearRecoveryMode`, `RotateHostAPIKey`, `SaveHostDRBundle`/ + `GetHostDRBundle`. The slice-7 escrow upload (`PUT /hosts/{id}/escrow`) now also accepts + `identity_blob_b64` + `directive` (additive). + +### Not built (by design — the locked rotation model) +- **No Cloudflare write-credential in the hub.** The operator deletes the stale tunnel connector + + rotates the tunnel/PBS token from their trusted environment (a documented procedure / future small + operator CLI). The hub may optionally hold a read-only CF token to surface connector state. + +### Tests +- re-enroll refused without recovery mode (403); recovery-mode arm is global-key-only; re-enroll + **rotates + revokes** (old key → 401, new key → 200); directive served only in recovery mode + + **expires**; clear disables re-enroll. 
+ ## v0.10.0 — slice 10B: signed-op job completion (clear-job) (2026-06-10) The hub half of slice 10B is small by design — the hub stores + serves the operator-signed blobs diff --git a/hub/internal/api/dr.go b/hub/internal/api/dr.go new file mode 100644 index 0000000..8aa5c35 --- /dev/null +++ b/hub/internal/api/dr.go @@ -0,0 +1,183 @@ +package api + +import ( + "database/sql" + "encoding/base64" + "encoding/json" + "io" + "net/http" + "time" + + "gitea.dooplex.hu/admin/felhom-hub/internal/configgen" +) + +// Slice 10D — DR capstone, hub side. The hub ORCHESTRATES recovery (recovery-mode toggle, directive +// serving, re-enroll + its OWN agent↔hub credential rotation) but holds **no usable secret and no +// Cloudflare write-power**: the escrow blobs it serves are opaque (need R, which the hub never has), +// and the destructive tunnel/PBS rotation is the operator's step from a trusted environment. A +// compromised hub can at most hand out opaque blobs + revoke/rotate its own per-host credential. + +const ( + defaultRecoveryTTL = 30 * time.Minute // bounded auto-expiry default + maxRecoveryTTL = 4 * time.Hour +) + +func writeJSON(w http.ResponseWriter, code int, v any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(code) + json.NewEncoder(w).Encode(v) +} + +// handleSetRecoveryMode arms recovery mode for a host (GLOBAL/operator key only). Body: +// {"ttl_seconds": N} (clamped to [60, maxRecoveryTTL]; default 30m). The directive + re-enroll are +// served ONLY while this is active; it auto-expires. 
+func (h *Handler) handleSetRecoveryMode(w http.ResponseWriter, r *http.Request, hostID string) {
+	// Only a request authenticated with the GLOBAL key may arm recovery mode.
+	if _, _, isGlobal, ok := h.checkAuthHost(r); !ok || !isGlobal {
+		http.Error(w, "Forbidden: global key required", http.StatusForbidden)
+		return
+	}
+	if hostID == "" {
+		http.Error(w, "Missing host_id", http.StatusBadRequest)
+		return
+	}
+	body, readErr := io.ReadAll(io.LimitReader(r.Body, 1<<16))
+	if readErr != nil {
+		http.Error(w, "Invalid payload: unreadable body", http.StatusBadRequest)
+		return
+	}
+	var req struct {
+		TTLSeconds int `json:"ttl_seconds"`
+	}
+	// The body is optional (empty → default TTL). A body that IS present must parse: a typo'd
+	// payload should be a 400, not a window silently armed with a TTL the operator did not ask for.
+	if len(body) > 0 {
+		if err := json.Unmarshal(body, &req); err != nil {
+			http.Error(w, "Invalid payload: body is not valid JSON", http.StatusBadRequest)
+			return
+		}
+	}
+	ttl := defaultRecoveryTTL
+	if req.TTLSeconds > 0 {
+		// Clamp in whole seconds BEFORE converting: time.Duration(n)*time.Second overflows int64
+		// for very large n and would wrap to an arbitrary (possibly negative) duration.
+		secs := int64(req.TTLSeconds)
+		if maxSecs := int64(maxRecoveryTTL / time.Second); secs > maxSecs {
+			secs = maxSecs
+		}
+		ttl = time.Duration(secs) * time.Second
+	}
+	// Final bounds: never shorter than one minute, never longer than maxRecoveryTTL.
+	if ttl < time.Minute {
+		ttl = time.Minute
+	}
+	if ttl > maxRecoveryTTL {
+		ttl = maxRecoveryTTL
+	}
+	until := time.Now().UTC().Add(ttl)
+	if err := h.store.SetRecoveryMode(hostID, until); err == sql.ErrNoRows {
+		http.Error(w, "Unknown host_id", http.StatusNotFound)
+		return
+	} else if err != nil {
+		h.logger.Printf("[ERROR] set recovery mode for %s: %v", hostID, err)
+		http.Error(w, "Internal error", http.StatusInternalServerError)
+		return
+	}
+	h.logger.Printf("[INFO] DR: recovery mode ARMED for host %s until %s (auto-expires)", hostID, until.Format(time.RFC3339))
+	writeJSON(w, http.StatusOK, map[string]any{"status": "ok", "recovery_mode_until": until.Format(time.RFC3339)})
+}
+
+// handleClearRecoveryMode disables recovery mode (GLOBAL key).
+func (h *Handler) handleClearRecoveryMode(w http.ResponseWriter, r *http.Request, hostID string) {
+	// Only a request authenticated with the GLOBAL key may disable recovery mode.
+	if _, _, isGlobal, ok := h.checkAuthHost(r); !ok || !isGlobal {
+		http.Error(w, "Forbidden: global key required", http.StatusForbidden)
+		return
+	}
+	if hostID == "" {
+		http.Error(w, "Missing host_id", http.StatusBadRequest)
+		return
+	}
+	// The store's clear is an unconditional UPDATE, so it also "succeeds" for a host that does not
+	// exist. Look the host up first so a mistyped id gets a 404 instead of a misleading 200 + log.
+	host, err := h.store.GetHost(hostID)
+	if err != nil {
+		h.logger.Printf("[ERROR] clear recovery mode: load host %s: %v", hostID, err)
+		http.Error(w, "Internal error", http.StatusInternalServerError)
+		return
+	}
+	if host == nil {
+		http.Error(w, "Unknown host_id", http.StatusNotFound)
+		return
+	}
+	if err := h.store.ClearRecoveryMode(hostID); err != nil {
+		h.logger.Printf("[ERROR] clear recovery mode for %s: %v", hostID, err)
+		http.Error(w, "Internal error", http.StatusInternalServerError)
+		return
+	}
+	h.logger.Printf("[INFO] DR: recovery mode DISABLED for host %s", hostID)
+	writeJSON(w, http.StatusOK, map[string]any{"status": "ok"})
+}
+
+// reEnrollResponse is the restore-directive payload (slice 10D); the re-enroll response carries the
+// same fields plus the rotated key. The blobs are OPAQUE: base64 of the stored ciphertext.
+type reEnrollResponse struct {
+	HostID            string          `json:"host_id"`
+	APIKeyRotated     bool            `json:"api_key_rotated"`
+	Directive         json.RawMessage `json:"directive"`           // non-secret DR directive
+	KEscrowB64        string          `json:"k_escrow_b64"`        // opaque PBS-key escrow blob
+	IdentityEscrowB64 string          `json:"identity_escrow_b64"` // opaque identity escrow blob
+}
+
+// handleReEnroll is the re-enroll handshake (slice 10D.2). Gated ONLY on RECOVERY MODE (the lost box
+// has no key, so no old-key auth) — the operator armed recovery mode (operational gate) after
+// out-of-band validation. The new box posts a fresh key as `new_api_key` (if the body carries none,
+// the hub mints one); the hub ROTATES the host's credential to it (the old box's hub access is
+// revoked instantly) and returns the new key, the DR directive + opaque blobs.
+// Without recovery mode → 403. The blobs are useless without R (zero-knowledge): even a wrongful
+// re-enroll within the window leaks nothing recoverable.
+func (h *Handler) handleReEnroll(w http.ResponseWriter, r *http.Request, hostID string) {
+	if hostID == "" {
+		http.Error(w, "Missing host_id", http.StatusBadRequest)
+		return
+	}
+	host, err := h.store.GetHost(hostID)
+	if err != nil {
+		h.logger.Printf("[ERROR] re-enroll: load host %s: %v", hostID, err)
+		http.Error(w, "Internal error", http.StatusInternalServerError)
+		return
+	}
+	if host == nil {
+		http.Error(w, "Unknown host_id", http.StatusNotFound)
+		return
+	}
+	// THE GATE: recovery mode must be active (operator-armed, not expired).
+	if !host.InRecoveryMode(time.Now().UTC()) {
+		h.logger.Printf("[WARN] DR: re-enroll REFUSED for %s — recovery mode not active", hostID)
+		http.Error(w, "Forbidden: host not in recovery mode (operator must arm it)", http.StatusForbidden)
+		return
+	}
+	// Best-effort read: an unreadable or malformed body is treated as "no key supplied".
+	body, _ := io.ReadAll(io.LimitReader(r.Body, 1<<16))
+	var req struct {
+		NewAPIKey string `json:"new_api_key"`
+	}
+	if json.Unmarshal(body, &req) != nil || req.NewAPIKey == "" {
+		// If the box did not supply one, mint it (still rotates the credential). A failed mint must
+		// abort BEFORE the rotation — otherwise the host's key would be replaced by an empty string.
+		minted, mintErr := configgen.RandomHex(32)
+		if mintErr != nil || minted == "" {
+			h.logger.Printf("[ERROR] re-enroll mint key for %s: %v", hostID, mintErr)
+			http.Error(w, "Internal error", http.StatusInternalServerError)
+			return
+		}
+		req.NewAPIKey = minted
+	}
+	// Rotate the agent↔hub credential to the new box — the old box's key is revoked here.
+	if err := h.store.RotateHostAPIKey(hostID, req.NewAPIKey); err != nil {
+		h.logger.Printf("[ERROR] re-enroll rotate key for %s: %v", hostID, err)
+		http.Error(w, "Internal error", http.StatusInternalServerError)
+		return
+	}
+	// The DR bundle is served best-effort: the rotation has already happened, so the response must
+	// still deliver the new key even when the bundle is missing or unreadable — but say so in the log.
+	directive := json.RawMessage("{}")
+	var kEscrowB64, identityEscrowB64 string
+	bundle, bundleErr := h.store.GetHostDRBundle(hostID)
+	switch {
+	case bundleErr != nil:
+		h.logger.Printf("[WARN] DR: re-enroll for %s: DR bundle unavailable (%v) — serving an empty bundle", hostID, bundleErr)
+	case bundle != nil:
+		kEscrowB64 = base64.StdEncoding.EncodeToString(bundle.KEscrowBlob)
+		identityEscrowB64 = base64.StdEncoding.EncodeToString(bundle.IdentityBlob)
+		// Only embed a directive that is valid JSON: an invalid RawMessage makes the encoder fail
+		// after the 200 header is out, and the box would never see its new key.
+		if bundle.DirectiveJSON != "" && json.Valid([]byte(bundle.DirectiveJSON)) {
+			directive = json.RawMessage(bundle.DirectiveJSON)
+		}
+	}
+	h.logger.Printf("[INFO] DR: host %s RE-ENROLLED (hub credential rotated; old key revoked; directive served)", hostID)
+	// The new key is echoed back: when the hub minted it, this response is the only place the box
+	// learns it.
+	writeJSON(w, http.StatusOK, map[string]any{
+		"host_id": hostID, "api_key_rotated": true, "new_api_key": req.NewAPIKey,
+		"directive": directive, "k_escrow_b64": kEscrowB64, "identity_escrow_b64": identityEscrowB64,
+	})
+}
+
+// handleGetRestoreDirective serves the directive to an already-re-enrolled box (its rotated per-host
+// key), gated on recovery mode. Lets the box re-fetch without re-rotating.
+func (h *Handler) handleGetRestoreDirective(w http.ResponseWriter, r *http.Request, hostID string) { + authHostID, _, isGlobal, ok := h.checkAuthHost(r) + if !ok { + http.Error(w, "Unauthorized", http.StatusUnauthorized) + return + } + if !isGlobal && authHostID != hostID { + http.Error(w, "Forbidden: host_id mismatch", http.StatusForbidden) + return + } + host, err := h.store.GetHost(hostID) + if err != nil || host == nil { + http.Error(w, "Unknown host_id", http.StatusNotFound) + return + } + if !host.InRecoveryMode(time.Now().UTC()) { + http.Error(w, "Forbidden: host not in recovery mode", http.StatusForbidden) + return + } + resp := reEnrollResponse{HostID: hostID, Directive: json.RawMessage("{}")} + if bundle, err := h.store.GetHostDRBundle(hostID); err == nil && bundle != nil { + resp.KEscrowB64 = base64.StdEncoding.EncodeToString(bundle.KEscrowBlob) + resp.IdentityEscrowB64 = base64.StdEncoding.EncodeToString(bundle.IdentityBlob) + if bundle.DirectiveJSON != "" { + resp.Directive = json.RawMessage(bundle.DirectiveJSON) + } + } + writeJSON(w, http.StatusOK, resp) +} diff --git a/hub/internal/api/dr_test.go b/hub/internal/api/dr_test.go new file mode 100644 index 0000000..7ad131c --- /dev/null +++ b/hub/internal/api/dr_test.go @@ -0,0 +1,117 @@ +package api + +import ( + "encoding/base64" + "encoding/json" + "net/http" + "testing" + "time" + + "gitea.dooplex.hu/admin/felhom-hub/internal/store" +) + +// Recovery-mode arm is global-key-only; re-enroll is REFUSED unless recovery mode is active. +func TestReEnroll_GatedOnRecoveryMode(t *testing.T) { + h, st, _ := newTestHandler(t) + seedHost(t, st, "h1", "c1", "OLDKEY") + + // Re-enroll with recovery mode OFF → 403. + if rr := do(h, http.MethodPost, "/hosts/h1/re-enroll", "", `{"new_api_key":"NEWKEY"}`); rr.Code != http.StatusForbidden { + t.Fatalf("re-enroll without recovery mode = %d, want 403", rr.Code) + } + // Arm recovery mode requires the global key (per-host key refused). 
+ if rr := do(h, http.MethodPut, "/admin/hosts/h1/recovery-mode", "OLDKEY", `{"ttl_seconds":600}`); rr.Code != http.StatusForbidden { + t.Errorf("per-host arm recovery = %d, want 403", rr.Code) + } + if rr := do(h, http.MethodPut, "/admin/hosts/h1/recovery-mode", globalKey, `{"ttl_seconds":600}`); rr.Code != http.StatusOK { + t.Fatalf("global arm recovery = %d, want 200", rr.Code) + } + + // Now re-enroll succeeds, rotates the credential, returns the directive. + rr := do(h, http.MethodPost, "/hosts/h1/re-enroll", "", `{"new_api_key":"NEWKEY"}`) + if rr.Code != http.StatusOK { + t.Fatalf("re-enroll in recovery mode = %d body=%s", rr.Code, rr.Body.String()) + } + var resp struct { + APIKeyRotated bool `json:"api_key_rotated"` + NewAPIKey string `json:"new_api_key"` + } + json.Unmarshal(rr.Body.Bytes(), &resp) + if !resp.APIKeyRotated || resp.NewAPIKey != "NEWKEY" { + t.Errorf("re-enroll resp = %+v, want rotated NEWKEY", resp) + } +} + +// Re-enroll ROTATES the hub credential: the old key no longer authenticates; the new one does. +func TestReEnroll_RevokesOldKey(t *testing.T) { + h, st, _ := newTestHandler(t) + st.SaveCustomerConfig(&store.CustomerConfig{CustomerID: "c1", APIKey: "ckey", RetrievalPassword: "p"}) + seedHost(t, st, "h1", "c1", "OLDKEY") + do(h, http.MethodPut, "/admin/hosts/h1/recovery-mode", globalKey, `{"ttl_seconds":600}`) + + // Before: the OLD key authenticates a host-report. + if rr := do(h, http.MethodPost, "/host-report", "OLDKEY", validReportBody("h1")); rr.Code != 200 { + t.Fatalf("pre-rotate host-report with OLD key = %d, want 200", rr.Code) + } + // Re-enroll → rotate to NEWKEY. + if rr := do(h, http.MethodPost, "/hosts/h1/re-enroll", "", `{"new_api_key":"NEWKEY"}`); rr.Code != 200 { + t.Fatalf("re-enroll = %d", rr.Code) + } + // After: the OLD key is REVOKED (401), the NEW key works. 
+ if rr := do(h, http.MethodPost, "/host-report", "OLDKEY", validReportBody("h1")); rr.Code != http.StatusUnauthorized { + t.Errorf("post-rotate OLD key = %d, want 401 (revoked)", rr.Code) + } + if rr := do(h, http.MethodPost, "/host-report", "NEWKEY", validReportBody("h1")); rr.Code != 200 { + t.Errorf("post-rotate NEW key = %d, want 200", rr.Code) + } +} + +// The restore directive (the opaque blobs) is served ONLY in recovery mode, and expires. +func TestRestoreDirective_GatedAndExpires(t *testing.T) { + h, st, _ := newTestHandler(t) + seedHost(t, st, "h1", "c1", "HKEY") + // Seed a DR bundle: K-escrow row + identity blob + directive. + st.SaveHostEscrow("h1", []byte("opaque-K-escrow"), "01:36:e9:…", "zero_knowledge", time.Now().UTC().Format(time.RFC3339)) + st.SaveHostDRBundle("h1", []byte("opaque-identity"), `{"pbs_repo":"r","tunnel_id":"t","expected_key_fingerprint":"01:36:e9:…"}`) + + // Not in recovery mode → 403. + if rr := do(h, http.MethodGet, "/hosts/h1/restore-directive", "HKEY", ""); rr.Code != http.StatusForbidden { + t.Fatalf("directive without recovery mode = %d, want 403", rr.Code) + } + // Arm recovery mode → served, with both opaque blobs + directive. + do(h, http.MethodPut, "/admin/hosts/h1/recovery-mode", globalKey, `{"ttl_seconds":600}`) + rr := do(h, http.MethodGet, "/hosts/h1/restore-directive", "HKEY", "") + if rr.Code != 200 { + t.Fatalf("directive in recovery mode = %d", rr.Code) + } + var d struct { + KEscrowB64 string `json:"k_escrow_b64"` + IdentityEscrowB64 string `json:"identity_escrow_b64"` + Directive json.RawMessage `json:"directive"` + } + json.Unmarshal(rr.Body.Bytes(), &d) + kb, _ := base64.StdEncoding.DecodeString(d.KEscrowB64) + ib, _ := base64.StdEncoding.DecodeString(d.IdentityEscrowB64) + if string(kb) != "opaque-K-escrow" || string(ib) != "opaque-identity" { + t.Errorf("served blobs wrong: K=%q identity=%q", kb, ib) + } + + // Simulate EXPIRY: set recovery_mode_until in the past → directive refused again. 
+ st.SetRecoveryMode("h1", time.Now().UTC().Add(-time.Minute)) + if rr := do(h, http.MethodGet, "/hosts/h1/restore-directive", "HKEY", ""); rr.Code != http.StatusForbidden { + t.Errorf("expired recovery mode directive = %d, want 403", rr.Code) + } +} + +// Clearing recovery mode (global key) disables re-enroll. +func TestRecoveryMode_Clear(t *testing.T) { + h, st, _ := newTestHandler(t) + seedHost(t, st, "h1", "c1", "HKEY") + do(h, http.MethodPut, "/admin/hosts/h1/recovery-mode", globalKey, `{"ttl_seconds":600}`) + if rr := do(h, http.MethodDelete, "/admin/hosts/h1/recovery-mode", globalKey, ""); rr.Code != 200 { + t.Fatalf("clear recovery = %d", rr.Code) + } + if rr := do(h, http.MethodPost, "/hosts/h1/re-enroll", "", `{"new_api_key":"X"}`); rr.Code != http.StatusForbidden { + t.Errorf("re-enroll after clear = %d, want 403", rr.Code) + } +} diff --git a/hub/internal/api/handler.go b/hub/internal/api/handler.go index 896988d..6ae7d9f 100644 --- a/hub/internal/api/handler.go +++ b/hub/internal/api/handler.go @@ -129,6 +129,20 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { case r.Method == http.MethodPut && strings.HasPrefix(path, "/hosts/") && strings.HasSuffix(path, "/escrow"): hostID := strings.TrimSuffix(strings.TrimPrefix(path, "/hosts/"), "/escrow") h.handleHostEscrowPut(w, r, hostID) + // DR capstone (slice 10D). Recovery-mode toggle (global key); re-enroll + restore-directive + // (gated on recovery mode — no old key needed, the box is lost). 
+ case r.Method == http.MethodPut && strings.HasPrefix(path, "/admin/hosts/") && strings.HasSuffix(path, "/recovery-mode"): + hostID := strings.TrimSuffix(strings.TrimPrefix(path, "/admin/hosts/"), "/recovery-mode") + h.handleSetRecoveryMode(w, r, hostID) + case r.Method == http.MethodDelete && strings.HasPrefix(path, "/admin/hosts/") && strings.HasSuffix(path, "/recovery-mode"): + hostID := strings.TrimSuffix(strings.TrimPrefix(path, "/admin/hosts/"), "/recovery-mode") + h.handleClearRecoveryMode(w, r, hostID) + case r.Method == http.MethodPost && strings.HasPrefix(path, "/hosts/") && strings.HasSuffix(path, "/re-enroll"): + hostID := strings.TrimSuffix(strings.TrimPrefix(path, "/hosts/"), "/re-enroll") + h.handleReEnroll(w, r, hostID) + case r.Method == http.MethodGet && strings.HasPrefix(path, "/hosts/") && strings.HasSuffix(path, "/restore-directive"): + hostID := strings.TrimSuffix(strings.TrimPrefix(path, "/hosts/"), "/restore-directive") + h.handleGetRestoreDirective(w, r, hostID) // Desired-state serving (slice 10A) — per-host-key, self-scoped (a host reads only its own). case r.Method == http.MethodGet && strings.HasPrefix(path, "/hosts/") && strings.HasSuffix(path, "/desired-state"): hostID := strings.TrimSuffix(strings.TrimPrefix(path, "/hosts/"), "/desired-state") @@ -619,6 +633,9 @@ type escrowUploadRequest struct { KeyFingerprint string `json:"key_fingerprint"` // for operator display only Posture string `json:"posture"` // e.g. "zero_knowledge" CreatedAt string `json:"created_at"` // RFC3339 + // Slice 10D.1 — optional DR bundle, stored alongside the K-escrow (both opaque/non-secret). + IdentityBlobB64 string `json:"identity_blob_b64,omitempty"` // age-wrapped {tunnel_token, pbs_token} + DirectiveJSON json.RawMessage `json:"directive,omitempty"` // non-secret directive (pbs repo/ns, expected fp, tunnel id) } // handleHostEscrowPut stores a host's opaque escrow blob (doc 03 §8a). 
Authed with the PER-HOST key @@ -664,6 +681,26 @@ func (h *Handler) handleHostEscrowPut(w http.ResponseWriter, r *http.Request, pa http.Error(w, "Internal error", http.StatusInternalServerError) return } + // Slice 10D.1: optionally store the IDENTITY escrow blob + the non-secret DR directive alongside + // the K-escrow (both opaque / non-secret — no usable secret hub-side). Additive: a slice-7 + // upload without these is unchanged. + if req.IdentityBlobB64 != "" { + idBlob, derr := base64.StdEncoding.DecodeString(req.IdentityBlobB64) + if derr != nil || len(idBlob) == 0 { + http.Error(w, "Invalid payload: identity_blob_b64 not valid base64", http.StatusBadRequest) + return + } + directive := req.DirectiveJSON + if len(directive) == 0 || !json.Valid(directive) { + directive = json.RawMessage("{}") + } + if err := h.store.SaveHostDRBundle(pathHostID, idBlob, string(directive)); err != nil { + h.logger.Printf("[ERROR] Failed to store DR bundle for host %s: %v", pathHostID, err) + http.Error(w, "Internal error", http.StatusInternalServerError) + return + } + h.logger.Printf("[INFO] stored DR bundle for host %s (identity %d bytes + directive)", pathHostID, len(idBlob)) + } h.logger.Printf("[INFO] stored opaque escrow blob for host %s (%d bytes, posture=%s, fp=%s)", pathHostID, len(blob), req.Posture, req.KeyFingerprint) w.WriteHeader(http.StatusOK) diff --git a/hub/internal/store/store.go b/hub/internal/store/store.go index 30a9d7b..4cf9f66 100644 --- a/hub/internal/store/store.go +++ b/hub/internal/store/store.go @@ -301,6 +301,16 @@ func (s *Store) migrate() error { return err } + // Slice 10D (DR capstone) — additive columns on existing tables (fire-and-forget; a duplicate + // column on re-run is ignored). `recovery_mode_until` gates restore-directive serving + re-enroll + // (NULL/past = off; future = recovery mode active, auto-expires). 
host_escrow gains the IDENTITY + // blob (age-wrapped {tunnel_token, pbs_token}) + the NON-secret DR directive (pbs repo/namespace, + // expected key fingerprint, tunnel id) — the hub serves these only in recovery mode; no usable + // secret is hub-held (the blobs need R, which the hub never has). + s.db.Exec(`ALTER TABLE hosts ADD COLUMN recovery_mode_until DATETIME`) + s.db.Exec(`ALTER TABLE host_escrow ADD COLUMN identity_blob BLOB`) + s.db.Exec(`ALTER TABLE host_escrow ADD COLUMN directive_json TEXT NOT NULL DEFAULT '{}'`) + return nil } @@ -1287,10 +1297,16 @@ type Host struct { DesiredJSON string DesiredGeneration int64 DRRecordJSON string + RecoveryModeUntil *time.Time // slice 10D: recovery mode active until this time (nil/past = off) CreatedAt time.Time UpdatedAt time.Time } +// InRecoveryMode reports whether the host is currently in recovery mode (set + not expired). +func (h *Host) InRecoveryMode(now time.Time) bool { + return h.RecoveryModeUntil != nil && now.Before(*h.RecoveryModeUntil) +} + // Guest is one controller LXC. Reality columns are report-driven; APIKey and // DesiredSpecJSON are INERT until slice 10 and must survive report upserts. 
type Guest struct { @@ -1335,10 +1351,10 @@ func GuestID(hostID string, vmid int) string { func scanHost(scan func(dest ...any) error) (*Host, error) { var h Host - var lastReport sql.NullString + var lastReport, recoveryUntil sql.NullString var createdAt, updatedAt string err := scan(&h.HostID, &h.CustomerID, &h.APIKey, &h.AgentVersion, &lastReport, - &h.DesiredJSON, &h.DesiredGeneration, &h.DRRecordJSON, &createdAt, &updatedAt) + &h.DesiredJSON, &h.DesiredGeneration, &h.DRRecordJSON, &recoveryUntil, &createdAt, &updatedAt) if err != nil { return nil, err } @@ -1346,13 +1362,17 @@ func scanHost(scan func(dest ...any) error) (*Host, error) { t := parseSQLiteTime(lastReport.String) h.LastReportAt = &t } + if recoveryUntil.Valid && recoveryUntil.String != "" { + t := parseSQLiteTime(recoveryUntil.String) + h.RecoveryModeUntil = &t + } h.CreatedAt = parseSQLiteTime(createdAt) h.UpdatedAt = parseSQLiteTime(updatedAt) return &h, nil } const hostSelectCols = `host_id, customer_id, api_key, agent_version, last_report_at, - desired_json, desired_generation, dr_record_json, created_at, updated_at` + desired_json, desired_generation, dr_record_json, recovery_mode_until, created_at, updated_at` // GetHostByAPIKey looks up a host by its per-host hub key. Returns nil (no error) // if no match — parallels GetCustomerConfigByAPIKey. @@ -1525,6 +1545,87 @@ func (s *Store) DeleteSignedJob(hostID, jobID string) error { return err } +// ---- slice 10D: DR capstone (recovery mode, DR bundle, re-enroll) ---------------------------- + +// SetRecoveryMode arms recovery mode for a host until `until` (the operator toggle; bounded +// auto-expiry). While active, the hub serves the restore directive + allows re-enroll. Errors +// ErrNoRows for an unknown host. 
+func (s *Store) SetRecoveryMode(hostID string, until time.Time) error {
+	// The deadline is written as UTC text in the "YYYY-MM-DD HH:MM:SS" layout.
+	res, err := s.db.Exec(`UPDATE hosts SET recovery_mode_until = ?, updated_at = datetime('now') WHERE host_id = ?`,
+		until.UTC().Format("2006-01-02 15:04:05"), hostID)
+	if err != nil {
+		return err
+	}
+	// Report a RowsAffected failure as itself: discarding it would leave n == 0 and turn a driver
+	// error into a bogus "unknown host".
+	n, err := res.RowsAffected()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return sql.ErrNoRows
+	}
+	return nil
+}
+
+// ClearRecoveryMode disables recovery mode (operator confirm, or after re-enroll completes) by
+// resetting recovery_mode_until to NULL. Unlike SetRecoveryMode it does not report an unknown
+// host: an UPDATE that matches no row still returns nil, so only execution errors are returned.
+func (s *Store) ClearRecoveryMode(hostID string) error {
+	_, err := s.db.Exec(`UPDATE hosts SET recovery_mode_until = NULL, updated_at = datetime('now') WHERE host_id = ?`, hostID)
+	return err
+}
+
+// RotateHostAPIKey replaces a host's API key (the re-enroll credential rotation — the old box's hub
+// access is revoked the instant this commits; purely hub-internal, no Cloudflare/PBS write needed).
+// It returns sql.ErrNoRows when hostID matches no host, and any database error unchanged.
+// NOTE(review): newAPIKey is stored exactly as given — no emptiness or uniqueness check happens
+// here; presumably the caller validates it. Confirm against the re-enroll handler.
+func (s *Store) RotateHostAPIKey(hostID, newAPIKey string) error {
+	res, err := s.db.Exec(`UPDATE hosts SET api_key = ?, updated_at = datetime('now') WHERE host_id = ?`, newAPIKey, hostID)
+	if err != nil {
+		return err
+	}
+	// As in SetRecoveryMode: never let a RowsAffected error be mistaken for "no such host".
+	n, err := res.RowsAffected()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return sql.ErrNoRows
+	}
+	return nil
+}
+
+// SaveHostDRBundle stores the IDENTITY escrow blob + the NON-secret DR directive alongside the
+// existing K-escrow blob (slice 10D.1). The K-escrow row must already exist (slice-7 escrow upload);
+// this updates the additive 10D columns and returns sql.ErrNoRows when the host has no escrow row
+// yet. The hub holds only ciphertext + non-secret directive.
+func (s *Store) SaveHostDRBundle(hostID string, identityBlob []byte, directiveJSON string) error {
+	// An absent directive is stored as the empty JSON object rather than an empty string.
+	if directiveJSON == "" {
+		directiveJSON = "{}"
+	}
+	// UPDATE only, never INSERT: the row is created by the K-escrow upload, so a host without one
+	// matches nothing and is reported as sql.ErrNoRows below.
+	res, err := s.db.Exec(`UPDATE host_escrow SET identity_blob = ?, directive_json = ?, updated_at = datetime('now') WHERE host_id = ?`,
+		identityBlob, directiveJSON, hostID)
+	if err != nil {
+		return err
+	}
+	// Report a RowsAffected failure as itself: discarding it would leave n == 0 and turn a driver
+	// error into a bogus "no escrow row".
+	n, err := res.RowsAffected()
+	if err != nil {
+		return err
+	}
+	if n == 0 {
+		return sql.ErrNoRows // no K-escrow row yet — upload the escrow first
+	}
+	return nil
+}
+
+// HostDRBundle is the full DR directive served to a re-enrolling box (slice 10D): the two OPAQUE
+// escrow blobs (K + identity — useless without R) + the non-secret directive fields.
+type HostDRBundle struct {
+	KEscrowBlob   []byte // host_escrow.blob: the K-escrow ciphertext
+	IdentityBlob  []byte // host_escrow.identity_blob: the identity-escrow ciphertext
+	DirectiveJSON string // host_escrow.directive_json: the non-secret directive, as stored
+}
+
+// GetHostDRBundle returns a host's DR bundle (nil, nil if the host has no escrow row). The blobs are
+// opaque — the hub cannot open them (it has no R). Any other database error is returned unchanged.
+func (s *Store) GetHostDRBundle(hostID string) (*HostDRBundle, error) {
+	var b HostDRBundle
+	// Scanned through NullString so a NULL directive_json leaves DirectiveJSON as "" instead of
+	// failing the scan.
+	var directive sql.NullString
+	err := s.db.QueryRow(`SELECT blob, identity_blob, directive_json FROM host_escrow WHERE host_id = ?`, hostID).
+		Scan(&b.KEscrowBlob, &b.IdentityBlob, &directive)
+	switch {
+	case err == sql.ErrNoRows:
+		// "No bundle" is a normal answer, not a failure.
+		return nil, nil
+	case err != nil:
+		return nil, err
+	}
+	if directive.Valid {
+		b.DirectiveJSON = directive.String
+	}
+	return &b, nil
+}
+
 // SaveHostReport inserts a host_reports row and bumps the host's reality columns
 // (agent_version/last_report_at/updated_at) — never the inert intent columns.
 func (s *Store) SaveHostReport(hostID, customerID string, reportJSON []byte, d HostReportDenorm) error {