slice 10D (hub): DR capstone — recovery mode + re-enroll + directive serving (hub v0.11.0)

Recovery-mode toggle (global key, bounded auto-expiry) gates re-enroll +
restore-directive serving. Re-enroll rotates the agent<->hub credential to the
new box (old key revoked); returns the opaque escrow blobs + non-secret
directive. Store gains recovery_mode_until + identity_blob + directive_json.
Hub holds no usable secret + no Cloudflare write-power (operator-side rotation).
Doc 03 §9: slice 10 CLOSED.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-11 09:48:38 +02:00
parent a22b87e6e3
commit 3457415117
7 changed files with 533 additions and 34 deletions
+104 -3
View File
@@ -301,6 +301,16 @@ func (s *Store) migrate() error {
return err
}
// Slice 10D (DR capstone) — additive columns on existing tables (fire-and-forget; a duplicate
// column on re-run is ignored). `recovery_mode_until` gates restore-directive serving + re-enroll
// (NULL/past = off; future = recovery mode active, auto-expires). host_escrow gains the IDENTITY
// blob (age-wrapped {tunnel_token, pbs_token}) + the NON-secret DR directive (pbs repo/namespace,
// expected key fingerprint, tunnel id) — the hub serves these only in recovery mode; no usable
// secret is hub-held (the blobs need R, which the hub never has).
s.db.Exec(`ALTER TABLE hosts ADD COLUMN recovery_mode_until DATETIME`)
s.db.Exec(`ALTER TABLE host_escrow ADD COLUMN identity_blob BLOB`)
s.db.Exec(`ALTER TABLE host_escrow ADD COLUMN directive_json TEXT NOT NULL DEFAULT '{}'`)
return nil
}
@@ -1287,10 +1297,16 @@ type Host struct {
DesiredJSON string
DesiredGeneration int64
DRRecordJSON string
RecoveryModeUntil *time.Time // slice 10D: recovery mode active until this time (nil/past = off)
CreatedAt time.Time
UpdatedAt time.Time
}
// InRecoveryMode reports whether recovery mode is active for the host at the instant now.
// It is active only when a deadline is set and now falls strictly before that deadline;
// a nil deadline, or a deadline at or before now, means recovery mode is off.
func (h *Host) InRecoveryMode(now time.Time) bool {
	deadline := h.RecoveryModeUntil
	if deadline == nil {
		return false
	}
	return now.Before(*deadline)
}
// Guest is one controller LXC. Reality columns are report-driven; APIKey and
// DesiredSpecJSON are INERT until slice 10 and must survive report upserts.
type Guest struct {
@@ -1335,10 +1351,10 @@ func GuestID(hostID string, vmid int) string {
func scanHost(scan func(dest ...any) error) (*Host, error) {
var h Host
var lastReport sql.NullString
var lastReport, recoveryUntil sql.NullString
var createdAt, updatedAt string
err := scan(&h.HostID, &h.CustomerID, &h.APIKey, &h.AgentVersion, &lastReport,
&h.DesiredJSON, &h.DesiredGeneration, &h.DRRecordJSON, &createdAt, &updatedAt)
&h.DesiredJSON, &h.DesiredGeneration, &h.DRRecordJSON, &recoveryUntil, &createdAt, &updatedAt)
if err != nil {
return nil, err
}
@@ -1346,13 +1362,17 @@ func scanHost(scan func(dest ...any) error) (*Host, error) {
t := parseSQLiteTime(lastReport.String)
h.LastReportAt = &t
}
if recoveryUntil.Valid && recoveryUntil.String != "" {
t := parseSQLiteTime(recoveryUntil.String)
h.RecoveryModeUntil = &t
}
h.CreatedAt = parseSQLiteTime(createdAt)
h.UpdatedAt = parseSQLiteTime(updatedAt)
return &h, nil
}
const hostSelectCols = `host_id, customer_id, api_key, agent_version, last_report_at,
desired_json, desired_generation, dr_record_json, created_at, updated_at`
desired_json, desired_generation, dr_record_json, recovery_mode_until, created_at, updated_at`
// GetHostByAPIKey looks up a host by its per-host hub key. Returns nil (no error)
// if no match — parallels GetCustomerConfigByAPIKey.
@@ -1525,6 +1545,87 @@ func (s *Store) DeleteSignedJob(hostID, jobID string) error {
return err
}
// ---- slice 10D: DR capstone (recovery mode, DR bundle, re-enroll) ----------------------------
// SetRecoveryMode arms recovery mode for a host until `until` (the operator toggle; bounded
// auto-expiry). While active, the hub serves the restore directive + allows re-enroll.
//
// The deadline is converted to UTC and written as a "YYYY-MM-DD HH:MM:SS" string; the host's
// updated_at is bumped in the same statement.
//
// Returns sql.ErrNoRows when no host row matched hostID. Any error from the UPDATE itself or
// from reading the affected-row count is returned unchanged, so a driver failure is never
// mistaken for "unknown host".
func (s *Store) SetRecoveryMode(hostID string, until time.Time) error {
	res, err := s.db.Exec(`UPDATE hosts SET recovery_mode_until = ?, updated_at = datetime('now') WHERE host_id = ?`,
		until.UTC().Format("2006-01-02 15:04:05"), hostID)
	if err != nil {
		return err
	}
	// Check the RowsAffected error explicitly: a failed count would otherwise read as n == 0.
	n, err := res.RowsAffected()
	if err != nil {
		return err
	}
	if n == 0 {
		return sql.ErrNoRows
	}
	return nil
}
// ClearRecoveryMode switches recovery mode off for hostID (operator confirm, or after re-enroll
// completes) by resetting recovery_mode_until to NULL and bumping updated_at. Only the error
// from the UPDATE is reported; the number of rows touched is not inspected.
func (s *Store) ClearRecoveryMode(hostID string) error {
	const clearStmt = `UPDATE hosts SET recovery_mode_until = NULL, updated_at = datetime('now') WHERE host_id = ?`
	if _, err := s.db.Exec(clearStmt, hostID); err != nil {
		return err
	}
	return nil
}
// RotateHostAPIKey replaces a host's API key (the re-enroll credential rotation — the old box's hub
// access is revoked the instant this commits; purely hub-internal, no Cloudflare/PBS write needed).
//
// The host's updated_at is bumped in the same statement.
//
// Returns sql.ErrNoRows when no host row matched hostID. Any error from the UPDATE itself or
// from reading the affected-row count is returned unchanged, so a driver failure is never
// mistaken for "unknown host".
func (s *Store) RotateHostAPIKey(hostID, newAPIKey string) error {
	res, err := s.db.Exec(`UPDATE hosts SET api_key = ?, updated_at = datetime('now') WHERE host_id = ?`, newAPIKey, hostID)
	if err != nil {
		return err
	}
	// Check the RowsAffected error explicitly: a failed count would otherwise read as n == 0.
	n, err := res.RowsAffected()
	if err != nil {
		return err
	}
	if n == 0 {
		return sql.ErrNoRows
	}
	return nil
}
// SaveHostDRBundle stores the IDENTITY escrow blob + the NON-secret DR directive alongside the
// existing K-escrow blob (slice 10D.1). The K-escrow row must already exist (slice-7 escrow upload);
// this updates the additive 10D columns. The hub holds only ciphertext + non-secret directive.
//
// An empty directiveJSON is stored as "{}". The row's updated_at is bumped in the same statement.
//
// Returns sql.ErrNoRows when host_escrow has no row for hostID (upload the escrow first). Any
// error from the UPDATE itself or from reading the affected-row count is returned unchanged,
// so a driver failure is never mistaken for a missing escrow row.
func (s *Store) SaveHostDRBundle(hostID string, identityBlob []byte, directiveJSON string) error {
	if directiveJSON == "" {
		directiveJSON = "{}"
	}
	res, err := s.db.Exec(`UPDATE host_escrow SET identity_blob = ?, directive_json = ?, updated_at = datetime('now') WHERE host_id = ?`,
		identityBlob, directiveJSON, hostID)
	if err != nil {
		return err
	}
	// Check the RowsAffected error explicitly: a failed count would otherwise read as n == 0.
	n, err := res.RowsAffected()
	if err != nil {
		return err
	}
	if n == 0 {
		return sql.ErrNoRows // no K-escrow row yet — upload the escrow first
	}
	return nil
}
// HostDRBundle is the full DR directive served to a re-enrolling box (slice 10D): the two OPAQUE
// escrow blobs (K + identity — useless without R) + the non-secret directive fields.
// GetHostDRBundle fills it from a single host_escrow row.
type HostDRBundle struct {
// KEscrowBlob is the K-escrow blob, read from the host_escrow.blob column.
KEscrowBlob []byte
// IdentityBlob is the identity escrow blob, read from the host_escrow.identity_blob column.
IdentityBlob []byte
// DirectiveJSON is the non-secret directive, read from the host_escrow.directive_json column;
// GetHostDRBundle leaves it empty when that column is NULL.
DirectiveJSON string
}
// GetHostDRBundle loads the DR bundle for hostID from host_escrow. The blobs are opaque — the
// hub cannot open them (it has no R).
//
// Returns (nil, nil) when the host has no escrow row, and (nil, err) on any other query or
// scan failure. DirectiveJSON is left empty when the directive_json column is NULL.
func (s *Store) GetHostDRBundle(hostID string) (*HostDRBundle, error) {
	var (
		kBlob, idBlob []byte
		directive     sql.NullString
	)
	row := s.db.QueryRow(`SELECT blob, identity_blob, directive_json FROM host_escrow WHERE host_id = ?`, hostID)
	switch err := row.Scan(&kBlob, &idBlob, &directive); {
	case err == sql.ErrNoRows:
		// A missing escrow row is not an error for callers.
		return nil, nil
	case err != nil:
		return nil, err
	}

	bundle := &HostDRBundle{KEscrowBlob: kBlob, IdentityBlob: idBlob}
	if directive.Valid {
		bundle.DirectiveJSON = directive.String
	}
	return bundle, nil
}
// SaveHostReport inserts a host_reports row and bumps the host's reality columns
// (agent_version/last_report_at/updated_at) — never the inert intent columns.
func (s *Store) SaveHostReport(hostID, customerID string, reportJSON []byte, d HostReportDenorm) error {