slice 10D (hub): DR capstone — recovery mode + re-enroll + directive serving (hub v0.11.0)
Recovery-mode toggle (global key, bounded auto-expiry) gates re-enroll + restore-directive serving. Re-enroll rotates the agent<->hub credential to the new box (old key revoked); returns the opaque escrow blobs + non-secret directive. Store gains recovery_mode_until + identity_blob + directive_json. Hub holds no usable secret + no Cloudflare write-power (operator-side rotation). Doc 03 §9: slice 10 CLOSED. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+104
-3
@@ -301,6 +301,16 @@ func (s *Store) migrate() error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Slice 10D (DR capstone) — additive columns on existing tables (fire-and-forget; a duplicate
|
||||
// column on re-run is ignored). `recovery_mode_until` gates restore-directive serving + re-enroll
|
||||
// (NULL/past = off; future = recovery mode active, auto-expires). host_escrow gains the IDENTITY
|
||||
// blob (age-wrapped {tunnel_token, pbs_token}) + the NON-secret DR directive (pbs repo/namespace,
|
||||
// expected key fingerprint, tunnel id) — the hub serves these only in recovery mode; no usable
|
||||
// secret is hub-held (the blobs need R, which the hub never has).
|
||||
s.db.Exec(`ALTER TABLE hosts ADD COLUMN recovery_mode_until DATETIME`)
|
||||
s.db.Exec(`ALTER TABLE host_escrow ADD COLUMN identity_blob BLOB`)
|
||||
s.db.Exec(`ALTER TABLE host_escrow ADD COLUMN directive_json TEXT NOT NULL DEFAULT '{}'`)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1287,10 +1297,16 @@ type Host struct {
|
||||
DesiredJSON string
|
||||
DesiredGeneration int64
|
||||
DRRecordJSON string
|
||||
RecoveryModeUntil *time.Time // slice 10D: recovery mode active until this time (nil/past = off)
|
||||
CreatedAt time.Time
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
// InRecoveryMode reports whether the host is currently in recovery mode (set + not expired).
|
||||
func (h *Host) InRecoveryMode(now time.Time) bool {
|
||||
return h.RecoveryModeUntil != nil && now.Before(*h.RecoveryModeUntil)
|
||||
}
|
||||
|
||||
// Guest is one controller LXC. Reality columns are report-driven; APIKey and
|
||||
// DesiredSpecJSON are INERT until slice 10 and must survive report upserts.
|
||||
type Guest struct {
|
||||
@@ -1335,10 +1351,10 @@ func GuestID(hostID string, vmid int) string {
|
||||
|
||||
func scanHost(scan func(dest ...any) error) (*Host, error) {
|
||||
var h Host
|
||||
var lastReport sql.NullString
|
||||
var lastReport, recoveryUntil sql.NullString
|
||||
var createdAt, updatedAt string
|
||||
err := scan(&h.HostID, &h.CustomerID, &h.APIKey, &h.AgentVersion, &lastReport,
|
||||
&h.DesiredJSON, &h.DesiredGeneration, &h.DRRecordJSON, &createdAt, &updatedAt)
|
||||
&h.DesiredJSON, &h.DesiredGeneration, &h.DRRecordJSON, &recoveryUntil, &createdAt, &updatedAt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -1346,13 +1362,17 @@ func scanHost(scan func(dest ...any) error) (*Host, error) {
|
||||
t := parseSQLiteTime(lastReport.String)
|
||||
h.LastReportAt = &t
|
||||
}
|
||||
if recoveryUntil.Valid && recoveryUntil.String != "" {
|
||||
t := parseSQLiteTime(recoveryUntil.String)
|
||||
h.RecoveryModeUntil = &t
|
||||
}
|
||||
h.CreatedAt = parseSQLiteTime(createdAt)
|
||||
h.UpdatedAt = parseSQLiteTime(updatedAt)
|
||||
return &h, nil
|
||||
}
|
||||
|
||||
// hostSelectCols is the hosts column list; its order must match, position for position, the
// destinations scanHost hands to scan (recovery_mode_until sits between dr_record_json and
// created_at). Presumably shared by the host lookup queries — confirm against callers.
const hostSelectCols = `host_id, customer_id, api_key, agent_version, last_report_at,
	desired_json, desired_generation, dr_record_json, recovery_mode_until, created_at, updated_at`
|
||||
|
||||
// GetHostByAPIKey looks up a host by its per-host hub key. Returns nil (no error)
|
||||
// if no match — parallels GetCustomerConfigByAPIKey.
|
||||
@@ -1525,6 +1545,87 @@ func (s *Store) DeleteSignedJob(hostID, jobID string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// ---- slice 10D: DR capstone (recovery mode, DR bundle, re-enroll) ----------------------------
|
||||
|
||||
// SetRecoveryMode arms recovery mode for a host until `until` (the operator toggle; bounded
|
||||
// auto-expiry). While active, the hub serves the restore directive + allows re-enroll. Errors
|
||||
// ErrNoRows for an unknown host.
|
||||
func (s *Store) SetRecoveryMode(hostID string, until time.Time) error {
|
||||
res, err := s.db.Exec(`UPDATE hosts SET recovery_mode_until = ?, updated_at = datetime('now') WHERE host_id = ?`,
|
||||
until.UTC().Format("2006-01-02 15:04:05"), hostID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if n, _ := res.RowsAffected(); n == 0 {
|
||||
return sql.ErrNoRows
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ClearRecoveryMode disables recovery mode (operator confirm, or after re-enroll completes).
|
||||
func (s *Store) ClearRecoveryMode(hostID string) error {
|
||||
_, err := s.db.Exec(`UPDATE hosts SET recovery_mode_until = NULL, updated_at = datetime('now') WHERE host_id = ?`, hostID)
|
||||
return err
|
||||
}
|
||||
|
||||
// RotateHostAPIKey replaces a host's API key (the re-enroll credential rotation — the old box's hub
|
||||
// access is revoked the instant this commits; purely hub-internal, no Cloudflare/PBS write needed).
|
||||
func (s *Store) RotateHostAPIKey(hostID, newAPIKey string) error {
|
||||
res, err := s.db.Exec(`UPDATE hosts SET api_key = ?, updated_at = datetime('now') WHERE host_id = ?`, newAPIKey, hostID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if n, _ := res.RowsAffected(); n == 0 {
|
||||
return sql.ErrNoRows
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// SaveHostDRBundle stores the IDENTITY escrow blob + the NON-secret DR directive alongside the
|
||||
// existing K-escrow blob (slice 10D.1). The K-escrow row must already exist (slice-7 escrow upload);
|
||||
// this updates the additive 10D columns. The hub holds only ciphertext + non-secret directive.
|
||||
func (s *Store) SaveHostDRBundle(hostID string, identityBlob []byte, directiveJSON string) error {
|
||||
if directiveJSON == "" {
|
||||
directiveJSON = "{}"
|
||||
}
|
||||
res, err := s.db.Exec(`UPDATE host_escrow SET identity_blob = ?, directive_json = ?, updated_at = datetime('now') WHERE host_id = ?`,
|
||||
identityBlob, directiveJSON, hostID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if n, _ := res.RowsAffected(); n == 0 {
|
||||
return sql.ErrNoRows // no K-escrow row yet — upload the escrow first
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// HostDRBundle is everything the hub hands to a re-enrolling box (slice 10D): two OPAQUE escrow
// blobs, which are useless without R, plus the non-secret directive.
type HostDRBundle struct {
	// KEscrowBlob is the K-escrow ciphertext (the host_escrow `blob` column).
	KEscrowBlob []byte
	// IdentityBlob is the identity escrow ciphertext (the host_escrow `identity_blob` column).
	IdentityBlob []byte
	// DirectiveJSON is the non-secret DR directive, kept as a JSON document string.
	DirectiveJSON string
}
|
||||
|
||||
// GetHostDRBundle returns a host's DR bundle (nil if no escrow row). The blobs are opaque — the hub
|
||||
// cannot open them (it has no R).
|
||||
func (s *Store) GetHostDRBundle(hostID string) (*HostDRBundle, error) {
|
||||
var b HostDRBundle
|
||||
var directive sql.NullString
|
||||
err := s.db.QueryRow(`SELECT blob, identity_blob, directive_json FROM host_escrow WHERE host_id = ?`, hostID).
|
||||
Scan(&b.KEscrowBlob, &b.IdentityBlob, &directive)
|
||||
if err == sql.ErrNoRows {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if directive.Valid {
|
||||
b.DirectiveJSON = directive.String
|
||||
}
|
||||
return &b, nil
|
||||
}
|
||||
|
||||
// SaveHostReport inserts a host_reports row and bumps the host's reality columns
|
||||
// (agent_version/last_report_at/updated_at) — never the inert intent columns.
|
||||
func (s *Store) SaveHostReport(hostID, customerID string, reportJSON []byte, d HostReportDenorm) error {
|
||||
|
||||
Reference in New Issue
Block a user