v0.4.0: slice 4 Phase B — reversibility gate + signed-op consuming layer
The security core of slice 4: hub-supplied intent is no longer trusted for destructive change. The gate fronts the per-guest queue's executor, so every mutation passes it. Reuses internal/authz for all crypto (surface untouched). - Classifier (doc 03 §4): benign vs destructive by provenance + data-bearing-ness, NOT by verb. Destroy/overwrite of customer data is destructive unless agent-internal provenance (same-journaled-txn create, or agent-tagged scratch) makes it benign — and that provenance is journal-recorded, NEVER hub-sourced. Unknown op class fails safe to destructive. - Reversibility gate: benign -> allowed unsigned; destructive -> requires a verified, role-scoped, action-bound operator signature, else pending_signature and never executed. Every decision audited (signal, never the guard). - Signed-op consuming layer over authz.Verifier.Verify (locked pipeline untouched): role-scoping (doc 04 §4 — recovery=rotation only, operational=ordinary destructive + planned rotation) + op-to-action binding (op+host+guest+params must match the gated action). - Signed-job orchestration: idempotency dedupe by nonce + journal-wrapped execution via an injected DestructiveExecutor (nil this slice — inert). - Crash recovery (Note 1): Engine.Recover consumes the journal InFlight() set at startup (resume-or-rollback) — covers an op that crashed after the POST and before its terminal record, which idempotency dedupe alone cannot. Added TaskStatusOnce to the GuestAPI seam. Wired into daemon startup. - Note 2: memory comparison canonicalized to MiB (desiredMemoryMiB) so a non-MiB-aligned MemoryBytes converges in one pass, not perpetual drift. - Daemon: builds the verifier from config signers (none = nil verifier, the common slice-4 state), the gate (+SlogAudit), runs Recover before mutating.
Adversarial matrix proven against the REAL authz.Verifier with in-test-minted SSHSIGs (framing replicated in reconcile's test binary; authz untouched, no signing added to the verify-only package): unsigned job + unsigned desired-state delta -> pending_signature; unknown signer/expired/replay-across-restart/wrong host -> typed authz rejections; wrong guest/op/params -> binding_mismatch; recovery key on ordinary destructive -> role_denied; hub-supplied scratch tag ignored -> refused; valid+role+target+fresh nonce -> accepted then replay rejected. Full module race-clean + vet-clean on the Linux build server. Inert this slice: no destructive deltas served until slice 10; the destructive path is classified, gated, and tested but not wired to live execution. CHECKPOINT: Phase B complete (slice 4 done). Awaiting validation. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
package reconcile
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
|
||||
)
|
||||
|
||||
func seedInFlight(t *testing.T, j *Journal, e JournalEntry) {
|
||||
t.Helper()
|
||||
e.State = OpTaskRunning
|
||||
if e.At.IsZero() {
|
||||
e.At = time.Now().UTC()
|
||||
}
|
||||
if err := j.Append(e); err != nil {
|
||||
t.Fatalf("seed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecover_TaskCompletedOKMarksSucceeded(t *testing.T) {
|
||||
api := &fakeAPI{statusFunc: func(string) (proxmox.TaskStatus, error) {
|
||||
return proxmox.TaskStatus{Status: "stopped", ExitStatus: "OK"}, nil
|
||||
}}
|
||||
e, j, _ := newEngine(t, api, EmptyProvider{})
|
||||
seedInFlight(t, j, JournalEntry{OpID: "op1", VMID: 100, Kind: "set_config", UPID: "UPID:x:", IdempKey: "k1"})
|
||||
|
||||
res := e.Recover(context.Background())
|
||||
if res.Examined != 1 || res.Resumed != 1 {
|
||||
t.Fatalf("want 1 resumed, got %+v", res)
|
||||
}
|
||||
if len(j.InFlight()) != 0 {
|
||||
t.Errorf("resolved op should not be in-flight: %+v", j.InFlight())
|
||||
}
|
||||
// A resumed one-shot op marks its idempotency key applied (it really completed) —
|
||||
// this is the case idempotency-alone could not cover (Note 1).
|
||||
if !j.AlreadyApplied("k1") {
|
||||
t.Error("a recovered-succeeded op must mark its idempotency key applied")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecover_TaskEndedNonOKMarksFailed(t *testing.T) {
|
||||
api := &fakeAPI{statusFunc: func(string) (proxmox.TaskStatus, error) {
|
||||
return proxmox.TaskStatus{Status: "stopped", ExitStatus: "got 403"}, nil
|
||||
}}
|
||||
e, j, _ := newEngine(t, api, EmptyProvider{})
|
||||
seedInFlight(t, j, JournalEntry{OpID: "op2", VMID: 100, Kind: "guest_destroy", UPID: "UPID:x:", IdempKey: "k2"})
|
||||
|
||||
res := e.Recover(context.Background())
|
||||
if res.Failed != 1 {
|
||||
t.Fatalf("want 1 failed, got %+v", res)
|
||||
}
|
||||
if j.AlreadyApplied("k2") {
|
||||
t.Error("a failed op must NOT mark its key applied (it may be retried)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecover_TaskStillRunningLeftInFlight(t *testing.T) {
|
||||
api := &fakeAPI{statusFunc: func(string) (proxmox.TaskStatus, error) {
|
||||
return proxmox.TaskStatus{Status: "running"}, nil
|
||||
}}
|
||||
e, j, _ := newEngine(t, api, EmptyProvider{})
|
||||
seedInFlight(t, j, JournalEntry{OpID: "op3", VMID: 100, Kind: "set_config", UPID: "UPID:x:"})
|
||||
|
||||
res := e.Recover(context.Background())
|
||||
if res.StillRunning != 1 || len(j.InFlight()) != 1 {
|
||||
t.Fatalf("still-running task must be left in-flight, got res=%+v inflight=%d", res, len(j.InFlight()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecover_NoTaskIDRolledBack(t *testing.T) {
|
||||
// OpStarted with no UPID: the POST was never confirmed → abandon (fail-safe).
|
||||
e, j, _ := newEngine(t, &fakeAPI{}, EmptyProvider{})
|
||||
if err := j.Append(JournalEntry{OpID: "op4", VMID: 100, Kind: "start", State: OpStarted, At: time.Now().UTC()}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
res := e.Recover(context.Background())
|
||||
if res.RolledBack != 1 || len(j.InFlight()) != 0 {
|
||||
t.Fatalf("no-task op must be rolled back, got res=%+v inflight=%d", res, len(j.InFlight()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecover_UnreadableStatusLeftInFlight(t *testing.T) {
|
||||
api := &fakeAPI{statusFunc: func(string) (proxmox.TaskStatus, error) {
|
||||
return proxmox.TaskStatus{}, errors.New("api unreachable")
|
||||
}}
|
||||
e, j, _ := newEngine(t, api, EmptyProvider{})
|
||||
seedInFlight(t, j, JournalEntry{OpID: "op5", VMID: 100, Kind: "set_config", UPID: "UPID:x:"})
|
||||
|
||||
res := e.Recover(context.Background())
|
||||
if res.Unresolved != 1 || len(j.InFlight()) != 1 {
|
||||
t.Fatalf("unreadable status must leave op in-flight, got res=%+v inflight=%d", res, len(j.InFlight()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRecover_EmptyJournalNoop(t *testing.T) {
|
||||
e, _, _ := newEngine(t, &fakeAPI{}, EmptyProvider{})
|
||||
if res := e.Recover(context.Background()); res.Examined != 0 {
|
||||
t.Errorf("empty journal recover should be a no-op, got %+v", res)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user