Files
felhom-agent/internal/reconcile/engine_test.go
T
admin 1af21a6cac v0.4.0: slice 4 Phase B — reversibility gate + signed-op consuming layer
The security core of slice 4: hub-supplied intent is no longer trusted for
destructive change. The gate fronts the per-guest queue's executor, so every
mutation passes it. Reuses internal/authz for all crypto (surface untouched).

- Classifier (doc 03 §4): benign vs destructive by provenance and
  data-bearing-ness, NOT by verb. Destroy/overwrite of customer data is
  destructive unless agent-internal provenance (same-journaled-txn create, or
  agent-tagged scratch) makes it benign — and that provenance is
  journal-recorded, NEVER hub-sourced. Unknown op class fails safe to destructive.
- Reversibility gate: benign -> allowed unsigned; destructive -> requires a
  verified, role-scoped, action-bound operator signature, else pending_signature
  and never executed. Every decision audited (signal, never the guard).
- Signed-op consuming layer over authz.Verifier.Verify (locked pipeline
  untouched): role-scoping (doc 04 §4 — recovery = rotation only;
  operational = ordinary destructive + planned rotation) plus op-to-action
  binding (op + host + guest + params must match the gated action).
- Signed-job orchestration: idempotency dedupe by nonce + journal-wrapped
  execution via an injected DestructiveExecutor (nil this slice — inert).
- Crash recovery (Note 1): Engine.Recover consumes the journal InFlight() set at
  startup (resume-or-rollback). This covers an op that crashed after the POST
  but before its terminal record — a case idempotency dedupe alone cannot
  cover. Added TaskStatusOnce to the GuestAPI seam. Wired into daemon startup.
- Note 2: memory comparison canonicalized to MiB (desiredMemoryMiB) so a
  non-MiB-aligned MemoryBytes converges in one pass, not perpetual drift.
- Daemon: builds the verifier from config signers (none = nil verifier, the
  common slice-4 state), the gate (+SlogAudit), runs Recover before mutating.

Adversarial matrix proven against the REAL authz.Verifier with in-test-minted
SSHSIGs (framing replicated in reconcile's test binary; authz untouched, no
signing added to the verify-only package): unsigned job + unsigned desired-state
delta -> pending_signature; unknown signer/expired/replay-across-restart/wrong
host -> typed authz rejections; wrong guest/op/params -> binding_mismatch;
recovery key on ordinary destructive -> role_denied; hub-supplied scratch tag
ignored -> refused; valid+role+target+fresh nonce -> accepted then replay
rejected. Full module race-clean + vet-clean on the Linux build server.

Inert this slice: no destructive deltas served until slice 10; the destructive
path is classified, gated, and tested but not wired to live execution.

CHECKPOINT: Phase B complete (slice 4 done). Awaiting validation.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 23:56:20 +02:00

222 lines
6.8 KiB
Go

package reconcile
import (
"context"
"errors"
"path/filepath"
"sync"
"testing"
"gitea.dooplex.hu/admin/felhom-agent/internal/hub"
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
)
// fakeAPI is the GuestAPI test double used by the engine tests. Mutating calls
// are logged, and the canned UPIDs pick the execution path: "" means the call
// completed synchronously, a non-empty UPID means an async task was started.
type fakeAPI struct {
	// Canned "actual state" served by ListLXC and GuestConfig.
	lxc     []proxmox.Guest
	cfg     map[int]proxmox.GuestConfig
	listErr error

	// Canned replies for the mutating calls (Start, Stop, SetConfig).
	startUPID string
	startErr  error
	stopUPID  string
	stopErr   error
	setUPID   string
	setErr    error

	// waitFunc maps a UPID to WaitTask's (status, err); nil means every task
	// reports OK. Mirrors the real client, which errors on a non-OK exitstatus.
	waitFunc func(upid string) (proxmox.TaskStatus, error)
	// statusFunc backs TaskStatusOnce (crash recovery); nil means stopped/OK.
	statusFunc func(upid string) (proxmox.TaskStatus, error)

	// mu serializes the appends to the call log below.
	mu     sync.Mutex
	starts []int
	stops  []int
	sets   []setCall
	waits  []string
}
// TaskStatusOnce answers the status lookup used by crash recovery. Without a
// statusFunc the task is reported as stopped with exit status OK, echoing the
// requested UPID; otherwise statusFunc decides the reply.
func (f *fakeAPI) TaskStatusOnce(_ context.Context, upid string) (proxmox.TaskStatus, error) {
	if f.statusFunc == nil {
		return proxmox.TaskStatus{UPID: upid, Status: "stopped", ExitStatus: "OK"}, nil
	}
	return f.statusFunc(upid)
}
// setCall is one recorded SetConfig invocation on fakeAPI.
type setCall struct {
// vmid is the guest the call was addressed to.
vmid int
// params is the parameter map as handed to SetConfig (stored as-is, not copied).
params map[string]string
}
// ListLXC returns the canned guest list, or listErr (with a nil list) when the
// fake is configured to fail the listing.
func (f *fakeAPI) ListLXC(context.Context) ([]proxmox.Guest, error) {
	if err := f.listErr; err != nil {
		return nil, err
	}
	return f.lxc, nil
}
// GuestConfig looks up the canned config for vmid. A vmid that has no entry in
// cfg yields a zero GuestConfig and a "no config" error.
func (f *fakeAPI) GuestConfig(_ context.Context, vmid int) (proxmox.GuestConfig, error) {
	if conf, found := f.cfg[vmid]; found {
		return conf, nil
	}
	return proxmox.GuestConfig{}, errors.New("no config")
}
// Start logs the vmid in starts and replies with the canned startUPID/startErr.
func (f *fakeAPI) Start(_ context.Context, vmid int) (string, error) {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.starts = append(f.starts, vmid)
	return f.startUPID, f.startErr
}
// Stop logs the vmid in stops and replies with the canned stopUPID/stopErr.
func (f *fakeAPI) Stop(_ context.Context, vmid int) (string, error) {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.stops = append(f.stops, vmid)
	return f.stopUPID, f.stopErr
}
// SetConfig logs the (vmid, params) pair in sets and replies with the canned
// setUPID/setErr. The params map is stored as passed, without copying.
func (f *fakeAPI) SetConfig(_ context.Context, vmid int, params map[string]string) (string, error) {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.sets = append(f.sets, setCall{vmid: vmid, params: params})
	return f.setUPID, f.setErr
}
// WaitTask logs the awaited UPID in waits and returns the task's final verdict.
//
// When waitFunc is set, the verdict is whatever waitFunc returns for upid.
// Otherwise the task is reported as stopped with exit status OK, and the status
// echoes the awaited UPID so that this default agrees with the default reply of
// TaskStatusOnce.
func (f *fakeAPI) WaitTask(_ context.Context, upid string, _ proxmox.WaitOptions) (proxmox.TaskStatus, error) {
	// Hold the lock for the append only: waitFunc is supplied by the test and is
	// invoked without f.mu held.
	f.mu.Lock()
	f.waits = append(f.waits, upid)
	f.mu.Unlock()

	if f.waitFunc != nil {
		return f.waitFunc(upid)
	}
	return proxmox.TaskStatus{UPID: upid, Status: "stopped", ExitStatus: "OK"}, nil
}
// newEngine builds an Engine around api and provider, backed by a fresh journal
// file in the test's temp dir and a new queue. It returns all three so tests can
// inspect the journal and queue directly.
//
// Both resources are released through t.Cleanup. Cleanups run last-registered
// first, so the queue is closed before the journal.
func newEngine(t *testing.T, api GuestAPI, provider DesiredProvider) (*Engine, *Journal, *Queue) {
	t.Helper()

	journal, err := OpenJournal(filepath.Join(t.TempDir(), "journal.log"))
	if err != nil {
		t.Fatalf("OpenJournal: %v", err)
	}
	t.Cleanup(func() { journal.Close() })

	queue := NewQueue()
	t.Cleanup(queue.Close)

	opts := EngineOptions{API: api, Queue: queue, Journal: journal, Provider: provider}
	return NewEngine(opts), journal, queue
}
// TestEngine_EmptyProviderNoMutations checks that a pass driven by EmptyProvider
// plans and executes nothing and issues no Start/Stop/SetConfig call.
func TestEngine_EmptyProviderNoMutations(t *testing.T) {
	fake := &fakeAPI{
		lxc: []proxmox.Guest{{VMID: 100, Status: "running"}},
		cfg: map[int]proxmox.GuestConfig{100: {Cores: 2}},
	}
	engine, _, _ := newEngine(t, fake, EmptyProvider{})

	res, err := engine.Reconcile(context.Background())
	if err != nil {
		t.Fatalf("Reconcile: %v", err)
	}

	if !(res.Planned == 0 && res.Executed == 0) {
		t.Errorf("EmptyProvider should plan nothing, got %+v", res)
	}
	if mutations := len(fake.starts) + len(fake.stops) + len(fake.sets); mutations != 0 {
		t.Errorf("EmptyProvider mutated Proxmox: starts=%v stops=%v sets=%v", fake.starts, fake.stops, fake.sets)
	}
}
// TestEngine_AsyncStartWaitsTask checks the async path: a Start that returns a
// non-empty UPID must be followed by exactly one WaitTask, count as executed,
// and leave nothing in-flight in the journal.
func TestEngine_AsyncStartWaitsTask(t *testing.T) {
	fake := &fakeAPI{
		lxc:       []proxmox.Guest{{VMID: 100, Status: "stopped"}},
		cfg:       map[int]proxmox.GuestConfig{100: {Cores: 2}},
		startUPID: "UPID:demo:start:100:",
	}
	want := StaticProvider{State: desired(DesiredGuest{VMID: 100, Run: RunRunning})}
	engine, journal, _ := newEngine(t, fake, want)

	res, err := engine.Reconcile(context.Background())
	if err != nil {
		t.Fatalf("Reconcile: %v", err)
	}
	if !(res.Executed == 1 && res.Failed == 0) {
		t.Fatalf("want 1 executed, got %+v", res)
	}

	if got := fake.starts; len(got) != 1 || got[0] != 100 {
		t.Errorf("expected Start(100), got %v", got)
	}
	if got := fake.waits; len(got) != 1 {
		t.Errorf("async op must WaitTask, got waits=%v", got)
	}
	if pending := journal.InFlight(); len(pending) != 0 {
		t.Errorf("no ops should be in-flight after success: %+v", pending)
	}
}
// TestEngine_SynchronousSetConfigNoWait checks the synchronous path: an empty
// UPID from SetConfig means PVE applied the change synchronously (slice-4 proven
// for description), so the op must count as success with no WaitTask call.
func TestEngine_SynchronousSetConfigNoWait(t *testing.T) {
	fake := &fakeAPI{
		lxc:     []proxmox.Guest{{VMID: 100, Status: "stopped"}},
		cfg:     map[int]proxmox.GuestConfig{100: {Cores: 2}},
		setUPID: "", // synchronous
	}
	spec := &hub.GuestSpec{Cores: 4, MemoryBytes: mib(2048)}
	want := StaticProvider{State: desired(DesiredGuest{VMID: 100, Spec: spec})}
	engine, _, _ := newEngine(t, fake, want)

	res, err := engine.Reconcile(context.Background())
	if err != nil {
		t.Fatalf("Reconcile: %v", err)
	}
	if res.Executed != 1 {
		t.Fatalf("want 1 executed, got %+v", res)
	}

	if got := fake.sets; len(got) != 1 || got[0].params["cores"] != "4" {
		t.Errorf("expected SetConfig cores=4, got %v", got)
	}
	if got := fake.waits; len(got) != 0 {
		t.Errorf("synchronous op must NOT WaitTask, got waits=%v", got)
	}
}
func TestEngine_WaitTaskFailureCountsFailed(t *testing.T) {
api := &fakeAPI{
lxc: []proxmox.Guest{{VMID: 100, Status: "stopped"}},
cfg: map[int]proxmox.GuestConfig{100: {Cores: 2}},
startUPID: "UPID:demo:start:100:",
waitFunc: func(string) (proxmox.TaskStatus, error) {
return proxmox.TaskStatus{Status: "stopped", ExitStatus: "got 403"}, errors.New("task failed: got 403")
},
}
e, j, _ := newEngine(t, api, StaticProvider{State: desired(DesiredGuest{VMID: 100, Run: RunRunning})})
res, err := e.Reconcile(context.Background())
if err != nil {
t.Fatalf("Reconcile (pass): %v", err)
}
if res.Failed != 1 || res.Executed != 0 {
t.Fatalf("want 1 failed, got %+v", res)
}
// The failed op is journaled terminal (failed), not left in-flight.
if len(j.InFlight()) != 0 {
t.Errorf("failed op should be terminal, in-flight=%+v", j.InFlight())
}
}
// TestEngine_PostErrorCountsFailed checks that an error returned by the Start
// call itself is counted as a failed op and that WaitTask is never reached.
func TestEngine_PostErrorCountsFailed(t *testing.T) {
	api := &fakeAPI{
		lxc:      []proxmox.Guest{{VMID: 100, Status: "stopped"}},
		cfg:      map[int]proxmox.GuestConfig{100: {Cores: 2}},
		startErr: errors.New("connection refused"),
	}
	e, _, _ := newEngine(t, api, StaticProvider{State: desired(DesiredGuest{VMID: 100, Run: RunRunning})})

	res, err := e.Reconcile(context.Background())
	if err != nil {
		// This test pins only the per-op accounting, so a pass-level error is not
		// asserted either way. It is logged rather than dropped so that a failing
		// assertion below comes with the reason.
		t.Logf("Reconcile returned error: %v", err)
	}

	if res.Failed != 1 {
		t.Fatalf("want 1 failed on POST error, got %+v", res)
	}
	if len(api.waits) != 0 {
		t.Errorf("POST error must not reach WaitTask, got %v", api.waits)
	}
}
// TestEngine_ListErrorIsPassFailure checks that failing to list the actual
// guests makes Reconcile return a pass-level error.
func TestEngine_ListErrorIsPassFailure(t *testing.T) {
	fake := &fakeAPI{listErr: errors.New("api down")}
	want := StaticProvider{State: desired(DesiredGuest{VMID: 100, Run: RunRunning})}
	engine, _, _ := newEngine(t, fake, want)

	_, err := engine.Reconcile(context.Background())
	if err == nil {
		t.Error("expected a pass-level error when actual state can't be read")
	}
}