v0.4.0-rc1: slice 4 Phase A — reconcile engine (structural, runs live unfed)

New internal/reconcile package: the agent-side control core's structural half.

- Per-guest serializer Queue (doc 03 §10): the single choke point all mutation
  sources funnel through; same-vmid serial in submit order, different vmids
  parallel (cond-var FIFO lanes).
- Desired-state model + DesiredProvider seam; EmptyProvider is the only live
  source at slice 4 (no hub serving until slice 10) so the live engine computes
  an empty action set and performs zero mutations.
- Normalization layer (FieldNormalizers): desired and actual values are compared
  in normalized form so Proxmox round-trip quirks don't read as drift. normDesc
  was promoted out of main.go to reconcile.NormDescription; selftest uses the
  shared helper.
- Plan (pure diff): minimal benign action set (Start/Stop/SetConfig) for guests
  in both desired and actual; provision/destroy out of scope here.
- Engine: dispatches onto the shared queue; honors the dual-mode SetConfig
  contract (UPID -> WaitTask; empty UPID -> synchronous success).
- Durable op journal + idempotency store (mirrors authz.FileNonceStore):
  in-flight task ids for crash detection + AlreadyApplied dedupe across restart.
- Wired into runDaemon alongside the hub loop, sharing the queue; runs cleanly
  with no desired state and no signers.

Full module race-clean and vet-clean on the Linux build server.

CHECKPOINT: Phase A only. Awaiting validation before Phase B (the reversibility
gate + signed-op consuming layer, landing v0.4.0).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 23:21:55 +02:00
parent 605ce25f58
commit 05c450147c
16 changed files with 1904 additions and 78 deletions
+212
View File
@@ -0,0 +1,212 @@
package reconcile
import (
"context"
"errors"
"path/filepath"
"sync"
"testing"
"gitea.dooplex.hu/admin/felhom-agent/internal/hub"
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
)
// fakeAPI is a scriptable GuestAPI stand-in for the engine tests. Every mutating
// call is logged, and the UPID it hands back is canned: "" models a synchronous
// apply, a non-empty value models an async task whose verdict comes from WaitTask.
type fakeAPI struct {
	// mu guards the appends to the call logs below (starts, stops, sets, waits).
	mu sync.Mutex

	// lxc is the guest list ListLXC returns.
	lxc []proxmox.Guest

	// cfg holds the per-vmid answers for GuestConfig; a missing vmid is an error.
	cfg map[int]proxmox.GuestConfig

	// listErr, when non-nil, makes ListLXC fail with it.
	listErr error

	// startUPID and startErr are the canned result of Start.
	startUPID string
	startErr  error

	// stopUPID and stopErr are the canned result of Stop.
	stopUPID string
	stopErr  error

	// setUPID and setErr are the canned result of SetConfig.
	setUPID string
	setErr  error

	// waitFunc maps a UPID to a (status, err) pair; when nil, WaitTask reports OK.
	// It mirrors the real client, which errors on a non-OK exitstatus.
	waitFunc func(upid string) (proxmox.TaskStatus, error)

	// Call logs, in the order the calls arrived.
	starts []int     // vmids passed to Start
	stops  []int     // vmids passed to Stop
	sets   []setCall // arguments passed to SetConfig
	waits  []string  // UPIDs passed to WaitTask
}
// setCall is one recorded SetConfig invocation. The field order (vmid, then params)
// is relied on by positional literals, so keep it stable.
type setCall struct {
	// vmid is the guest the config change targeted.
	vmid int
	// params are the config key/value pairs that were submitted.
	params map[string]string
}
// ListLXC returns the canned guest list, or listErr (with a nil list) when the test
// has configured one.
func (f *fakeAPI) ListLXC(context.Context) ([]proxmox.Guest, error) {
	if err := f.listErr; err != nil {
		return nil, err
	}
	return f.lxc, nil
}
// GuestConfig looks vmid up in the canned cfg map. A vmid with no entry yields a
// zero GuestConfig and a "no config" error.
func (f *fakeAPI) GuestConfig(_ context.Context, vmid int) (proxmox.GuestConfig, error) {
	if conf, found := f.cfg[vmid]; found {
		return conf, nil
	}
	return proxmox.GuestConfig{}, errors.New("no config")
}
// Start logs the vmid under the mutex and answers with the canned startUPID and
// startErr, whatever they are set to.
func (f *fakeAPI) Start(_ context.Context, vmid int) (string, error) {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.starts = append(f.starts, vmid)
	return f.startUPID, f.startErr
}
// Stop logs the vmid under the mutex and answers with the canned stopUPID and
// stopErr, whatever they are set to.
func (f *fakeAPI) Stop(_ context.Context, vmid int) (string, error) {
	f.mu.Lock()
	defer f.mu.Unlock()

	f.stops = append(f.stops, vmid)
	return f.stopUPID, f.stopErr
}
// SetConfig records the call and answers with the canned setUPID and setErr.
//
// The params map is snapshotted before it is logged: storing the caller's map by
// reference would let a later mutation (or reuse) of that map by the code under
// test silently rewrite what the test believes was submitted. A nil params map is
// recorded as nil.
func (f *fakeAPI) SetConfig(_ context.Context, vmid int, params map[string]string) (string, error) {
	var snapshot map[string]string
	if params != nil {
		snapshot = make(map[string]string, len(params))
		for key, value := range params {
			snapshot[key] = value
		}
	}

	f.mu.Lock()
	f.sets = append(f.sets, setCall{vmid: vmid, params: snapshot})
	f.mu.Unlock()
	return f.setUPID, f.setErr
}
// WaitTask logs the UPID it was asked about, then defers to waitFunc for the
// verdict. With no waitFunc configured it reports a finished task with exit
// status "OK" and a nil error.
func (f *fakeAPI) WaitTask(_ context.Context, upid string, _ proxmox.WaitOptions) (proxmox.TaskStatus, error) {
	// The lock covers only the log append; it is released before waitFunc runs.
	f.mu.Lock()
	f.waits = append(f.waits, upid)
	f.mu.Unlock()

	if f.waitFunc == nil {
		return proxmox.TaskStatus{Status: "stopped", ExitStatus: "OK"}, nil
	}
	return f.waitFunc(upid)
}
// newEngine assembles an Engine for a test: a journal opened at journal.log inside
// the test's temp dir, a fresh Queue, and the supplied API and provider. The journal
// and queue are returned too so tests can inspect them. Both are closed via
// t.Cleanup; cleanups run last-registered-first, so the queue is closed before the
// journal.
func newEngine(t *testing.T, api GuestAPI, provider DesiredProvider) (*Engine, *Journal, *Queue) {
	t.Helper()

	journal, err := OpenJournal(filepath.Join(t.TempDir(), "journal.log"))
	if err != nil {
		t.Fatalf("OpenJournal: %v", err)
	}
	// Whatever Close returns is deliberately dropped here.
	t.Cleanup(func() { journal.Close() })

	queue := NewQueue()
	t.Cleanup(queue.Close)

	opts := EngineOptions{API: api, Queue: queue, Journal: journal, Provider: provider}
	return NewEngine(opts), journal, queue
}
// TestEngine_EmptyProviderNoMutations checks that a pass driven by EmptyProvider
// plans and executes nothing and never calls Start, Stop or SetConfig, even though
// a running guest exists on the API side.
func TestEngine_EmptyProviderNoMutations(t *testing.T) {
	fake := &fakeAPI{
		lxc: []proxmox.Guest{{VMID: 100, Status: "running"}},
		cfg: map[int]proxmox.GuestConfig{100: {Cores: 2}},
	}
	engine, _, _ := newEngine(t, fake, EmptyProvider{})

	result, err := engine.Reconcile(context.Background())
	if err != nil {
		t.Fatalf("Reconcile: %v", err)
	}
	if !(result.Planned == 0 && result.Executed == 0) {
		t.Errorf("EmptyProvider should plan nothing, got %+v", result)
	}

	mutations := len(fake.starts) + len(fake.stops) + len(fake.sets)
	if mutations != 0 {
		t.Errorf("EmptyProvider mutated Proxmox: starts=%v stops=%v sets=%v", fake.starts, fake.stops, fake.sets)
	}
}
// TestEngine_AsyncStartWaitsTask covers the async half of the dual-mode contract:
// when Start hands back a non-empty UPID, the engine must call WaitTask exactly once
// and on that same UPID, count the op as executed, and leave nothing in-flight in
// the journal.
func TestEngine_AsyncStartWaitsTask(t *testing.T) {
	const startUPID = "UPID:demo:start:100:"
	api := &fakeAPI{
		lxc:       []proxmox.Guest{{VMID: 100, Status: "stopped"}},
		cfg:       map[int]proxmox.GuestConfig{100: {Cores: 2}},
		startUPID: startUPID,
	}
	e, j, _ := newEngine(t, api, StaticProvider{State: desired(DesiredGuest{VMID: 100, Run: RunRunning})})

	res, err := e.Reconcile(context.Background())
	if err != nil {
		t.Fatalf("Reconcile: %v", err)
	}
	if res.Executed != 1 || res.Failed != 0 {
		t.Fatalf("want 1 executed, got %+v", res)
	}
	if len(api.starts) != 1 || api.starts[0] != 100 {
		t.Errorf("expected Start(100), got %v", api.starts)
	}

	// Counting the waits alone would let an engine that polls the wrong (or an
	// empty) UPID slip through, so the waited UPID is pinned as well.
	switch {
	case len(api.waits) != 1:
		t.Errorf("async op must WaitTask, got waits=%v", api.waits)
	case api.waits[0] != startUPID:
		t.Errorf("WaitTask must poll the UPID returned by Start: got %q, want %q", api.waits[0], startUPID)
	}

	if inFlight := j.InFlight(); len(inFlight) != 0 {
		t.Errorf("no ops should be in-flight after success: %+v", inFlight)
	}
}
// TestEngine_SynchronousSetConfigNoWait covers the synchronous half of the dual-mode
// SetConfig contract. An empty UPID means PVE applied the change synchronously
// (slice-4 proven for description), so the op must count as executed and WaitTask
// must never be called.
func TestEngine_SynchronousSetConfigNoWait(t *testing.T) {
	fake := &fakeAPI{
		lxc:     []proxmox.Guest{{VMID: 100, Status: "stopped"}},
		cfg:     map[int]proxmox.GuestConfig{100: {Cores: 2}},
		setUPID: "", // synchronous
	}
	want := desired(DesiredGuest{
		VMID: 100,
		Spec: &hub.GuestSpec{Cores: 4, MemoryBytes: mib(2048)},
	})
	engine, _, _ := newEngine(t, fake, StaticProvider{State: want})

	result, err := engine.Reconcile(context.Background())
	if err != nil {
		t.Fatalf("Reconcile: %v", err)
	}
	if result.Executed != 1 {
		t.Fatalf("want 1 executed, got %+v", result)
	}

	if !(len(fake.sets) == 1 && fake.sets[0].params["cores"] == "4") {
		t.Errorf("expected SetConfig cores=4, got %v", fake.sets)
	}
	if len(fake.waits) > 0 {
		t.Errorf("synchronous op must NOT WaitTask, got waits=%v", fake.waits)
	}
}
func TestEngine_WaitTaskFailureCountsFailed(t *testing.T) {
api := &fakeAPI{
lxc: []proxmox.Guest{{VMID: 100, Status: "stopped"}},
cfg: map[int]proxmox.GuestConfig{100: {Cores: 2}},
startUPID: "UPID:demo:start:100:",
waitFunc: func(string) (proxmox.TaskStatus, error) {
return proxmox.TaskStatus{Status: "stopped", ExitStatus: "got 403"}, errors.New("task failed: got 403")
},
}
e, j, _ := newEngine(t, api, StaticProvider{State: desired(DesiredGuest{VMID: 100, Run: RunRunning})})
res, err := e.Reconcile(context.Background())
if err != nil {
t.Fatalf("Reconcile (pass): %v", err)
}
if res.Failed != 1 || res.Executed != 0 {
t.Fatalf("want 1 failed, got %+v", res)
}
// The failed op is journaled terminal (failed), not left in-flight.
if len(j.InFlight()) != 0 {
t.Errorf("failed op should be terminal, in-flight=%+v", j.InFlight())
}
}
// TestEngine_PostErrorCountsFailed checks that a Start call which itself errors is
// tallied as one failed op and that the engine never goes on to WaitTask for it.
func TestEngine_PostErrorCountsFailed(t *testing.T) {
	fake := &fakeAPI{
		lxc:      []proxmox.Guest{{VMID: 100, Status: "stopped"}},
		cfg:      map[int]proxmox.GuestConfig{100: {Cores: 2}},
		startErr: errors.New("connection refused"),
	}
	want := desired(DesiredGuest{VMID: 100, Run: RunRunning})
	engine, _, _ := newEngine(t, fake, StaticProvider{State: want})

	// The pass-level error is not asserted here; only the per-op accounting is
	// pinned. NOTE(review): confirm whether Reconcile is meant to return nil in
	// this case and, if so, assert it.
	result, _ := engine.Reconcile(context.Background())
	if result.Failed != 1 {
		t.Fatalf("want 1 failed on POST error, got %+v", result)
	}
	if len(fake.waits) > 0 {
		t.Errorf("POST error must not reach WaitTask, got %v", fake.waits)
	}
}
// TestEngine_ListErrorIsPassFailure checks that Reconcile returns an error for the
// whole pass when ListLXC fails, i.e. when actual state cannot be read.
func TestEngine_ListErrorIsPassFailure(t *testing.T) {
	fake := &fakeAPI{listErr: errors.New("api down")}
	want := desired(DesiredGuest{VMID: 100, Run: RunRunning})
	engine, _, _ := newEngine(t, fake, StaticProvider{State: want})

	_, err := engine.Reconcile(context.Background())
	if err == nil {
		t.Error("expected a pass-level error when actual state can't be read")
	}
}