v0.4.0-rc1: slice 4 Phase A — reconcile engine (structural, runs live unfed)

New internal/reconcile package: the agent-side control core's structural half.

- Per-guest serializer Queue (doc 03 §10): the single choke point all mutation
  sources funnel through; same-vmid serial in submit order, different vmids
  parallel (cond-var FIFO lanes).
- Desired-state model + DesiredProvider seam; EmptyProvider is the only live
  source at slice 4 (no hub serving until slice 10) so the live engine computes
  an empty action set and performs zero mutations.
- Normalization layer (FieldNormalizers): normalized desired-vs-actual so
  Proxmox round-trip quirks don't read as drift. normDesc promoted out of
  main.go to reconcile.NormDescription; selftest uses the shared helper.
- Plan (pure diff): minimal benign action set (Start/Stop/SetConfig) for guests
  in both desired and actual; provision/destroy out of scope here.
- Engine: dispatches onto the shared queue; honors the dual-mode SetConfig
  contract (UPID -> WaitTask; empty UPID -> synchronous success).
- Durable op journal + idempotency store (mirrors authz.FileNonceStore):
  in-flight task ids for crash detection + AlreadyApplied dedupe across restart.
- Wired into runDaemon alongside the hub loop, sharing the queue; runs cleanly
  with no desired state and no signers.

Full module race-clean and vet-clean on the Linux build server.

CHECKPOINT: Phase A only. Awaiting validation before Phase B (the reversibility
gate + signed-op consuming layer, landing v0.4.0).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 23:21:55 +02:00
parent 605ce25f58
commit 05c450147c
16 changed files with 1904 additions and 78 deletions
+137
View File
@@ -0,0 +1,137 @@
package reconcile
import (
"errors"
"sync"
"testing"
"time"
)
// TestQueue_SameGuestSerialized asserts that jobs for one vmid run strictly
// one-at-a-time in submit order — the core §10 guarantee that keeps Proxmox from
// seeing concurrent conflicting ops on a guest.
//
// It submits n jobs for the same vmid. Each job records its own index and
// tracks how many jobs are executing at that moment. Once every result channel
// has delivered, the test checks that no two jobs ever overlapped and that the
// recorded order equals the submit order.
func TestQueue_SameGuestSerialized(t *testing.T) {
	q := NewQueue()
	defer q.Close()

	const n = 50

	var (
		mu            sync.Mutex // guards order, inside and maxConcurrent
		order         []int      // job indices in the order the jobs ran
		inside        int        // jobs currently between entry and exit
		maxConcurrent int        // high-water mark of inside
	)

	chans := make([]<-chan error, n)
	for i := 0; i < n; i++ {
		i := i // per-iteration copy for the closure (required before Go 1.22)
		chans[i] = q.Submit(100, func() error {
			mu.Lock()
			inside++
			if inside > maxConcurrent {
				maxConcurrent = inside
			}
			order = append(order, i)
			mu.Unlock()

			time.Sleep(time.Millisecond) // widen any overlap window

			mu.Lock()
			inside--
			mu.Unlock()
			return nil
		})
	}

	// Wait for every job to report before inspecting the shared counters.
	for _, ch := range chans {
		if err := <-ch; err != nil {
			t.Fatalf("unexpected job error: %v", err)
		}
	}

	if maxConcurrent != 1 {
		t.Errorf("same-guest jobs overlapped: maxConcurrent=%d, want 1", maxConcurrent)
	}

	// Check the length first so a missing or duplicated run is reported as a
	// test failure rather than an index-out-of-range panic in the loop below.
	if len(order) != n {
		t.Fatalf("recorded %d job runs, want %d: %v", len(order), n, order)
	}
	for i, got := range order {
		if got != i {
			// Fatalf stops the test, so no explicit break is needed.
			t.Fatalf("same-guest jobs ran out of submit order: got %v", order)
		}
	}
}
// TestQueue_DifferentGuestsParallel checks that jobs submitted for different
// vmids make progress at the same time. Each job signals that it has started
// and then waits for the other job's signal, so both can only finish if they
// run concurrently. If the queue ran them one after another, the first job
// would hit its two-second timeout and return an error.
func TestQueue_DifferentGuestsParallel(t *testing.T) {
	q := NewQueue()
	defer q.Close()

	const peerTimeout = 2 * time.Second

	readyA := make(chan struct{})
	readyB := make(chan struct{})

	// rendezvous builds a job that announces itself by closing mine, then waits
	// for the peer's announcement; it fails with timeoutMsg if the peer never
	// shows up within peerTimeout.
	rendezvous := func(mine chan struct{}, peer <-chan struct{}, timeoutMsg string) func() error {
		return func() error {
			close(mine)
			select {
			case <-peer:
				return nil
			case <-time.After(peerTimeout):
				return errors.New(timeoutMsg)
			}
		}
	}

	resA := q.Submit(1, rendezvous(readyA, readyB, "guest 1 timed out waiting for guest 2 (not parallel)"))
	resB := q.Submit(2, rendezvous(readyB, readyA, "guest 2 timed out waiting for guest 1 (not parallel)"))

	// Collect guest 1's result first, then guest 2's, reporting either failure.
	for _, res := range []<-chan error{resA, resB} {
		if err := <-res; err != nil {
			t.Error(err)
		}
	}
}
// TestQueue_PropagatesJobError checks that the error a job returns is the very
// value delivered on the channel that Submit hands back.
func TestQueue_PropagatesJobError(t *testing.T) {
	q := NewQueue()
	defer q.Close()

	want := errors.New("boom")
	failing := func() error { return want }

	// Identity comparison on purpose: the exact error value must come through.
	got := <-q.Submit(7, failing)
	if got == want {
		return
	}
	t.Errorf("Submit result = %v, want %v", got, want)
}
// TestQueue_DrainsPendingOnClose checks that jobs submitted before Close still
// run to completion afterwards. Two jobs are submitted for the same vmid; the
// first blocks on a gate that is only opened after Close has been called, so
// the test relies on Close returning while that job is still outstanding.
func TestQueue_DrainsPendingOnClose(t *testing.T) {
	q := NewQueue()

	var ran sync.WaitGroup
	ran.Add(2)
	gate := make(chan struct{})

	// blocker holds until the gate opens; presumably this pins the vmid's lane
	// so that follower is still pending when Close is called.
	blocker := func() error {
		<-gate
		ran.Done()
		return nil
	}
	follower := func() error {
		ran.Done()
		return nil
	}

	first := q.Submit(5, blocker)
	second := q.Submit(5, follower)

	// Close before releasing the gate: neither job can have finished yet.
	q.Close()
	close(gate)

	if err := <-first; err != nil {
		t.Errorf("job1 err: %v", err)
	}
	if err := <-second; err != nil {
		t.Errorf("pending job2 should still run after Close, got: %v", err)
	}

	// Both job bodies must actually have executed, not just reported a result.
	ran.Wait()
}
// TestQueue_SubmitAfterClose checks that calling Submit on a queue that has
// already been closed delivers ErrQueueClosed on the result channel.
func TestQueue_SubmitAfterClose(t *testing.T) {
	q := NewQueue()
	q.Close()

	got := <-q.Submit(1, func() error { return nil })
	// errors.Is rather than != so the check keeps passing if the queue ever
	// wraps the sentinel with extra context.
	if !errors.Is(got, ErrQueueClosed) {
		t.Errorf("Submit after Close = %v, want ErrQueueClosed", got)
	}
}