v0.4.0-rc1: slice 4 Phase A — reconcile engine (structural, runs live unfed)

New internal/reconcile package: the agent-side control core's structural half.

- Per-guest serializer Queue (doc 03 §10): the single choke point all mutation sources funnel through; same-vmid serial in submit order, different vmids parallel (cond-var FIFO lanes).
- Desired-state model + DesiredProvider seam; EmptyProvider is the only live source at slice 4 (no hub serving until slice 10) so the live engine computes an empty action set and performs zero mutations.
- Normalization layer (FieldNormalizers): normalized desired-vs-actual so Proxmox round-trip quirks don't read as drift. normDesc promoted out of main.go to reconcile.NormDescription; selftest uses the shared helper.
- Plan (pure diff): minimal benign action set (Start/Stop/SetConfig) for guests in both desired and actual; provision/destroy out of scope here.
- Engine: dispatches onto the shared queue; honors the dual-mode SetConfig contract (UPID -> WaitTask; empty UPID -> synchronous success).
- Durable op journal + idempotency store (mirrors authz.FileNonceStore): in-flight task ids for crash detection + AlreadyApplied dedupe across restart.
- Wired into runDaemon alongside the hub loop, sharing the queue; runs cleanly with no desired state and no signers.

Full module race-clean and vet-clean on the Linux build server.

CHECKPOINT: Phase A only. Awaiting validation before Phase B (the reversibility gate + signed-op consuming layer, landing v0.4.0).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 23:21:55 +02:00
parent 605ce25f58
commit 05c450147c
16 changed files with 1904 additions and 78 deletions
@@ -0,0 +1,137 @@
+package reconcile
+
+import (
+	"errors"
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestQueue_SameGuestSerialized asserts that jobs for one vmid run strictly
+// one-at-a-time in submit order — the core §10 guarantee that keeps Proxmox from
+// seeing concurrent conflicting ops on a guest.
+//
+// It submits n jobs for the same vmid (100). Each job records its own index and
+// tracks how many jobs are executing at once. After every result channel has
+// delivered, the test checks that peak concurrency was exactly 1 and that the
+// recorded order equals the submit order.
+func TestQueue_SameGuestSerialized(t *testing.T) {
+	q := NewQueue()
+	defer q.Close()
+
+	const n = 50
+
+	// mu guards order, inside and maxConcurrent, which the jobs mutate.
+	var mu sync.Mutex
+	// order holds job indices in the sequence the jobs actually started.
+	var order []int
+	// inside counts jobs currently executing; maxConcurrent is its high-water mark.
+	inside := 0
+	maxConcurrent := 0
+
+	chans := make([]<-chan error, n)
+	for i := 0; i < n; i++ {
+		i := i // per-iteration copy for the closure (needed before Go 1.22)
+		chans[i] = q.Submit(100, func() error {
+			mu.Lock()
+			inside++
+			if inside > maxConcurrent {
+				maxConcurrent = inside
+			}
+			order = append(order, i)
+			mu.Unlock()
+
+			time.Sleep(time.Millisecond) // widen any overlap window
+
+			mu.Lock()
+			inside--
+			mu.Unlock()
+			return nil
+		})
+	}
+	for _, ch := range chans {
+		if err := <-ch; err != nil {
+			t.Fatalf("unexpected job error: %v", err)
+		}
+	}
+
+	if maxConcurrent != 1 {
+		t.Errorf("same-guest jobs overlapped: maxConcurrent=%d, want 1", maxConcurrent)
+	}
+	// Check the length first so a missing run fails the test instead of
+	// panicking with an index out of range in the loop below.
+	if len(order) != n {
+		t.Fatalf("recorded %d job runs, want %d: got %v", len(order), n, order)
+	}
+	for i, got := range order {
+		if got != i {
+			// Fatalf stops the test here, so no break is needed.
+			t.Fatalf("same-guest jobs ran out of submit order: got %v", order)
+		}
+	}
+}
+
+// TestQueue_DifferentGuestsParallel checks that jobs for distinct vmids make
+// progress at the same time. Each of the two jobs signals that it has started
+// and then waits for the other's signal; both can only succeed if they overlap.
+// A job that never sees its peer gives up after two seconds and returns an
+// error, which the test reports.
+func TestQueue_DifferentGuestsParallel(t *testing.T) {
+	q := NewQueue()
+	defer q.Close()
+
+	firstUp := make(chan struct{})
+	secondUp := make(chan struct{})
+
+	// rendezvous builds a job that closes its own signal channel, then blocks
+	// until the peer's channel is closed or the timeout elapses.
+	rendezvous := func(self, peer chan struct{}, timeoutMsg string) func() error {
+		return func() error {
+			close(self)
+			select {
+			case <-peer:
+				return nil
+			case <-time.After(2 * time.Second):
+				return errors.New(timeoutMsg)
+			}
+		}
+	}
+
+	resA := q.Submit(1, rendezvous(firstUp, secondUp, "guest 1 timed out waiting for guest 2 (not parallel)"))
+	resB := q.Submit(2, rendezvous(secondUp, firstUp, "guest 2 timed out waiting for guest 1 (not parallel)"))
+
+	// Collect both results in submit order, reporting any timeout.
+	for _, res := range []<-chan error{resA, resB} {
+		if err := <-res; err != nil {
+			t.Error(err)
+		}
+	}
+}
+
+// TestQueue_PropagatesJobError verifies that the exact error value a job returns
+// is what arrives on the channel handed back by Submit.
+func TestQueue_PropagatesJobError(t *testing.T) {
+	q := NewQueue()
+	defer q.Close()
+
+	want := errors.New("boom")
+	failing := func() error { return want }
+
+	got := <-q.Submit(7, failing)
+	if got == want {
+		return
+	}
+	t.Errorf("Submit result = %v, want %v", got, want)
+}
+
+// TestQueue_DrainsPendingOnClose checks that Close does not discard work that
+// was submitted before it: both the job occupying the lane and the one queued
+// behind it must still report a nil result.
+//
+// NOTE(review): Close is called before the gate is opened, so this presumably
+// relies on Close returning without waiting for the blocked job — confirm
+// against the Queue implementation.
+func TestQueue_DrainsPendingOnClose(t *testing.T) {
+	q := NewQueue()
+	gate := make(chan struct{})
+
+	var executed sync.WaitGroup
+	executed.Add(2)
+
+	// The head job parks on gate, pinning the vmid-5 lane so the follower sits pending.
+	head := q.Submit(5, func() error {
+		<-gate
+		executed.Done()
+		return nil
+	})
+	follower := q.Submit(5, func() error {
+		executed.Done()
+		return nil
+	})
+
+	// Close while the head job is queued/running and the follower is pending,
+	// and only then let the head job proceed.
+	q.Close()
+	close(gate)
+
+	if err := <-head; err != nil {
+		t.Errorf("job1 err: %v", err)
+	}
+	if err := <-follower; err != nil {
+		t.Errorf("pending job2 should still run after Close, got: %v", err)
+	}
+	executed.Wait()
+}
+
+// TestQueue_SubmitAfterClose confirms that Submit on an already-closed queue
+// delivers ErrQueueClosed on the returned result channel.
+func TestQueue_SubmitAfterClose(t *testing.T) {
+	q := NewQueue()
+	q.Close()
+
+	// errors.Is rather than != so the check keeps passing if Submit ever wraps
+	// the sentinel with extra context.
+	got := <-q.Submit(1, func() error { return nil })
+	if !errors.Is(got, ErrQueueClosed) {
+		t.Errorf("Submit after Close = %v, want ErrQueueClosed", got)
+	}
+}