diff --git a/CHANGELOG.md b/CHANGELOG.md index c9933e4..3a2d184 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,26 @@ ## Changelog +### v0.38.0 — slice 8B.2: quiesce downtime optimization (resume at `snapshotted`) (2026-06-10) + +The controller half of slice 8B.2. Pairs with `felhom-agent` v0.13.0. The quiesce loop now resumes +the app at the **`snapshotted`** phase (storage snapshot taken) instead of `done` — app downtime +drops from *whole-backup* to *until-snapshot* (seconds), with no loss of app-consistency (the +snapshot froze the app-stopped state). + +#### Changed (`internal/quiesce`) +- The status-poll loop **resumes (`StartStack` + clears the marker) at `snapshotted`**, then **keeps + polling to `done`/`failed`** — so a new backup isn't started until this one truly finishes, and a + post-snapshot failure is observed (the backup isn't "successful" until `done`; resuming early does + not mark it done). +- **Fallback preserved:** if `snapshotted` never arrives (stop/downgraded mode), it resumes at `done` + exactly as 8B. **Crash-safety unchanged:** marker written before stop; guaranteed unquiesce; + startup `Recover()`. A backup that fails *after* `snapshotted` is harmless — the app is already up. + +#### Tests +- resume at `snapshotted` (RESUME event before `done`, marker cleared, then tracked to `done`); + stop-mode fallback (resume at `done`, no `snapshotted`); fail-after-`snapshotted` (one resume, app + stays up); the 8B crash-safety tests stay green. + ### v0.37.0 — slice 8C: controller de-privileging + disk management via the agent (2026-06-10) The in-guest controller half of slice 8C (closes slice 8). The disk-execution subsystem moves to diff --git a/controller/internal/quiesce/quiesce.go b/controller/internal/quiesce/quiesce.go index a212d1d..c792f2b 100644 --- a/controller/internal/quiesce/quiesce.go +++ b/controller/internal/quiesce/quiesce.go @@ -38,8 +38,9 @@ type Stacks interface { // Backup status phases (mirror the agent's vocabulary). 
const ( - phaseDone = "done" - phaseFailed = "failed" + phaseSnapshotted = "snapshotted" // 8B.2: storage snapshot taken → app may resume early + phaseDone = "done" + phaseFailed = "failed" ) // Marker is the persisted quiesce state — the crash-safety + single-flight record. It is written @@ -200,11 +201,25 @@ func (l *Loop) runOnce(ctx context.Context) error { return fmt.Errorf("poll backup status: %w", err) } switch phase { + case phaseSnapshotted: + // 8B.2: the storage snapshot is taken — the app-stopped state is captured, so the app + // may resume NOW (downtime = until-snapshot, not until-backup-done) with no loss of + // app-consistency. unquiesce is idempotent (fires once); we then KEEP polling to + // done/failed so a new backup isn't started until this one truly finishes (and so a + // post-snapshot failure is observed). The marker is cleared on resume — a crash in this + // tail leaves the app already up, nothing to recover. + if !unquiesced { + l.logger.Printf("[INFO] [quiesce] backup job %s snapshotted — resuming app early (8B.2)", jobID) + unquiesce("snapshotted (early resume)") + } case phaseDone: + // Fallback (stop/downgraded mode never emits snapshotted): resume at done, exactly 8B. l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID) unquiesce("backup done") return nil case phaseFailed: + // If we already resumed at snapshotted, the app is up — just note the backup failed + // (recorded for the agent's due window when it stores the failed result). 
l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID) unquiesce("backup failed") return nil diff --git a/controller/internal/quiesce/quiesce_8b2_test.go b/controller/internal/quiesce/quiesce_8b2_test.go new file mode 100644 index 0000000..7c68bda --- /dev/null +++ b/controller/internal/quiesce/quiesce_8b2_test.go @@ -0,0 +1,135 @@ +package quiesce + +import ( + "context" + "io" + "log" + "path/filepath" + "sync" + "testing" + "time" +) + +// eventStacks records an ordered event log (shared with eventBackend) so a test can assert that +// StartStack (resume) happened at the `snapshotted` poll, before `done`. +type eventStacks struct { + mu *sync.Mutex + events *[]string + running []string +} + +func (s *eventStacks) RunningAppStacks() []string { return append([]string(nil), s.running...) } +func (s *eventStacks) StopStack(string) error { return nil } +func (s *eventStacks) StartStack(string) error { + s.mu.Lock() + *s.events = append(*s.events, "RESUME") + s.mu.Unlock() + return nil +} + +type eventBackend struct { + mu *sync.Mutex + events *[]string + phases []string + i int +} + +func (b *eventBackend) Due(context.Context) (bool, error) { return true, nil } +func (b *eventBackend) StartBackup(context.Context) (string, error) { return "job-1", nil } +func (b *eventBackend) BackupStatus(context.Context) (string, error) { + ph := b.phases[len(b.phases)-1] + if b.i < len(b.phases) { + ph = b.phases[b.i] + b.i++ + } + b.mu.Lock() + *b.events = append(*b.events, ph) + b.mu.Unlock() + return ph, nil +} + +func eventLoop(t *testing.T, phases []string) (*Loop, *[]string, *eventStacks) { + var mu sync.Mutex + events := &[]string{} + st := &eventStacks{mu: &mu, events: events, running: []string{"pgapp"}} + be := &eventBackend{mu: &mu, events: events, phases: phases} + l := New(Options{ + Backend: be, Stacks: st, + MarkerPath: filepath.Join(t.TempDir(), "q.json"), + Poll: time.Hour, StatusPoll: time.Millisecond, MaxQuiesce: 5 * time.Second, + Logger: 
log.New(io.Discard, "", 0), + }) + return l, events, st +} + +// 8B.2: resume at `snapshotted` (RESUME before `done`), then keep tracking to `done`; marker cleared. +func TestRunOnce_ResumesAtSnapshotted(t *testing.T) { + l, events, _ := eventLoop(t, []string{"running", "snapshotted", "running", "done"}) + if err := l.runOnce(context.Background()); err != nil { + t.Fatal(err) + } + // RESUME must appear and must come BEFORE the first "done". + resumeIdx, doneIdx := -1, -1 + for i, e := range *events { + if e == "RESUME" && resumeIdx < 0 { + resumeIdx = i + } + if e == "done" && doneIdx < 0 { + doneIdx = i + } + } + if resumeIdx < 0 { + t.Fatalf("never resumed: %v", *events) + } + if doneIdx < 0 { + t.Fatalf("never tracked to done (must keep polling after early resume): %v", *events) + } + if resumeIdx > doneIdx { + t.Fatalf("resumed at/after done, not at snapshotted: %v", *events) + } + // the event right before RESUME should be a snapshotted poll + if (*events)[resumeIdx-1] != "snapshotted" { + t.Fatalf("resume not triggered by snapshotted: %v", *events) + } + if _, ok := l.readMarker(); ok { + t.Fatal("marker not cleared after resume") + } +} + +// Fallback: stop mode (never snapshotted) → resume at `done` (8B behavior). +func TestRunOnce_FallbackResumeAtDone(t *testing.T) { + l, events, _ := eventLoop(t, []string{"running", "running", "done"}) + if err := l.runOnce(context.Background()); err != nil { + t.Fatal(err) + } + // RESUME comes only at/after done (no snapshotted in the stream). + for _, e := range *events { + if e == "snapshotted" { + t.Fatal("snapshotted appeared in stop-mode stream") + } + } + last := (*events)[len(*events)-1] + if last != "RESUME" && last != "done" { + t.Fatalf("expected resume at done: %v", *events) + } +} + +// A backup that FAILS after snapshotted: the app is already up (resumed once), the cycle ends. 
+func TestRunOnce_FailAfterSnapshotted_AppStaysUp(t *testing.T) {
+	l, events, _ := eventLoop(t, []string{"snapshotted", "failed"})
+	if err := l.runOnce(context.Background()); err != nil {
+		t.Fatal(err)
+	}
+	resumes, early := 0, false // early: the RESUME sits right after the snapshotted poll
+	for i, e := range *events {
+		if e == "RESUME" {
+			resumes, early = resumes+1, i > 0 && (*events)[i-1] == "snapshotted"
+		}
+	}
+	if resumes != 1 || !early { // a bare count would also pass for a resume at "failed"
+		t.Fatalf("expected exactly one resume (at snapshotted), got %d: %v", resumes, *events)
+	}
+	if _, ok := l.readMarker(); ok {
+		t.Fatal("marker not cleared")
+	}
+}