slice 8B.2 (controller): resume app at snapshotted, keep tracking to done (v0.38.0)
Quiesce loop resumes (StartStack + clear marker) at the snapshotted phase instead of done -> downtime whole-backup -> until-snapshot, no consistency loss. Keeps polling to done/failed (no overlapping backup; post-snapshot failure observed). Stop-mode fallback to done + crash-safety preserved. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,26 @@
|
|||||||
## Changelog
|
## Changelog
|
||||||
|
|
||||||
|
### v0.38.0 — slice 8B.2: quiesce downtime optimization (resume at `snapshotted`) (2026-06-10)
|
||||||
|
|
||||||
|
The controller half of slice 8B.2. Pairs with `felhom-agent` v0.13.0. The quiesce loop now resumes
|
||||||
|
the app at the **`snapshotted`** phase (storage snapshot taken) instead of `done` — app downtime
|
||||||
|
drops from *whole-backup* to *until-snapshot* (seconds), with no loss of app-consistency (the
|
||||||
|
snapshot froze the app-stopped state).
|
||||||
|
|
||||||
|
#### Changed (`internal/quiesce`)
|
||||||
|
- The status-poll loop **resumes (`StartStack` + clears the marker) at `snapshotted`**, then **keeps
|
||||||
|
polling to `done`/`failed`** — so a new backup isn't started until this one truly finishes, and a
|
||||||
|
post-snapshot failure is observed (the backup isn't "successful" until `done`; resuming early does
|
||||||
|
not mark it done).
|
||||||
|
- **Fallback preserved:** if `snapshotted` never arrives (stop/downgraded mode), it resumes at `done`
|
||||||
|
exactly as 8B. **Crash-safety unchanged:** marker written before stop; guaranteed unquiesce;
|
||||||
|
startup `Recover()`. A backup that fails *after* `snapshotted` is harmless — the app is already up.
|
||||||
|
|
||||||
|
#### Tests
|
||||||
|
- resume at `snapshotted` (RESUME event before `done`, marker cleared, then tracked to `done`);
|
||||||
|
stop-mode fallback (resume at `done`, no `snapshotted`); fail-after-`snapshotted` (one resume, app
|
||||||
|
stays up); the 8B crash-safety tests stay green.
|
||||||
|
|
||||||
### v0.37.0 — slice 8C: controller de-privileging + disk management via the agent (2026-06-10)
|
### v0.37.0 — slice 8C: controller de-privileging + disk management via the agent (2026-06-10)
|
||||||
|
|
||||||
The in-guest controller half of slice 8C (closes slice 8). The disk-execution subsystem moves to
|
The in-guest controller half of slice 8C (closes slice 8). The disk-execution subsystem moves to
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ type Stacks interface {
|
|||||||
|
|
||||||
// Backup status phases (mirror the agent's vocabulary).
|
// Backup status phases (mirror the agent's vocabulary).
|
||||||
const (
|
const (
|
||||||
|
phaseSnapshotted = "snapshotted" // 8B.2: storage snapshot taken → app may resume early
|
||||||
phaseDone = "done"
|
phaseDone = "done"
|
||||||
phaseFailed = "failed"
|
phaseFailed = "failed"
|
||||||
)
|
)
|
||||||
@@ -200,11 +201,25 @@ func (l *Loop) runOnce(ctx context.Context) error {
|
|||||||
return fmt.Errorf("poll backup status: %w", err)
|
return fmt.Errorf("poll backup status: %w", err)
|
||||||
}
|
}
|
||||||
switch phase {
|
switch phase {
|
||||||
|
case phaseSnapshotted:
|
||||||
|
// 8B.2: the storage snapshot is taken — the app-stopped state is captured, so the app
|
||||||
|
// may resume NOW (downtime = until-snapshot, not until-backup-done) with no loss of
|
||||||
|
// app-consistency. unquiesce is idempotent (fires once); we then KEEP polling to
|
||||||
|
// done/failed so a new backup isn't started until this one truly finishes (and so a
|
||||||
|
// post-snapshot failure is observed). The marker is cleared on resume — a crash in this
|
||||||
|
// tail leaves the app already up, nothing to recover.
|
||||||
|
if !unquiesced {
|
||||||
|
l.logger.Printf("[INFO] [quiesce] backup job %s snapshotted — resuming app early (8B.2)", jobID)
|
||||||
|
unquiesce("snapshotted (early resume)")
|
||||||
|
}
|
||||||
case phaseDone:
|
case phaseDone:
|
||||||
|
// Fallback (stop/downgraded mode never emits snapshotted): resume at done, exactly 8B.
|
||||||
l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID)
|
l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID)
|
||||||
unquiesce("backup done")
|
unquiesce("backup done")
|
||||||
return nil
|
return nil
|
||||||
case phaseFailed:
|
case phaseFailed:
|
||||||
|
// If we already resumed at snapshotted, the app is up — just note the backup failed
|
||||||
|
// (recorded for the agent's due window when it stores the failed result).
|
||||||
l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID)
|
l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID)
|
||||||
unquiesce("backup failed")
|
unquiesce("backup failed")
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -0,0 +1,135 @@
|
|||||||
|
package quiesce
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"path/filepath"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// eventStacks is a fake stack controller for the quiesce loop. Each StartStack call appends
// "RESUME" to an ordered event log (the same log eventBackend appends polled phases to), so a
// test can assert that the resume happened at the `snapshotted` poll, before `done`.
type eventStacks struct {
	mu      *sync.Mutex // held while appending to *events
	events  *[]string   // ordered event log
	running []string    // stacks reported by RunningAppStacks
}

// RunningAppStacks returns a fresh copy of the configured running stacks, so callers cannot
// mutate the fake's own slice.
func (s *eventStacks) RunningAppStacks() []string {
	var stacks []string
	return append(stacks, s.running...)
}

// StopStack is a no-op that always succeeds; stops are not recorded in the event log.
func (s *eventStacks) StopStack(string) error {
	return nil
}

// StartStack records a "RESUME" event (the stack name is ignored) and always succeeds.
func (s *eventStacks) StartStack(string) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	*s.events = append(*s.events, "RESUME")
	return nil
}
|
||||||
|
|
||||||
|
// eventBackend is a scripted backup backend for the quiesce loop tests. It replays phases one
// per BackupStatus call and appends each returned phase to the ordered event log.
type eventBackend struct {
	mu     *sync.Mutex // held while advancing i and appending to *events
	events *[]string   // ordered event log
	phases []string    // scripted status sequence; the last entry repeats once exhausted
	i      int         // index of the next scripted phase to return
}

// Due always reports that a backup is due.
func (b *eventBackend) Due(context.Context) (bool, error) { return true, nil }

// StartBackup always succeeds and returns the fixed job ID "job-1".
func (b *eventBackend) StartBackup(context.Context) (string, error) { return "job-1", nil }

// BackupStatus returns the next scripted phase and records it in the event log. Once the
// script is exhausted the final phase is returned (and recorded) on every further call.
// An empty script yields io.EOF instead of panicking on an out-of-range index; nothing is
// recorded in that case.
func (b *eventBackend) BackupStatus(context.Context) (string, error) {
	if len(b.phases) == 0 {
		// Nothing to replay: surface an error so the caller fails cleanly.
		return "", io.EOF
	}

	// The cursor and the event log are updated under the same lock so the recorded order
	// always matches the order in which phases were handed out.
	b.mu.Lock()
	defer b.mu.Unlock()

	ph := b.phases[len(b.phases)-1]
	if b.i < len(b.phases) {
		ph = b.phases[b.i]
		b.i++
	}
	*b.events = append(*b.events, ph)
	return ph, nil
}
|
||||||
|
|
||||||
|
func eventLoop(t *testing.T, phases []string) (*Loop, *[]string, *eventStacks) {
|
||||||
|
var mu sync.Mutex
|
||||||
|
events := &[]string{}
|
||||||
|
st := &eventStacks{mu: &mu, events: events, running: []string{"pgapp"}}
|
||||||
|
be := &eventBackend{mu: &mu, events: events, phases: phases}
|
||||||
|
l := New(Options{
|
||||||
|
Backend: be, Stacks: st,
|
||||||
|
MarkerPath: filepath.Join(t.TempDir(), "q.json"),
|
||||||
|
Poll: time.Hour, StatusPoll: time.Millisecond, MaxQuiesce: 5 * time.Second,
|
||||||
|
Logger: log.New(io.Discard, "", 0),
|
||||||
|
})
|
||||||
|
return l, events, st
|
||||||
|
}
|
||||||
|
|
||||||
|
// 8B.2: resume at `snapshotted` (RESUME before `done`), then keep tracking to `done`; marker cleared.
|
||||||
|
func TestRunOnce_ResumesAtSnapshotted(t *testing.T) {
|
||||||
|
l, events, _ := eventLoop(t, []string{"running", "snapshotted", "running", "done"})
|
||||||
|
if err := l.runOnce(context.Background()); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// RESUME must appear and must come BEFORE the first "done".
|
||||||
|
resumeIdx, doneIdx := -1, -1
|
||||||
|
for i, e := range *events {
|
||||||
|
if e == "RESUME" && resumeIdx < 0 {
|
||||||
|
resumeIdx = i
|
||||||
|
}
|
||||||
|
if e == "done" && doneIdx < 0 {
|
||||||
|
doneIdx = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if resumeIdx < 0 {
|
||||||
|
t.Fatalf("never resumed: %v", *events)
|
||||||
|
}
|
||||||
|
if doneIdx < 0 {
|
||||||
|
t.Fatalf("never tracked to done (must keep polling after early resume): %v", *events)
|
||||||
|
}
|
||||||
|
if resumeIdx > doneIdx {
|
||||||
|
t.Fatalf("resumed at/after done, not at snapshotted: %v", *events)
|
||||||
|
}
|
||||||
|
// the event right before RESUME should be a snapshotted poll
|
||||||
|
if (*events)[resumeIdx-1] != "snapshotted" {
|
||||||
|
t.Fatalf("resume not triggered by snapshotted: %v", *events)
|
||||||
|
}
|
||||||
|
if _, ok := l.readMarker(); ok {
|
||||||
|
t.Fatal("marker not cleared after resume")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: stop mode (never snapshotted) → resume at `done` (8B behavior).
|
||||||
|
func TestRunOnce_FallbackResumeAtDone(t *testing.T) {
|
||||||
|
l, events, _ := eventLoop(t, []string{"running", "running", "done"})
|
||||||
|
if err := l.runOnce(context.Background()); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// RESUME comes only at/after done (no snapshotted in the stream).
|
||||||
|
for _, e := range *events {
|
||||||
|
if e == "snapshotted" {
|
||||||
|
t.Fatal("snapshotted appeared in stop-mode stream")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
last := (*events)[len(*events)-1]
|
||||||
|
if last != "RESUME" && last != "done" {
|
||||||
|
t.Fatalf("expected resume at done: %v", *events)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// A backup that FAILS after snapshotted: the app is already up (resumed once), the cycle ends.
|
||||||
|
func TestRunOnce_FailAfterSnapshotted_AppStaysUp(t *testing.T) {
|
||||||
|
l, events, _ := eventLoop(t, []string{"snapshotted", "failed"})
|
||||||
|
if err := l.runOnce(context.Background()); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
resumes := 0
|
||||||
|
for _, e := range *events {
|
||||||
|
if e == "RESUME" {
|
||||||
|
resumes++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if resumes != 1 {
|
||||||
|
t.Fatalf("expected exactly one resume (at snapshotted), got %d: %v", resumes, *events)
|
||||||
|
}
|
||||||
|
if _, ok := l.readMarker(); ok {
|
||||||
|
t.Fatal("marker not cleared")
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user