package quiesce import ( "context" "io" "log" "os" "path/filepath" "sync" "testing" "time" ) // fakeStacks records stop/start calls in order. type fakeStacks struct { mu sync.Mutex running []string stopped []string started []string stopErr map[string]error } func (f *fakeStacks) RunningAppStacks() []string { f.mu.Lock() defer f.mu.Unlock() return append([]string(nil), f.running...) } func (f *fakeStacks) StopStack(name string) error { f.mu.Lock() defer f.mu.Unlock() f.stopped = append(f.stopped, name) if f.stopErr != nil { return f.stopErr[name] } return nil } func (f *fakeStacks) StartStack(name string) error { f.mu.Lock() defer f.mu.Unlock() f.started = append(f.started, name) return nil } func (f *fakeStacks) startedNames() []string { f.mu.Lock() defer f.mu.Unlock() return append([]string(nil), f.started...) } func (f *fakeStacks) stoppedNames() []string { f.mu.Lock() defer f.mu.Unlock() return append([]string(nil), f.stopped...) } // fakeBackend drives the agent-side responses. type fakeBackend struct { due bool dueErr error startErr error jobID string phases []string // returned in sequence by BackupStatus; last value repeats statusErr error startCalls int statusCalls int mu sync.Mutex } func (b *fakeBackend) Due(context.Context) (bool, error) { return b.due, b.dueErr } func (b *fakeBackend) StartBackup(context.Context) (string, error) { b.mu.Lock() b.startCalls++ b.mu.Unlock() if b.startErr != nil { return "", b.startErr } if b.jobID == "" { b.jobID = "job-1" } return b.jobID, nil } func (b *fakeBackend) BackupStatus(context.Context) (string, error) { if b.statusErr != nil { return "", b.statusErr } b.mu.Lock() defer b.mu.Unlock() i := b.statusCalls b.statusCalls++ if i >= len(b.phases) { if len(b.phases) == 0 { return PhaseRunning, nil } return b.phases[len(b.phases)-1], nil } return b.phases[i], nil } const PhaseRunning = "running" // local alias for readability in tests func testLoop(t *testing.T, be Backend, st Stacks) *Loop { t.Helper() l := New(Options{ Backend: be, Stacks: st, MarkerPath: filepath.Join(t.TempDir(), "quiesce-state.json"), Poll: time.Hour, StatusPoll: time.Millisecond, MaxQuiesce: 5 * time.Second, Logger: log.New(io.Discard, "", 0), }) return l } // Happy path: due → stop running stacks → start backup → poll to done → restart exactly those → clear marker. func TestRunOnce_HappyPath(t *testing.T) { be := &fakeBackend{due: true, phases: []string{"running", "running", "done"}} st := &fakeStacks{running: []string{"nextcloud", "vaultwarden"}} l := testLoop(t, be, st) if err := l.runOnce(context.Background()); err != nil { t.Fatalf("runOnce: %v", err) } if got := st.stoppedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" { t.Fatalf("stopped wrong/order: %v", got) } if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" { t.Fatalf("started wrong/order: %v", got) } if be.startCalls != 1 { t.Fatalf("expected 1 backup, got %d", be.startCalls) } if _, ok := l.readMarker(); ok { t.Fatal("marker not cleared after a successful cycle") } } // Not due → nothing happens. func TestRunOnce_NotDue(t *testing.T) { be := &fakeBackend{due: false} st := &fakeStacks{running: []string{"a"}} l := testLoop(t, be, st) if err := l.runOnce(context.Background()); err != nil { t.Fatal(err) } if len(st.stoppedNames()) != 0 || be.startCalls != 0 { t.Fatal("acted while not due") } } // Backup START fails → stacks STILL restarted (guaranteed unquiesce). func TestRunOnce_BackupStartFails_StillRestarts(t *testing.T) { be := &fakeBackend{due: true, startErr: errString("boom")} st := &fakeStacks{running: []string{"a", "b"}} l := testLoop(t, be, st) _ = l.runOnce(context.Background()) if got := st.startedNames(); len(got) != 2 { t.Fatalf("stacks not restarted after a backup-start failure: %v", got) } if _, ok := l.readMarker(); ok { t.Fatal("marker not cleared after a failed backup") } } // Backup reports failed → stacks restarted. func TestRunOnce_BackupFailedPhase_Restarts(t *testing.T) { be := &fakeBackend{due: true, phases: []string{"running", "failed"}} st := &fakeStacks{running: []string{"a"}} l := testLoop(t, be, st) _ = l.runOnce(context.Background()) if got := st.startedNames(); len(got) != 1 || got[0] != "a" { t.Fatalf("not restarted after failed phase: %v", got) } } // Max-quiesce guard: status never reaches done → stacks restarted at the bound. func TestRunOnce_MaxQuiesceGuard(t *testing.T) { be := &fakeBackend{due: true, phases: []string{"running"}} // never done st := &fakeStacks{running: []string{"a", "b"}} l := testLoop(t, be, st) // shrink the bound + use a controllable clock so the guard fires fast base := time.Now() steps := 0 l.now = func() time.Time { steps++ return base.Add(time.Duration(steps) * time.Minute) // each call advances 1m } l.maxQuiesce = 2 * time.Minute done := make(chan error, 1) go func() { done <- l.runOnce(context.Background()) }() select { case err := <-done: if err != nil { t.Fatalf("runOnce returned error: %v", err) } case <-time.After(2 * time.Second): t.Fatal("runOnce did not return — max-quiesce guard did not fire") } if got := st.startedNames(); len(got) != 2 { t.Fatalf("stacks not restarted at the max-quiesce bound: %v", got) } if _, ok := l.readMarker(); ok { t.Fatal("marker not cleared after the guard fired") } } // Crash recovery: a marker present at startup → recorded stacks restarted, marker cleared. func TestRecover_RestartsFromMarker(t *testing.T) { st := &fakeStacks{} l := testLoop(t, &fakeBackend{}, st) // simulate a crash mid-quiesce: an active marker with stopped stacks if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"nextcloud", "immich"}, JobID: "job-x"}); err != nil { t.Fatal(err) } l.Recover() if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "immich" { t.Fatalf("recovery did not restart the recorded stacks: %v", got) } if _, ok := l.readMarker(); ok { t.Fatal("recovery did not clear the marker") } } // Recover with no marker is a no-op. func TestRecover_NoMarker(t *testing.T) { st := &fakeStacks{} l := testLoop(t, &fakeBackend{}, st) l.Recover() if len(st.startedNames()) != 0 { t.Fatal("recovery restarted stacks with no marker present") } } // Single-flight: a cycle that begins with an active marker present is a no-op (no second backup). func TestRunOnce_SingleFlight(t *testing.T) { be := &fakeBackend{due: true, phases: []string{"done"}} st := &fakeStacks{running: []string{"a"}} l := testLoop(t, be, st) if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"a"}}); err != nil { t.Fatal(err) } if err := l.runOnce(context.Background()); err != nil { t.Fatal(err) } if be.startCalls != 0 { t.Fatal("started a backup while a marker was already active") } } // Only the stacks we stopped are restarted: an already-stopped stack is not in RunningAppStacks, // so unquiesce never starts it. func TestRunOnce_OnlyRestartsWhatWeStopped(t *testing.T) { // "db" was already stopped before quiesce → not in running → not restarted. be := &fakeBackend{due: true, phases: []string{"done"}} st := &fakeStacks{running: []string{"web"}} // only web is up l := testLoop(t, be, st) if err := l.runOnce(context.Background()); err != nil { t.Fatal(err) } for _, s := range st.startedNames() { if s == "db" { t.Fatal("restarted a stack that was already stopped before quiesce") } } if got := st.startedNames(); len(got) != 1 || got[0] != "web" { t.Fatalf("expected only web restarted, got %v", got) } } // Marker is written BEFORE stacks are stopped (crash-safety ordering): if stop is observed, the // marker must already exist on disk. func TestRunOnce_MarkerWrittenBeforeStop(t *testing.T) { st := &fakeStacks{running: []string{"a"}} l := testLoop(t, &fakeBackend{due: true, phases: []string{"done"}}, st) // Wrap StopStack via a stacks decorator that checks the marker file exists at stop time. markerSeen := false dec := &stopObserver{inner: st, onStop: func() { if _, err := os.Stat(l.markerPath); err == nil { markerSeen = true } }} l.stacks = dec if err := l.runOnce(context.Background()); err != nil { t.Fatal(err) } if !markerSeen { t.Fatal("marker was not on disk when the first stack was stopped (crash-safety ordering violated)") } } type stopObserver struct { inner Stacks onStop func() } func (s *stopObserver) RunningAppStacks() []string { return s.inner.RunningAppStacks() } func (s *stopObserver) StopStack(n string) error { s.onStop(); return s.inner.StopStack(n) } func (s *stopObserver) StartStack(n string) error { return s.inner.StartStack(n) } type errString string func (e errString) Error() string { return string(e) }