68fc153d9c
internal/quiesce: poll /backup/due -> quiesce (stop app stacks) -> POST /backup -> poll /backup/status -> unquiesce (restart exactly those). Crash-safety: persisted marker before stopping, guaranteed unquiesce (defer), max-quiesce guard, startup Recover, single-flight. agentapi BackupDue/StartBackup/BackupStatus; stacks.RunningAppStacks(); config QuiesceConfig; main wiring. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
303 lines
8.8 KiB
Go
303 lines
8.8 KiB
Go
package quiesce
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"log"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"testing"
|
|
"time"
|
|
)
|
|
|
|
// fakeStacks is an in-memory stand-in for the stack manager. It reports a fixed list of running
// stacks and logs every StopStack/StartStack call in the order it was received.
type fakeStacks struct {
	mu      sync.Mutex       // held by every method while it touches the fields below
	running []string         // handed out (as a copy) by RunningAppStacks
	stopped []string         // names passed to StopStack, in call order
	started []string         // names passed to StartStack, in call order
	stopErr map[string]error // optional per-stack error returned by StopStack
}

// RunningAppStacks returns a copy of the configured running stacks.
func (f *fakeStacks) RunningAppStacks() []string {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.cloneLocked(f.running)
}

// StopStack records name and returns the error configured for it in stopErr, or nil when there
// is none.
func (f *fakeStacks) StopStack(name string) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.stopped = append(f.stopped, name)
	// Indexing a nil map yields the zero value (a nil error), so no separate nil check is needed.
	return f.stopErr[name]
}

// StartStack records name and always succeeds.
func (f *fakeStacks) StartStack(name string) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.started = append(f.started, name)
	return nil
}

// startedNames returns a copy of the names passed to StartStack so far, in call order.
func (f *fakeStacks) startedNames() []string {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.cloneLocked(f.started)
}

// stoppedNames returns a copy of the names passed to StopStack so far, in call order.
func (f *fakeStacks) stoppedNames() []string {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.cloneLocked(f.stopped)
}

// cloneLocked returns an independent copy of names (nil when names is empty), so callers cannot
// alias the fake's internal slices. The caller must hold f.mu.
func (*fakeStacks) cloneLocked(names []string) []string {
	return append([]string(nil), names...)
}
|
|
|
|
// PhaseRunning is the phase fakeBackend.BackupStatus reports when no phases are scripted.
// It is a local alias kept for readability in tests.
const PhaseRunning = "running"

// fakeBackend drives the agent-side responses.
//
// The first group of fields is scripted by the test and only read by the methods. The fields
// below mu are mutated by the methods and are guarded by it.
type fakeBackend struct {
	due       bool
	dueErr    error
	startErr  error
	phases    []string // returned in sequence by BackupStatus; last value repeats
	statusErr error

	mu          sync.Mutex // guards jobID, startCalls and statusCalls
	jobID       string
	startCalls  int
	statusCalls int
}

// Due returns the scripted due flag and error; the context is ignored.
func (b *fakeBackend) Due(context.Context) (bool, error) { return b.due, b.dueErr }

// StartBackup counts the call, then returns startErr if one is scripted. Otherwise it returns
// the job ID, defaulting it to "job-1" when the test did not set one.
func (b *fakeBackend) StartBackup(context.Context) (string, error) {
	// Hold the lock for the whole call: jobID is lazily defaulted below, so it is mutable
	// state just like the counters and must not be written after the lock is released.
	b.mu.Lock()
	defer b.mu.Unlock()

	b.startCalls++
	if b.startErr != nil {
		return "", b.startErr
	}
	if b.jobID == "" {
		b.jobID = "job-1"
	}
	return b.jobID, nil
}

// BackupStatus returns statusErr if one is scripted (such calls are not counted). Otherwise it
// returns the next entry of phases, repeating the last entry once the script is exhausted, or
// PhaseRunning when no phases were scripted at all.
func (b *fakeBackend) BackupStatus(context.Context) (string, error) {
	if b.statusErr != nil {
		return "", b.statusErr
	}

	b.mu.Lock()
	defer b.mu.Unlock()

	i := b.statusCalls
	b.statusCalls++
	switch {
	case i < len(b.phases):
		return b.phases[i], nil
	case len(b.phases) == 0:
		return PhaseRunning, nil
	default:
		return b.phases[len(b.phases)-1], nil
	}
}
|
|
|
|
func testLoop(t *testing.T, be Backend, st Stacks) *Loop {
|
|
t.Helper()
|
|
l := New(Options{
|
|
Backend: be,
|
|
Stacks: st,
|
|
MarkerPath: filepath.Join(t.TempDir(), "quiesce-state.json"),
|
|
Poll: time.Hour,
|
|
StatusPoll: time.Millisecond,
|
|
MaxQuiesce: 5 * time.Second,
|
|
Logger: log.New(io.Discard, "", 0),
|
|
})
|
|
return l
|
|
}
|
|
|
|
// Happy path: due → stop running stacks → start backup → poll to done → restart exactly those → clear marker.
|
|
func TestRunOnce_HappyPath(t *testing.T) {
|
|
be := &fakeBackend{due: true, phases: []string{"running", "running", "done"}}
|
|
st := &fakeStacks{running: []string{"nextcloud", "vaultwarden"}}
|
|
l := testLoop(t, be, st)
|
|
|
|
if err := l.runOnce(context.Background()); err != nil {
|
|
t.Fatalf("runOnce: %v", err)
|
|
}
|
|
if got := st.stoppedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" {
|
|
t.Fatalf("stopped wrong/order: %v", got)
|
|
}
|
|
if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" {
|
|
t.Fatalf("started wrong/order: %v", got)
|
|
}
|
|
if be.startCalls != 1 {
|
|
t.Fatalf("expected 1 backup, got %d", be.startCalls)
|
|
}
|
|
if _, ok := l.readMarker(); ok {
|
|
t.Fatal("marker not cleared after a successful cycle")
|
|
}
|
|
}
|
|
|
|
// Not due → nothing happens.
|
|
func TestRunOnce_NotDue(t *testing.T) {
|
|
be := &fakeBackend{due: false}
|
|
st := &fakeStacks{running: []string{"a"}}
|
|
l := testLoop(t, be, st)
|
|
if err := l.runOnce(context.Background()); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if len(st.stoppedNames()) != 0 || be.startCalls != 0 {
|
|
t.Fatal("acted while not due")
|
|
}
|
|
}
|
|
|
|
// Backup START fails → stacks STILL restarted (guaranteed unquiesce).
|
|
func TestRunOnce_BackupStartFails_StillRestarts(t *testing.T) {
|
|
be := &fakeBackend{due: true, startErr: errString("boom")}
|
|
st := &fakeStacks{running: []string{"a", "b"}}
|
|
l := testLoop(t, be, st)
|
|
|
|
_ = l.runOnce(context.Background())
|
|
if got := st.startedNames(); len(got) != 2 {
|
|
t.Fatalf("stacks not restarted after a backup-start failure: %v", got)
|
|
}
|
|
if _, ok := l.readMarker(); ok {
|
|
t.Fatal("marker not cleared after a failed backup")
|
|
}
|
|
}
|
|
|
|
// Backup reports failed → stacks restarted.
|
|
func TestRunOnce_BackupFailedPhase_Restarts(t *testing.T) {
|
|
be := &fakeBackend{due: true, phases: []string{"running", "failed"}}
|
|
st := &fakeStacks{running: []string{"a"}}
|
|
l := testLoop(t, be, st)
|
|
_ = l.runOnce(context.Background())
|
|
if got := st.startedNames(); len(got) != 1 || got[0] != "a" {
|
|
t.Fatalf("not restarted after failed phase: %v", got)
|
|
}
|
|
}
|
|
|
|
// Max-quiesce guard: status never reaches done → stacks restarted at the bound.
|
|
func TestRunOnce_MaxQuiesceGuard(t *testing.T) {
|
|
be := &fakeBackend{due: true, phases: []string{"running"}} // never done
|
|
st := &fakeStacks{running: []string{"a", "b"}}
|
|
l := testLoop(t, be, st)
|
|
// shrink the bound + use a controllable clock so the guard fires fast
|
|
base := time.Now()
|
|
steps := 0
|
|
l.now = func() time.Time {
|
|
steps++
|
|
return base.Add(time.Duration(steps) * time.Minute) // each call advances 1m
|
|
}
|
|
l.maxQuiesce = 2 * time.Minute
|
|
|
|
done := make(chan error, 1)
|
|
go func() { done <- l.runOnce(context.Background()) }()
|
|
select {
|
|
case err := <-done:
|
|
if err != nil {
|
|
t.Fatalf("runOnce returned error: %v", err)
|
|
}
|
|
case <-time.After(2 * time.Second):
|
|
t.Fatal("runOnce did not return — max-quiesce guard did not fire")
|
|
}
|
|
if got := st.startedNames(); len(got) != 2 {
|
|
t.Fatalf("stacks not restarted at the max-quiesce bound: %v", got)
|
|
}
|
|
if _, ok := l.readMarker(); ok {
|
|
t.Fatal("marker not cleared after the guard fired")
|
|
}
|
|
}
|
|
|
|
// Crash recovery: a marker present at startup → recorded stacks restarted, marker cleared.
|
|
func TestRecover_RestartsFromMarker(t *testing.T) {
|
|
st := &fakeStacks{}
|
|
l := testLoop(t, &fakeBackend{}, st)
|
|
// simulate a crash mid-quiesce: an active marker with stopped stacks
|
|
if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"nextcloud", "immich"}, JobID: "job-x"}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
l.Recover()
|
|
if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "immich" {
|
|
t.Fatalf("recovery did not restart the recorded stacks: %v", got)
|
|
}
|
|
if _, ok := l.readMarker(); ok {
|
|
t.Fatal("recovery did not clear the marker")
|
|
}
|
|
}
|
|
|
|
// Recover with no marker is a no-op.
|
|
func TestRecover_NoMarker(t *testing.T) {
|
|
st := &fakeStacks{}
|
|
l := testLoop(t, &fakeBackend{}, st)
|
|
l.Recover()
|
|
if len(st.startedNames()) != 0 {
|
|
t.Fatal("recovery restarted stacks with no marker present")
|
|
}
|
|
}
|
|
|
|
// Single-flight: a cycle that begins with an active marker present is a no-op (no second backup).
|
|
func TestRunOnce_SingleFlight(t *testing.T) {
|
|
be := &fakeBackend{due: true, phases: []string{"done"}}
|
|
st := &fakeStacks{running: []string{"a"}}
|
|
l := testLoop(t, be, st)
|
|
if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"a"}}); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := l.runOnce(context.Background()); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if be.startCalls != 0 {
|
|
t.Fatal("started a backup while a marker was already active")
|
|
}
|
|
}
|
|
|
|
// Only the stacks we stopped are restarted: an already-stopped stack is not in RunningAppStacks,
|
|
// so unquiesce never starts it.
|
|
func TestRunOnce_OnlyRestartsWhatWeStopped(t *testing.T) {
|
|
// "db" was already stopped before quiesce → not in running → not restarted.
|
|
be := &fakeBackend{due: true, phases: []string{"done"}}
|
|
st := &fakeStacks{running: []string{"web"}} // only web is up
|
|
l := testLoop(t, be, st)
|
|
if err := l.runOnce(context.Background()); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, s := range st.startedNames() {
|
|
if s == "db" {
|
|
t.Fatal("restarted a stack that was already stopped before quiesce")
|
|
}
|
|
}
|
|
if got := st.startedNames(); len(got) != 1 || got[0] != "web" {
|
|
t.Fatalf("expected only web restarted, got %v", got)
|
|
}
|
|
}
|
|
|
|
// Marker is written BEFORE stacks are stopped (crash-safety ordering): if stop is observed, the
|
|
// marker must already exist on disk.
|
|
func TestRunOnce_MarkerWrittenBeforeStop(t *testing.T) {
|
|
st := &fakeStacks{running: []string{"a"}}
|
|
l := testLoop(t, &fakeBackend{due: true, phases: []string{"done"}}, st)
|
|
// Wrap StopStack via a stacks decorator that checks the marker file exists at stop time.
|
|
markerSeen := false
|
|
dec := &stopObserver{inner: st, onStop: func() {
|
|
if _, err := os.Stat(l.markerPath); err == nil {
|
|
markerSeen = true
|
|
}
|
|
}}
|
|
l.stacks = dec
|
|
if err := l.runOnce(context.Background()); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if !markerSeen {
|
|
t.Fatal("marker was not on disk when the first stack was stopped (crash-safety ordering violated)")
|
|
}
|
|
}
|
|
|
|
type stopObserver struct {
|
|
inner Stacks
|
|
onStop func()
|
|
}
|
|
|
|
func (s *stopObserver) RunningAppStacks() []string { return s.inner.RunningAppStacks() }
|
|
func (s *stopObserver) StopStack(n string) error { s.onStop(); return s.inner.StopStack(n) }
|
|
func (s *stopObserver) StartStack(n string) error { return s.inner.StartStack(n) }
|
|
|
|
// errString is a minimal string-backed error value for scripting failures in tests.
type errString string

// Error implements the error interface by returning the underlying string.
func (e errString) Error() string {
	return string(e)
}
|