// Package quiesce implements the slice-8B app-consistent backup loop (doc 03 §6/§8): the
// in-guest controller polls the host agent's GET /backup/due, and when due it QUIESCES (stops its
// app stacks) → POST /backup → polls GET /backup/status to completion → UNQUIESCES (restarts
// exactly the stacks it stopped). An agent-initiated vzdump is crash-consistent only (an LXC has
// no fsfreeze); stopping the stacks first makes the captured state clean-shutdown-consistent.
//
// The correctness centerpiece is crash-safety: a stranded-down app is worse than a crash-consistent
// backup. So: a persisted marker is written BEFORE stopping anything; unquiesce is guaranteed (it
// runs even when the backup errors or times out); a max-quiesce bound restarts the app no matter
// what; and on controller startup Recover() restarts any stacks left stopped by a mid-quiesce crash.
package quiesce

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"time"
)

// Backend is the agent local-API surface the loop needs (satisfied by an adapter over
// *agentapi.Client). Kept minimal (bool/string) so the loop is testable with plain fakes.
type Backend interface {
	// Due reports whether a backup should be taken now; the loop calls it once per poll tick
	// and does nothing further when it returns false.
	Due(ctx context.Context) (bool, error)
	// StartBackup asks the agent to begin a backup and returns the job id, which the loop
	// records in the marker and uses in its log lines.
	StartBackup(ctx context.Context) (jobID string, err error)
	// BackupStatus returns the current phase of the backup. The loop reacts to the phase
	// constants declared below and keeps polling on any other value.
	BackupStatus(ctx context.Context) (phase string, err error)
}

// Stacks is the stack-control surface (satisfied by *stacks.Manager). RunningAppStacks must return
// only deployed, non-protected, currently-up stacks (so unquiesce restarts exactly those).
type Stacks interface {
	// RunningAppStacks lists the stacks to quiesce; the loop snapshots this list once per
	// cycle and both stops and later restarts exactly these names.
	RunningAppStacks() []string
	// StopStack stops one stack by name. The loop logs a failure and continues with the rest.
	StopStack(name string) error
	// StartStack starts one stack by name. The loop may call it for a stack that is already
	// running (after a failed stop, or during crash recovery), so implementations are expected
	// to tolerate that.
	StartStack(name string) error
}

// Backup status phases (mirror the agent's vocabulary).
const (
	phaseSnapshotted = "snapshotted" // 8B.2: storage snapshot taken → app may resume early
	phaseDone        = "done"        // backup finished; the loop unquiesces (if it has not yet) and stops polling
	phaseFailed      = "failed"      // backup failed; handled like done, but logged as a warning
)

// Marker is the persisted quiesce state — the crash-safety + single-flight record.
It is written // (atomically, 0600) BEFORE any stack is stopped, so a controller crash mid-quiesce leaves a // durable "these stacks were stopped, restart them" note that Recover honors at next startup. type Marker struct { Active bool `json:"active"` StartedAt time.Time `json:"started_at"` StoppedStacks []string `json:"stopped_stacks"` JobID string `json:"job_id"` } // Options configures a Loop. type Options struct { Backend Backend Stacks Stacks MarkerPath string // persisted marker (e.g. /quiesce-state.json) Poll time.Duration // how often to check /backup/due StatusPoll time.Duration // how often to poll /backup/status while quiesced MaxQuiesce time.Duration // hard bound on app downtime (unquiesce no matter what) Logger *log.Logger } // Loop is the quiesce background loop. type Loop struct { backend Backend stacks Stacks markerPath string poll time.Duration statusPoll time.Duration maxQuiesce time.Duration logger *log.Logger now func() time.Time } // New builds a Loop with sane defaults for any unset duration. func New(o Options) *Loop { if o.Poll <= 0 { o.Poll = 5 * time.Minute } if o.StatusPoll <= 0 { o.StatusPoll = 10 * time.Second } if o.MaxQuiesce <= 0 { o.MaxQuiesce = 30 * time.Minute } if o.Logger == nil { o.Logger = log.Default() } return &Loop{ backend: o.Backend, stacks: o.Stacks, markerPath: o.MarkerPath, poll: o.Poll, statusPoll: o.StatusPoll, maxQuiesce: o.MaxQuiesce, logger: o.Logger, now: time.Now, } } // Recover restarts any stacks left stopped by a controller crash mid-quiesce, then clears the // marker. Call ONCE at startup, before Run. Idempotent — StartStack on an already-running stack is // tolerated; an absent/inactive marker is a no-op. 
func (l *Loop) Recover() { m, ok := l.readMarker() if !ok || !m.Active { return } l.logger.Printf("[WARN] [quiesce] crash recovery: a quiesce was in progress (job %q, %d stack(s) stopped) — restarting them", m.JobID, len(m.StoppedStacks)) l.restartAll(m.StoppedStacks) if err := l.clearMarker(); err != nil { l.logger.Printf("[ERROR] [quiesce] crash recovery: clear marker: %v", err) } } // Run polls for a due backup and runs the quiesce cycle, until ctx is cancelled. func (l *Loop) Run(ctx context.Context) { l.logger.Printf("[INFO] [quiesce] loop started (poll %s, max-quiesce %s)", l.poll, l.maxQuiesce) ticker := time.NewTicker(l.poll) defer ticker.Stop() for { select { case <-ctx.Done(): l.logger.Printf("[INFO] [quiesce] loop stopping") return case <-ticker.C: if err := l.runOnce(ctx); err != nil && ctx.Err() == nil { l.logger.Printf("[ERROR] [quiesce] cycle error: %v", err) } } } } // runOnce performs one due-check → (if due) quiesce → backup → poll → unquiesce cycle. Unquiesce // is guaranteed via the deferred closure: a backup error, a status-poll error, the max-quiesce // bound, or context cancellation all still restart the stacks and clear the marker. func (l *Loop) runOnce(ctx context.Context) error { // Defensive single-flight: never quiesce on top of an active marker (Recover clears one left // by a crash; within a process the single loop goroutine already serializes). if m, ok := l.readMarker(); ok && m.Active { l.logger.Printf("[WARN] [quiesce] a marker is already active — skipping this cycle") return nil } due, err := l.backend.Due(ctx) if err != nil { return fmt.Errorf("check due: %w", err) } if !due { return nil } running := l.stacks.RunningAppStacks() marker := Marker{Active: true, StartedAt: l.now(), StoppedStacks: running} if err := l.writeMarker(marker); err != nil { return fmt.Errorf("write quiesce marker (refusing to stop stacks unprotected): %w", err) } // GUARANTEED unquiesce + marker clear — runs on every exit path below. 
unquiesced := false unquiesce := func(reason string) { if unquiesced { return } unquiesced = true l.logger.Printf("[INFO] [quiesce] unquiescing (%s): restarting %d stack(s)", reason, len(running)) l.restartAll(running) if err := l.clearMarker(); err != nil { l.logger.Printf("[ERROR] [quiesce] clear marker: %v", err) } } defer unquiesce("deferred") l.logger.Printf("[INFO] [quiesce] backup due — quiescing %d stack(s): %v", len(running), running) for _, s := range running { if err := l.stacks.StopStack(s); err != nil { l.logger.Printf("[ERROR] [quiesce] stop %s: %v (continuing)", s, err) } } jobID, err := l.backend.StartBackup(ctx) if err != nil { unquiesce("backup start failed") return fmt.Errorf("start backup: %w", err) } marker.JobID = jobID _ = l.writeMarker(marker) // best-effort: record the job id for diagnosis l.logger.Printf("[INFO] [quiesce] backup job %s started — polling to completion", jobID) deadline := l.now().Add(l.maxQuiesce) for { if !l.now().Before(deadline) { l.logger.Printf("[WARN] [quiesce] max-quiesce-duration (%s) exceeded for job %s — unquiescing while the backup continues on the agent", l.maxQuiesce, jobID) unquiesce("max-quiesce guard") return nil } phase, err := l.backend.BackupStatus(ctx) if err != nil { unquiesce("status poll failed") return fmt.Errorf("poll backup status: %w", err) } switch phase { case phaseSnapshotted: // 8B.2: the storage snapshot is taken — the app-stopped state is captured, so the app // may resume NOW (downtime = until-snapshot, not until-backup-done) with no loss of // app-consistency. unquiesce is idempotent (fires once); we then KEEP polling to // done/failed so a new backup isn't started until this one truly finishes (and so a // post-snapshot failure is observed). The marker is cleared on resume — a crash in this // tail leaves the app already up, nothing to recover. 
if !unquiesced { l.logger.Printf("[INFO] [quiesce] backup job %s snapshotted — resuming app early (8B.2)", jobID) unquiesce("snapshotted (early resume)") } case phaseDone: // Fallback (stop/downgraded mode never emits snapshotted): resume at done, exactly 8B. l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID) unquiesce("backup done") return nil case phaseFailed: // If we already resumed at snapshotted, the app is up — just note the backup failed // (recorded for the agent's due window when it stores the failed result). l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID) unquiesce("backup failed") return nil } select { case <-ctx.Done(): unquiesce("controller shutting down") return ctx.Err() case <-time.After(l.statusPoll): } } } func (l *Loop) restartAll(stacks []string) { for _, s := range stacks { if err := l.stacks.StartStack(s); err != nil { l.logger.Printf("[ERROR] [quiesce] restart %s: %v", s, err) } } } // ---- marker persistence (atomic, 0600) -------------------------------------------------- func (l *Loop) writeMarker(m Marker) error { m.Active = true data, err := json.MarshalIndent(m, "", " ") if err != nil { return err } if err := os.MkdirAll(filepath.Dir(l.markerPath), 0o755); err != nil { return err } tmp := l.markerPath + ".tmp" if err := os.WriteFile(tmp, data, 0o600); err != nil { os.Remove(tmp) return err } return os.Rename(tmp, l.markerPath) } func (l *Loop) readMarker() (Marker, bool) { data, err := os.ReadFile(l.markerPath) if err != nil { return Marker{}, false } var m Marker if json.Unmarshal(data, &m) != nil { return Marker{}, false } return m, true } func (l *Loop) clearMarker() error { err := os.Remove(l.markerPath) if os.IsNotExist(err) { return nil } return err }