Files
felhom-controller/controller/internal/quiesce/quiesce.go
T
admin bbed5af662 controller v0.47.0: backups page — whole-guest backup visibility + manual trigger
Part 2 of the USB/backup spec. agentapi: StatusResponse.Backup record, DueResponse
age_seconds, RestoreTestStatus(). New "Rendszermentés (teljes mentés)" section
(read-only: last backup/target PBS-vs-local/next-due/restore-test) + "Mentés most"
manual trigger that goes through the quiesce loop (controller owns quiescing):
quiesce.Loop gains mutex + TriggerNow() (single-flight, async). New
/api/guest-backup/{trigger,status} (distinct from apiRouter's /api/backup/*).
App-data rows relabeled under an "Alkalmazás-mentések" divider. Config → slice 10.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-12 11:15:25 +02:00

335 lines
12 KiB
Go

// Package quiesce implements the slice-8B app-consistent backup loop (doc 03 §6/§8): the
// in-guest controller polls the host agent's GET /backup/due, and when due it QUIESCES (stops its
// app stacks) → POST /backup → polls GET /backup/status to completion → UNQUIESCES (restarts
// exactly the stacks it stopped). An agent-initiated vzdump is crash-consistent only (an LXC has
// no fsfreeze); stopping the stacks first makes the captured state clean-shutdown-consistent.
//
// The correctness centerpiece is crash-safety: a stranded-down app is worse than a crash-consistent
// backup. So: a persisted marker is written BEFORE stopping anything; unquiesce is guaranteed (it
// runs even when the backup errors or times out); a max-quiesce bound restarts the app no matter
// what; and on controller startup Recover() restarts any stacks left stopped by a mid-quiesce crash.
package quiesce
import (
"context"
"encoding/json"
"errors"
"fmt"
"log"
"os"
"path/filepath"
"sync"
"time"
)
// ErrBackupInProgress signals that a quiesce cycle — scheduled or manual — is already underway.
// TriggerNow returns it to enforce single-flight; the "Mentés most" handler is meant to surface
// it as a benign 409 rather than a failure.
var ErrBackupInProgress = errors.New("quiesce: a backup cycle is already in progress")
// Backend abstracts the part of the host agent's local API that the quiesce loop drives. An
// adapter over *agentapi.Client satisfies it. Results are deliberately plain (bool/string) so
// tests can supply trivial fakes.
type Backend interface {
	// Due reports whether the agent considers a backup due right now.
	Due(ctx context.Context) (bool, error)

	// StartBackup asks the agent to start a backup and returns the id of the job it started.
	StartBackup(ctx context.Context) (jobID string, err error)

	// BackupStatus returns the agent's current backup phase string (see the phase constants).
	BackupStatus(ctx context.Context) (phase string, err error)
}
// Stacks is the stack-control surface the loop needs; *stacks.Manager satisfies it.
type Stacks interface {
	// RunningAppStacks lists the stacks a quiesce should stop. It must return only deployed,
	// non-protected, currently-up stacks, because unquiesce restarts exactly this list.
	RunningAppStacks() []string

	// StopStack stops the named stack.
	StopStack(name string) error

	// StartStack starts the named stack.
	StartStack(name string) error
}
// Phase strings returned by Backend.BackupStatus that the loop reacts to (they mirror the agent's
// vocabulary). Any other phase just means "keep polling".
const (
	// phaseSnapshotted (8B.2): the storage snapshot is taken, so the app may resume early.
	phaseSnapshotted = "snapshotted"
	// phaseDone: the backup finished.
	phaseDone = "done"
	// phaseFailed: the backup failed.
	phaseFailed = "failed"
)
// Marker is the on-disk record of an in-flight quiesce. It serves two purposes: crash-safety and
// single-flight. The loop writes it (atomically, 0600) BEFORE stopping any stack, so a controller
// crash mid-quiesce leaves a durable "these stacks were stopped, restart them" note that Recover
// acts on at the next startup.
type Marker struct {
	// Active marks the quiesce as in progress; writeMarker always persists it as true.
	Active bool `json:"active"`
	// StartedAt is when the quiesce cycle began.
	StartedAt time.Time `json:"started_at"`
	// StoppedStacks names the stacks to restart on unquiesce or crash recovery.
	StoppedStacks []string `json:"stopped_stacks"`
	// JobID is the agent's backup job id, recorded best-effort for diagnosis.
	JobID string `json:"job_id"`
}
// Options carries the dependencies and tunables from which New builds a Loop. New substitutes a
// default for any non-positive duration and for a nil Logger.
type Options struct {
	// Collaborators.
	Backend Backend
	Stacks  Stacks

	// MarkerPath is where the quiesce marker is persisted (e.g. <data_dir>/quiesce-state.json).
	MarkerPath string

	// Poll is how often to check /backup/due.
	Poll time.Duration
	// StatusPoll is how often to poll /backup/status while quiesced.
	StatusPoll time.Duration
	// MaxQuiesce is the hard bound on app downtime (unquiesce no matter what).
	MaxQuiesce time.Duration

	// Logger receives the loop's log lines.
	Logger *log.Logger
}
// Loop is the quiesce background loop: it owns the due-check → quiesce → backup → unquiesce cycle
// for both the scheduled path (Run) and the manual path (TriggerNow).
type Loop struct {
	// Collaborators, copied from Options by New.
	backend Backend
	stacks  Stacks

	// Persisted marker location and timing knobs (New fills in defaults).
	markerPath string
	poll       time.Duration
	statusPoll time.Duration
	maxQuiesce time.Duration

	logger *log.Logger
	// now is the clock used for marker timestamps and the max-quiesce deadline; New sets it to
	// time.Now.
	now func() time.Time

	// mu makes the quiesce cycle single-flight across the scheduled loop AND the manual trigger,
	// so the two can never stop the same stacks concurrently. The persisted marker handles
	// crash-safety across restarts; this mutex handles concurrency inside the process, which only
	// exists because of the manual trigger.
	mu sync.Mutex
}
// New constructs a Loop from o. Unset (non-positive) durations fall back to sane defaults — 5m
// between due checks, 10s between status polls, 30m maximum quiesce — and a nil Logger falls back
// to log.Default(). The clock is always time.Now.
func New(o Options) *Loop {
	// Start from the defaults, then overlay whatever the caller actually set.
	l := &Loop{
		backend:    o.Backend,
		stacks:     o.Stacks,
		markerPath: o.MarkerPath,
		poll:       5 * time.Minute,
		statusPoll: 10 * time.Second,
		maxQuiesce: 30 * time.Minute,
		logger:     log.Default(),
		now:        time.Now,
	}
	if o.Poll > 0 {
		l.poll = o.Poll
	}
	if o.StatusPoll > 0 {
		l.statusPoll = o.StatusPoll
	}
	if o.MaxQuiesce > 0 {
		l.maxQuiesce = o.MaxQuiesce
	}
	if o.Logger != nil {
		l.logger = o.Logger
	}
	return l
}
// Recover is the startup half of crash-safety: if the persisted marker says a quiesce was in
// progress, it restarts the stacks recorded there and then removes the marker. Call it ONCE at
// startup, before Run. It is idempotent — StartStack on an already-running stack is tolerated —
// and does nothing when the marker is absent or inactive.
func (l *Loop) Recover() {
	marker, found := l.readMarker()
	if !found {
		return
	}
	if !marker.Active {
		return
	}

	stranded := marker.StoppedStacks
	l.logger.Printf("[WARN] [quiesce] crash recovery: a quiesce was in progress (job %q, %d stack(s) stopped) — restarting them",
		marker.JobID, len(stranded))
	l.restartAll(stranded)

	// The marker is cleared even if individual restarts failed (restartAll only logs those).
	err := l.clearMarker()
	if err != nil {
		l.logger.Printf("[ERROR] [quiesce] crash recovery: clear marker: %v", err)
	}
}
// Run is the scheduled path: every l.poll it runs one due-check/quiesce cycle (runOnce) and keeps
// doing so until ctx is cancelled. Cycle errors are logged, never returned; errors observed after
// cancellation are suppressed because they are just shutdown noise.
func (l *Loop) Run(ctx context.Context) {
	l.logger.Printf("[INFO] [quiesce] loop started (poll %s, max-quiesce %s)", l.poll, l.maxQuiesce)

	tick := time.NewTicker(l.poll)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			err := l.runOnce(ctx)
			if err == nil || ctx.Err() != nil {
				continue
			}
			l.logger.Printf("[ERROR] [quiesce] cycle error: %v", err)
		case <-done:
			l.logger.Printf("[INFO] [quiesce] loop stopping")
			return
		}
	}
}
// runOnce is one scheduled tick: take the single-flight lock, refuse to run over an active
// marker, ask the backend whether a backup is due, and if so hand off to quiesceAndPoll (which
// owns the guaranteed-unquiesce logic). A busy lock, an active marker and "not due" all return
// nil; only a failed due-check or a failed cycle returns an error.
func (l *Loop) runOnce(ctx context.Context) error {
	// Single-flight: a cycle (scheduled or manual) already holds the lock → skip this tick.
	if acquired := l.mu.TryLock(); !acquired {
		l.logger.Printf("[INFO] [quiesce] a backup cycle is already running — skipping this scheduled check")
		return nil
	}
	defer l.mu.Unlock()

	// Defensive single-flight: never quiesce on top of an active marker. Recover clears one left
	// behind by a crash; the mutex above serializes within the process.
	if existing, found := l.readMarker(); found && existing.Active {
		l.logger.Printf("[WARN] [quiesce] a marker is already active — skipping this cycle")
		return nil
	}

	switch due, err := l.backend.Due(ctx); {
	case err != nil:
		return fmt.Errorf("check due: %w", err)
	case !due:
		return nil
	default:
		return l.quiesceAndPoll(ctx)
	}
}
// TriggerNow is the manual "Mentés most" entry point: it forces an app-consistent backup NOW,
// skipping the /backup/due check but otherwise running the SAME marked quiesce flow as the
// scheduled loop (stop stacks → POST /backup → poll → resume). The controller — not the agent —
// owns quiescing (the agent's vzdump is crash-consistent only), so manual backups MUST come
// through here.
//
// It is single-flight via the same mutex as the scheduled path: if a scheduled or manual cycle is
// running, or an active marker exists, it returns ErrBackupInProgress. Otherwise it returns nil
// immediately and the cycle runs ASYNCHRONOUSLY (it can take minutes); the caller polls
// /backup/status for progress.
func (l *Loop) TriggerNow() error {
	if acquired := l.mu.TryLock(); !acquired {
		return ErrBackupInProgress
	}
	// The lock is ours from here on: hand it to the goroutine or release it before returning.
	if existing, found := l.readMarker(); found && existing.Active {
		l.mu.Unlock()
		return ErrBackupInProgress
	}

	// Detached from any request context; bounded so a hung backup still unquiesces.
	budget := l.maxQuiesce + 5*time.Minute
	cycle := func() {
		defer l.mu.Unlock()
		ctx, cancel := context.WithTimeout(context.Background(), budget)
		defer cancel()

		l.logger.Printf("[INFO] [quiesce] manual backup requested — quiescing now")
		err := l.quiesceAndPoll(ctx)
		if err == nil {
			return
		}
		l.logger.Printf("[ERROR] [quiesce] manual backup cycle error: %v", err)
	}
	go cycle()
	return nil
}
// quiesceAndPoll runs one marked quiesce cycle: persist the marker → stop the running app stacks →
// start the backup → poll its status → restart exactly the stacks that were stopped. The caller
// MUST hold l.mu.
//
// Resuming the app is guaranteed: every exit path below (backup start error, status-poll error,
// the max-quiesce bound, context cancellation, or normal completion) goes through the one-shot
// resume closure, and a deferred call backs up the explicit ones. The only early return that does
// not resume is a failed marker write, because nothing has been stopped at that point.
//
// It returns an error for a failed marker write, a failed backup start, a failed status poll, or
// context cancellation. A backup that finishes, fails on the agent, or outlives max-quiesce
// returns nil.
func (l *Loop) quiesceAndPoll(ctx context.Context) error {
	stopped := l.stacks.RunningAppStacks()
	state := Marker{Active: true, StartedAt: l.now(), StoppedStacks: stopped}
	if err := l.writeMarker(state); err != nil {
		return fmt.Errorf("write quiesce marker (refusing to stop stacks unprotected): %w", err)
	}

	// One-shot resume: restart the stacks and clear the marker, at most once per cycle.
	resumed := false
	resume := func(reason string) {
		if resumed {
			return
		}
		resumed = true
		l.logger.Printf("[INFO] [quiesce] unquiescing (%s): restarting %d stack(s)", reason, len(stopped))
		l.restartAll(stopped)
		if err := l.clearMarker(); err != nil {
			l.logger.Printf("[ERROR] [quiesce] clear marker: %v", err)
		}
	}
	defer resume("deferred")

	l.logger.Printf("[INFO] [quiesce] backup due — quiescing %d stack(s): %v", len(stopped), stopped)
	for i := range stopped {
		name := stopped[i]
		// A stack that fails to stop does not abort the cycle; it stays on the restart list.
		if err := l.stacks.StopStack(name); err != nil {
			l.logger.Printf("[ERROR] [quiesce] stop %s: %v (continuing)", name, err)
		}
	}

	jobID, err := l.backend.StartBackup(ctx)
	if err != nil {
		resume("backup start failed")
		return fmt.Errorf("start backup: %w", err)
	}
	state.JobID = jobID
	_ = l.writeMarker(state) // best-effort: record the job id for diagnosis
	l.logger.Printf("[INFO] [quiesce] backup job %s started — polling to completion", jobID)

	// The max-quiesce clock starts once the backup has been started.
	deadline := l.now().Add(l.maxQuiesce)
	for {
		if current := l.now(); !current.Before(deadline) {
			l.logger.Printf("[WARN] [quiesce] max-quiesce-duration (%s) exceeded for job %s — unquiescing while the backup continues on the agent",
				l.maxQuiesce, jobID)
			resume("max-quiesce guard")
			return nil
		}

		phase, err := l.backend.BackupStatus(ctx)
		if err != nil {
			resume("status poll failed")
			return fmt.Errorf("poll backup status: %w", err)
		}

		if phase == phaseSnapshotted {
			// 8B.2: the storage snapshot already captured the app-stopped state, so the app can
			// come back NOW without losing app-consistency (downtime = until-snapshot, not
			// until-backup-done). Polling continues to done/failed so that no new backup starts
			// before this one truly finishes and a post-snapshot failure is still observed. The
			// marker is cleared on resume: a crash during this tail finds the app already up.
			if !resumed {
				l.logger.Printf("[INFO] [quiesce] backup job %s snapshotted — resuming app early (8B.2)", jobID)
				resume("snapshotted (early resume)")
			}
		} else if phase == phaseDone {
			// Fallback (stop/downgraded mode never emits snapshotted): resume at done, exactly 8B.
			l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID)
			resume("backup done")
			return nil
		} else if phase == phaseFailed {
			// After an early resume the app is already up; this only records that the backup
			// itself failed (the agent stores the failed result for its due window).
			l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID)
			resume("backup failed")
			return nil
		}

		// Any other phase: wait one status interval, unless the controller is shutting down.
		select {
		case <-time.After(l.statusPoll):
		case <-ctx.Done():
			resume("controller shutting down")
			return ctx.Err()
		}
	}
}
// restartAll starts every named stack in order. A stack that fails to start is logged and
// skipped, so one bad stack never prevents the remaining ones from coming back up.
func (l *Loop) restartAll(stacks []string) {
	for i := range stacks {
		name := stacks[i]
		err := l.stacks.StartStack(name)
		if err == nil {
			continue
		}
		l.logger.Printf("[ERROR] [quiesce] restart %s: %v", name, err)
	}
}
// ---- marker persistence (atomic, 0600) --------------------------------------------------
// writeMarker persists m at l.markerPath as the active quiesce record, creating the parent
// directory if needed. Active is forced to true: a marker on disk always means "a quiesce is in
// progress".
//
// The write is atomic and durable: the JSON goes to "<markerPath>.tmp" (mode 0600), is fsynced,
// and only then renamed over the real path. A reader therefore sees either the old marker or the
// complete new one, and a host crash right after the rename cannot leave an empty or truncated
// marker. On any failure the temp file is removed (best-effort) and the error is returned; an
// existing marker is left untouched.
func (l *Loop) writeMarker(m Marker) error {
	m.Active = true
	data, err := json.MarshalIndent(m, "", " ")
	if err != nil {
		return err
	}
	if err := os.MkdirAll(filepath.Dir(l.markerPath), 0o755); err != nil {
		return err
	}
	tmp := l.markerPath + ".tmp"
	if err := writeMarkerFile(tmp, data); err != nil {
		_ = os.Remove(tmp) // best-effort cleanup; the write error is the one worth reporting
		return err
	}
	if err := os.Rename(tmp, l.markerPath); err != nil {
		_ = os.Remove(tmp) // do not leave a stale temp file behind when the swap fails
		return err
	}
	return nil
}

// writeMarkerFile writes data to path (created or truncated, mode 0600) and fsyncs it before
// closing, so the contents are on stable storage before the caller renames the file into place.
func writeMarkerFile(path string, data []byte) error {
	f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
	if err != nil {
		return err
	}
	if _, err := f.Write(data); err != nil {
		f.Close()
		return err
	}
	if err := f.Sync(); err != nil {
		f.Close()
		return err
	}
	return f.Close()
}
// readMarker loads the persisted marker from l.markerPath. The bool is false when no usable
// marker exists: the file is absent, cannot be read, or does not parse as JSON. In those cases the
// returned Marker is the zero value.
//
// An absent file is the normal idle state and stays silent. A marker that exists but cannot be
// read or parsed is still treated as absent, but is logged: otherwise a damaged marker would be
// indistinguishable from "no quiesce in progress", and Recover would skip restarting stopped
// stacks without leaving any trace.
func (l *Loop) readMarker() (Marker, bool) {
	data, err := os.ReadFile(l.markerPath)
	if err != nil {
		if !os.IsNotExist(err) {
			l.logger.Printf("[WARN] [quiesce] read marker %s: %v — treating it as absent", l.markerPath, err)
		}
		return Marker{}, false
	}
	var m Marker
	if err := json.Unmarshal(data, &m); err != nil {
		l.logger.Printf("[WARN] [quiesce] marker %s is corrupt: %v — treating it as absent", l.markerPath, err)
		return Marker{}, false
	}
	return m, true
}
// clearMarker deletes the persisted marker. A marker that is already gone counts as success, so
// clearing is idempotent; any other removal error is returned.
func (l *Loop) clearMarker() error {
	if err := os.Remove(l.markerPath); err != nil && !os.IsNotExist(err) {
		return err
	}
	return nil
}