slice 8B (controller half): app-consistent backup quiesce loop (v0.36.0)

internal/quiesce: poll /backup/due -> quiesce (stop app stacks) -> POST /backup
-> poll /backup/status -> unquiesce (restart exactly those). Crash-safety:
persisted marker before stopping, guaranteed unquiesce (defer), max-quiesce
guard, startup Recover, single-flight. agentapi BackupDue/StartBackup/
BackupStatus; stacks.RunningAppStacks(); config QuiesceConfig; main wiring.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 10:44:52 +02:00
parent 10685b771c
commit 68fc153d9c
7 changed files with 813 additions and 0 deletions
+67
View File
@@ -31,6 +31,7 @@ import (
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
"gitea.dooplex.hu/admin/felhom-controller/internal/quiesce"
"gitea.dooplex.hu/admin/felhom-controller/internal/recovery"
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
@@ -154,6 +155,11 @@ func main() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// --- Quiesce loop (slice 8B): app-consistent backup around the agent vzdump ---
// Runs only when the local API is configured (a provisioned guest) and quiesce is enabled.
// Recover FIRST (restart any stacks left stopped by a crash mid-quiesce), then start the loop.
startQuiesceLoop(ctx, cfg, stackMgr, logger)
// --- Start CPU collector ---
cpuCollector := system.NewCPUCollector(5 * time.Second)
cpuCollector.Start(ctx)
@@ -1301,6 +1307,67 @@ func fileExists(path string) bool {
return err == nil
}
// quiesceBackend is a thin adapter that lets an *agentapi.Client be used as a quiesce.Backend.
// Its methods hand back plain bool/string values, keeping the quiesce package decoupled from
// the agentapi response structs.
type quiesceBackend struct {
	c *agentapi.Client // agent client every Backend call is forwarded to
}
// Due forwards to the agent client's BackupDue call and returns the Due field of its response.
//
// The error is checked before the response is touched: on failure it returns (false, err), so a
// failed call can neither dereference a missing response nor leak a partially populated value.
func (b quiesceBackend) Due(ctx context.Context) (bool, error) {
	r, err := b.c.BackupDue(ctx)
	if err != nil {
		return false, err
	}
	return r.Due, nil
}
// StartBackup forwards to the agent client's StartBackup call and returns the JobID field of
// its response.
//
// The error is checked before the response is touched: on failure it returns ("", err), so a
// failed call can neither dereference a missing response nor hand back a bogus job id.
func (b quiesceBackend) StartBackup(ctx context.Context) (string, error) {
	r, err := b.c.StartBackup(ctx)
	if err != nil {
		return "", err
	}
	return r.JobID, nil
}
// BackupStatus forwards to the agent client's BackupStatus call and returns the Phase field of
// its response.
//
// The error is checked before the response is touched: on failure it returns ("", err), so a
// failed call can neither dereference a missing response nor report a stale or empty phase as
// if it were valid.
func (b quiesceBackend) BackupStatus(ctx context.Context) (string, error) {
	r, err := b.c.BackupStatus(ctx)
	if err != nil {
		return "", err
	}
	return r.Phase, nil
}
// startQuiesceLoop builds the slice-8B quiesce loop and launches it in its own goroutine.
//
// It is a no-op unless both a local-API endpoint and token are configured and quiesce is
// enabled in the config. Nothing here is fatal: if the agent client cannot be constructed the
// loop is simply left disabled and a log line explains why. Recover is called synchronously
// before the loop goroutine is started.
func startQuiesceLoop(ctx context.Context, cfg *config.Config, stackMgr *stacks.Manager, logger *log.Logger) {
	switch {
	case cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "":
		// Not a provisioned guest: there is no agent to back up against.
		return
	case !cfg.Quiesce.QuiesceEnabled():
		logger.Printf("[INFO] [quiesce] disabled by config")
		return
	}

	agent, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint)
	if err != nil {
		logger.Printf("[WARN] [quiesce] disabled (agent client init failed): %v", err)
		return
	}

	// Empty or invalid interval strings fall back to the defaults given here.
	opts := quiesce.Options{
		Backend:    quiesceBackend{c: agent},
		Stacks:     stackMgr,
		MarkerPath: filepath.Join(cfg.Paths.DataDir, "quiesce-state.json"),
		Poll:       parseDurationOr(cfg.Quiesce.PollInterval, 5*time.Minute),
		StatusPoll: parseDurationOr(cfg.Quiesce.StatusPoll, 10*time.Second),
		MaxQuiesce: parseDurationOr(cfg.Quiesce.MaxQuiesce, 30*time.Minute),
		Logger:     logger,
	}
	q := quiesce.New(opts)

	// Crash-safety: bring back any stacks left stopped by a crash in the middle of a quiesce
	// before the loop is allowed to begin a new cycle.
	q.Recover()
	go q.Run(ctx)
}
// parseDurationOr converts s to a time.Duration using time.ParseDuration.
// It returns def whenever s is empty, fails to parse, or parses to a zero or negative
// duration; only a strictly positive parsed value is returned as-is.
func parseDurationOr(s string, def time.Duration) time.Duration {
	// An empty string is rejected by time.ParseDuration, so it takes the fallback path too.
	if parsed, parseErr := time.ParseDuration(s); parseErr == nil && parsed > 0 {
		return parsed
	}
	return def
}
// probeLocalAPI proves the controller↔agent local-API channel at startup and logs this guest's
// mounts (slice 8A). Non-fatal: it only runs when a local-API endpoint is configured, and any
// error is logged for diagnosis without affecting the controller's boot. The leaf SHA-256 from