slice 8B (controller half): app-consistent backup quiesce loop (v0.36.0)

internal/quiesce: poll /backup/due -> quiesce (stop app stacks) -> POST /backup
-> poll /backup/status -> unquiesce (restart exactly those). Crash-safety:
persisted marker before stopping, guaranteed unquiesce (defer), max-quiesce
guard, startup Recover, single-flight. agentapi BackupDue/StartBackup/
BackupStatus; stacks.RunningAppStacks(); config QuiesceConfig; main wiring.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 10:44:52 +02:00
parent 10685b771c
commit 68fc153d9c
7 changed files with 813 additions and 0 deletions
+67
View File
@@ -31,6 +31,7 @@ import (
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
"gitea.dooplex.hu/admin/felhom-controller/internal/quiesce"
"gitea.dooplex.hu/admin/felhom-controller/internal/recovery"
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
@@ -154,6 +155,11 @@ func main() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// --- Quiesce loop (slice 8B): app-consistent backup around the agent vzdump ---
// Runs only when the local API is configured (a provisioned guest) and quiesce is enabled.
// Recover FIRST (restart any stacks left stopped by a crash mid-quiesce), then start the loop.
startQuiesceLoop(ctx, cfg, stackMgr, logger)
// --- Start CPU collector ---
cpuCollector := system.NewCPUCollector(5 * time.Second)
cpuCollector.Start(ctx)
@@ -1301,6 +1307,67 @@ func fileExists(path string) bool {
return err == nil
}
// quiesceBackend adapts *agentapi.Client to quiesce.Backend (bool/string, decoupled from the
// agentapi response structs).
type quiesceBackend struct{ c *agentapi.Client }
func (b quiesceBackend) Due(ctx context.Context) (bool, error) {
r, err := b.c.BackupDue(ctx)
return r.Due, err
}
func (b quiesceBackend) StartBackup(ctx context.Context) (string, error) {
r, err := b.c.StartBackup(ctx)
return r.JobID, err
}
func (b quiesceBackend) BackupStatus(ctx context.Context) (string, error) {
r, err := b.c.BackupStatus(ctx)
return r.Phase, err
}
// startQuiesceLoop wires + starts the slice-8B quiesce loop when the local API is configured and
// quiesce is enabled. It Recovers (restarts stacks left stopped by a mid-quiesce crash) before
// starting the loop goroutine. Non-fatal: any misconfig disables the loop with a log line.
func startQuiesceLoop(ctx context.Context, cfg *config.Config, stackMgr *stacks.Manager, logger *log.Logger) {
if cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "" {
return // not a provisioned guest — no agent to back up against
}
if !cfg.Quiesce.QuiesceEnabled() {
logger.Printf("[INFO] [quiesce] disabled by config")
return
}
client, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint)
if err != nil {
logger.Printf("[WARN] [quiesce] disabled (agent client init failed): %v", err)
return
}
poll := parseDurationOr(cfg.Quiesce.PollInterval, 5*time.Minute)
statusPoll := parseDurationOr(cfg.Quiesce.StatusPoll, 10*time.Second)
maxQuiesce := parseDurationOr(cfg.Quiesce.MaxQuiesce, 30*time.Minute)
loop := quiesce.New(quiesce.Options{
Backend: quiesceBackend{c: client},
Stacks: stackMgr,
MarkerPath: filepath.Join(cfg.Paths.DataDir, "quiesce-state.json"),
Poll: poll,
StatusPoll: statusPoll,
MaxQuiesce: maxQuiesce,
Logger: logger,
})
loop.Recover() // crash-safety: restart any stacks stranded-down by a mid-quiesce crash
go loop.Run(ctx)
}
// parseDurationOr parses a duration string, falling back to def on empty/invalid input.
func parseDurationOr(s string, def time.Duration) time.Duration {
if s == "" {
return def
}
d, err := time.ParseDuration(s)
if err != nil || d <= 0 {
return def
}
return d
}
// probeLocalAPI proves the controller↔agent local-API channel at startup and logs this guest's
// mounts (slice 8A). Non-fatal: it only runs when a local-API endpoint is configured, and any
// error is logged for diagnosis without affecting the controller's boot. The leaf SHA-256 from
+104
View File
@@ -7,6 +7,7 @@
package agentapi
import (
"bytes"
"context"
"crypto/sha256"
"crypto/tls"
@@ -102,6 +103,77 @@ func (c *Client) Storage(ctx context.Context) (StorageResponse, error) {
return out, nil
}
// ---- slice 8B: app-consistent backup (quiesce loop) -------------------------------------
// DueResponse mirrors the agent's GET /backup/due payload.
type DueResponse struct {
VMID int `json:"vmid"`
Due bool `json:"due"`
Reason string `json:"reason"`
}
// BackupResponse mirrors the agent's POST /backup payload.
type BackupResponse struct {
VMID int `json:"vmid"`
JobID string `json:"job_id"`
Phase string `json:"phase"`
}
// StatusResponse mirrors the agent's GET /backup/status payload.
type StatusResponse struct {
VMID int `json:"vmid"`
Phase string `json:"phase"` // idle | running | done | failed
JobID string `json:"job_id"`
Error string `json:"error"`
}
// Backup status phases (mirror the agent's vocabulary).
const (
PhaseIdle = "idle"
PhaseRunning = "running"
PhaseDone = "done"
PhaseFailed = "failed"
)
// BackupDue reports whether a policy-scheduled backup is due for this guest (the quiesce trigger).
func (c *Client) BackupDue(ctx context.Context) (DueResponse, error) {
var out DueResponse
body, err := c.get(ctx, "/backup/due")
if err != nil {
return out, err
}
if err := json.Unmarshal(body, &out); err != nil {
return out, fmt.Errorf("agentapi: decode /backup/due: %w", err)
}
return out, nil
}
// StartBackup enqueues a backup of this guest (the agent vzdump) and returns the job to poll.
func (c *Client) StartBackup(ctx context.Context) (BackupResponse, error) {
var out BackupResponse
body, err := c.post(ctx, "/backup", struct{}{})
if err != nil {
return out, err
}
if err := json.Unmarshal(body, &out); err != nil {
return out, fmt.Errorf("agentapi: decode POST /backup: %w", err)
}
return out, nil
}
// BackupStatus reports the current/last backup job phase for this guest.
func (c *Client) BackupStatus(ctx context.Context) (StatusResponse, error) {
var out StatusResponse
body, err := c.get(ctx, "/backup/status")
if err != nil {
return out, err
}
if err := json.Unmarshal(body, &out); err != nil {
return out, fmt.Errorf("agentapi: decode /backup/status: %w", err)
}
return out, nil
}
// get issues an authenticated GET and unwraps the {ok,data,error} envelope.
func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
@@ -128,6 +200,38 @@ func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error)
return env.Data, nil
}
// post issues an authenticated JSON POST and unwraps the {ok,data,error} envelope. The agent
// returns 200 or 202 for accepted requests.
func (c *Client) post(ctx context.Context, path string, body any) (json.RawMessage, error) {
buf, err := json.Marshal(body)
if err != nil {
return nil, err
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf))
if err != nil {
return nil, err
}
req.Header.Set("Authorization", "Bearer "+c.token)
req.Header.Set("Content-Type", "application/json")
resp, err := c.hc.Do(req)
if err != nil {
return nil, fmt.Errorf("agentapi: POST %s: %w", path, err)
}
defer resp.Body.Close()
raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
return nil, fmt.Errorf("agentapi: POST %s: HTTP %d", path, resp.StatusCode)
}
var env apiResponse
if err := json.Unmarshal(raw, &env); err != nil {
return nil, fmt.Errorf("agentapi: POST %s: bad envelope: %w", path, err)
}
if !env.OK {
return nil, fmt.Errorf("agentapi: POST %s: %s", path, env.Error)
}
return env.Data, nil
}
// normalizeFingerprint lowercases and strips ':'/' ' separators, requiring a 64-hex SHA-256.
func normalizeFingerprint(fp string) (string, error) {
s := strings.ToLower(strings.NewReplacer(":", "", " ", "", "\t", "").Replace(strings.TrimSpace(fp)))
+20
View File
@@ -29,6 +29,7 @@ type Config struct {
Assets AssetsConfig `yaml:"assets"`
System SystemConfig `yaml:"system"`
LocalAPI LocalAPIConfig `yaml:"local_api"`
Quiesce QuiesceConfig `yaml:"quiesce"`
}
// LocalAPIConfig is the in-guest controller's handle on the host agent's per-guest local API
@@ -40,6 +41,22 @@ type LocalAPIConfig struct {
Token string `yaml:"token"` // per-guest bearer; SECRET
}
// QuiesceConfig tunes the slice-8B app-consistent backup loop (doc 03 §8): poll the agent's
// /backup/due, and when due stop the app stacks around the agent vzdump for a clean-shutdown
// backup. Runs only when the local API is configured (a provisioned guest). MaxQuiesce bounds the
// app downtime — the controller unquiesces no matter what once it elapses.
type QuiesceConfig struct {
Enabled *bool `yaml:"enabled"` // nil/true → on when local API configured; false → off
PollInterval string `yaml:"poll_interval"` // /backup/due check cadence (default "5m")
StatusPoll string `yaml:"status_poll_interval"` // /backup/status poll while quiesced (default "10s")
MaxQuiesce string `yaml:"max_quiesce_duration"` // hard downtime bound (default "30m")
}
// QuiesceEnabled reports whether the quiesce loop should run (default on unless explicitly false).
func (q QuiesceConfig) QuiesceEnabled() bool {
return q.Enabled == nil || *q.Enabled
}
type SystemConfig struct {
ReservedMemoryMB int `yaml:"reserved_memory_mb"`
}
@@ -278,6 +295,9 @@ func applyDefaults(cfg *Config) {
d(&cfg.Assets.SourceURL, "https://felhom.eu")
d(&cfg.Assets.SyncSchedule, "05:00")
di(&cfg.System.ReservedMemoryMB, 384)
d(&cfg.Quiesce.PollInterval, "5m")
d(&cfg.Quiesce.StatusPoll, "10s")
d(&cfg.Quiesce.MaxQuiesce, "30m")
}
func applyEnvOverrides(cfg *Config) {
+266
View File
@@ -0,0 +1,266 @@
// Package quiesce implements the slice-8B app-consistent backup loop (doc 03 §6/§8): the
// in-guest controller polls the host agent's GET /backup/due, and when due it QUIESCES (stops its
// app stacks) → POST /backup → polls GET /backup/status to completion → UNQUIESCES (restarts
// exactly the stacks it stopped). An agent-initiated vzdump is crash-consistent only (an LXC has
// no fsfreeze); stopping the stacks first makes the captured state clean-shutdown-consistent.
//
// The correctness centerpiece is crash-safety: a stranded-down app is worse than a crash-consistent
// backup. So: a persisted marker is written BEFORE stopping anything; unquiesce is guaranteed (it
// runs even when the backup errors or times out); a max-quiesce bound restarts the app no matter
// what; and on controller startup Recover() restarts any stacks left stopped by a mid-quiesce crash.
package quiesce
import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"time"
)
// Backend is the agent local-API surface the loop needs (satisfied by an adapter over
// *agentapi.Client). Kept minimal (bool/string) so the loop is testable with plain fakes.
type Backend interface {
Due(ctx context.Context) (bool, error)
StartBackup(ctx context.Context) (jobID string, err error)
BackupStatus(ctx context.Context) (phase string, err error)
}
// Stacks is the stack-control surface (satisfied by *stacks.Manager). RunningAppStacks must return
// only deployed, non-protected, currently-up stacks (so unquiesce restarts exactly those).
type Stacks interface {
RunningAppStacks() []string
StopStack(name string) error
StartStack(name string) error
}
// Backup status phases (mirror the agent's vocabulary).
const (
phaseDone = "done"
phaseFailed = "failed"
)
// Marker is the persisted quiesce state — the crash-safety + single-flight record. It is written
// (atomically, 0600) BEFORE any stack is stopped, so a controller crash mid-quiesce leaves a
// durable "these stacks were stopped, restart them" note that Recover honors at next startup.
type Marker struct {
Active bool `json:"active"`
StartedAt time.Time `json:"started_at"`
StoppedStacks []string `json:"stopped_stacks"`
JobID string `json:"job_id"`
}
// Options configures a Loop.
type Options struct {
Backend Backend
Stacks Stacks
MarkerPath string // persisted marker (e.g. <data_dir>/quiesce-state.json)
Poll time.Duration // how often to check /backup/due
StatusPoll time.Duration // how often to poll /backup/status while quiesced
MaxQuiesce time.Duration // hard bound on app downtime (unquiesce no matter what)
Logger *log.Logger
}
// Loop is the quiesce background loop.
type Loop struct {
backend Backend
stacks Stacks
markerPath string
poll time.Duration
statusPoll time.Duration
maxQuiesce time.Duration
logger *log.Logger
now func() time.Time
}
// New builds a Loop with sane defaults for any unset duration.
func New(o Options) *Loop {
if o.Poll <= 0 {
o.Poll = 5 * time.Minute
}
if o.StatusPoll <= 0 {
o.StatusPoll = 10 * time.Second
}
if o.MaxQuiesce <= 0 {
o.MaxQuiesce = 30 * time.Minute
}
if o.Logger == nil {
o.Logger = log.Default()
}
return &Loop{
backend: o.Backend, stacks: o.Stacks, markerPath: o.MarkerPath,
poll: o.Poll, statusPoll: o.StatusPoll, maxQuiesce: o.MaxQuiesce,
logger: o.Logger, now: time.Now,
}
}
// Recover restarts any stacks left stopped by a controller crash mid-quiesce, then clears the
// marker. Call ONCE at startup, before Run. Idempotent — StartStack on an already-running stack is
// tolerated; an absent/inactive marker is a no-op.
func (l *Loop) Recover() {
m, ok := l.readMarker()
if !ok || !m.Active {
return
}
l.logger.Printf("[WARN] [quiesce] crash recovery: a quiesce was in progress (job %q, %d stack(s) stopped) — restarting them",
m.JobID, len(m.StoppedStacks))
l.restartAll(m.StoppedStacks)
if err := l.clearMarker(); err != nil {
l.logger.Printf("[ERROR] [quiesce] crash recovery: clear marker: %v", err)
}
}
// Run polls for a due backup and runs the quiesce cycle, until ctx is cancelled.
func (l *Loop) Run(ctx context.Context) {
l.logger.Printf("[INFO] [quiesce] loop started (poll %s, max-quiesce %s)", l.poll, l.maxQuiesce)
ticker := time.NewTicker(l.poll)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
l.logger.Printf("[INFO] [quiesce] loop stopping")
return
case <-ticker.C:
if err := l.runOnce(ctx); err != nil && ctx.Err() == nil {
l.logger.Printf("[ERROR] [quiesce] cycle error: %v", err)
}
}
}
}
// runOnce performs one due-check → (if due) quiesce → backup → poll → unquiesce cycle. Unquiesce
// is guaranteed via the deferred closure: a backup error, a status-poll error, the max-quiesce
// bound, or context cancellation all still restart the stacks and clear the marker.
func (l *Loop) runOnce(ctx context.Context) error {
// Defensive single-flight: never quiesce on top of an active marker (Recover clears one left
// by a crash; within a process the single loop goroutine already serializes).
if m, ok := l.readMarker(); ok && m.Active {
l.logger.Printf("[WARN] [quiesce] a marker is already active — skipping this cycle")
return nil
}
due, err := l.backend.Due(ctx)
if err != nil {
return fmt.Errorf("check due: %w", err)
}
if !due {
return nil
}
running := l.stacks.RunningAppStacks()
marker := Marker{Active: true, StartedAt: l.now(), StoppedStacks: running}
if err := l.writeMarker(marker); err != nil {
return fmt.Errorf("write quiesce marker (refusing to stop stacks unprotected): %w", err)
}
// GUARANTEED unquiesce + marker clear — runs on every exit path below.
unquiesced := false
unquiesce := func(reason string) {
if unquiesced {
return
}
unquiesced = true
l.logger.Printf("[INFO] [quiesce] unquiescing (%s): restarting %d stack(s)", reason, len(running))
l.restartAll(running)
if err := l.clearMarker(); err != nil {
l.logger.Printf("[ERROR] [quiesce] clear marker: %v", err)
}
}
defer unquiesce("deferred")
l.logger.Printf("[INFO] [quiesce] backup due — quiescing %d stack(s): %v", len(running), running)
for _, s := range running {
if err := l.stacks.StopStack(s); err != nil {
l.logger.Printf("[ERROR] [quiesce] stop %s: %v (continuing)", s, err)
}
}
jobID, err := l.backend.StartBackup(ctx)
if err != nil {
unquiesce("backup start failed")
return fmt.Errorf("start backup: %w", err)
}
marker.JobID = jobID
_ = l.writeMarker(marker) // best-effort: record the job id for diagnosis
l.logger.Printf("[INFO] [quiesce] backup job %s started — polling to completion", jobID)
deadline := l.now().Add(l.maxQuiesce)
for {
if !l.now().Before(deadline) {
l.logger.Printf("[WARN] [quiesce] max-quiesce-duration (%s) exceeded for job %s — unquiescing while the backup continues on the agent",
l.maxQuiesce, jobID)
unquiesce("max-quiesce guard")
return nil
}
phase, err := l.backend.BackupStatus(ctx)
if err != nil {
unquiesce("status poll failed")
return fmt.Errorf("poll backup status: %w", err)
}
switch phase {
case phaseDone:
l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID)
unquiesce("backup done")
return nil
case phaseFailed:
l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID)
unquiesce("backup failed")
return nil
}
select {
case <-ctx.Done():
unquiesce("controller shutting down")
return ctx.Err()
case <-time.After(l.statusPoll):
}
}
}
func (l *Loop) restartAll(stacks []string) {
for _, s := range stacks {
if err := l.stacks.StartStack(s); err != nil {
l.logger.Printf("[ERROR] [quiesce] restart %s: %v", s, err)
}
}
}
// ---- marker persistence (atomic, 0600) --------------------------------------------------
func (l *Loop) writeMarker(m Marker) error {
m.Active = true
data, err := json.MarshalIndent(m, "", " ")
if err != nil {
return err
}
if err := os.MkdirAll(filepath.Dir(l.markerPath), 0o755); err != nil {
return err
}
tmp := l.markerPath + ".tmp"
if err := os.WriteFile(tmp, data, 0o600); err != nil {
os.Remove(tmp)
return err
}
return os.Rename(tmp, l.markerPath)
}
func (l *Loop) readMarker() (Marker, bool) {
data, err := os.ReadFile(l.markerPath)
if err != nil {
return Marker{}, false
}
var m Marker
if json.Unmarshal(data, &m) != nil {
return Marker{}, false
}
return m, true
}
func (l *Loop) clearMarker() error {
err := os.Remove(l.markerPath)
if os.IsNotExist(err) {
return nil
}
return err
}
+302
View File
@@ -0,0 +1,302 @@
package quiesce
import (
"context"
"io"
"log"
"os"
"path/filepath"
"sync"
"testing"
"time"
)
// fakeStacks records stop/start calls in order.
type fakeStacks struct {
mu sync.Mutex
running []string
stopped []string
started []string
stopErr map[string]error
}
func (f *fakeStacks) RunningAppStacks() []string {
f.mu.Lock()
defer f.mu.Unlock()
return append([]string(nil), f.running...)
}
func (f *fakeStacks) StopStack(name string) error {
f.mu.Lock()
defer f.mu.Unlock()
f.stopped = append(f.stopped, name)
if f.stopErr != nil {
return f.stopErr[name]
}
return nil
}
func (f *fakeStacks) StartStack(name string) error {
f.mu.Lock()
defer f.mu.Unlock()
f.started = append(f.started, name)
return nil
}
func (f *fakeStacks) startedNames() []string {
f.mu.Lock()
defer f.mu.Unlock()
return append([]string(nil), f.started...)
}
func (f *fakeStacks) stoppedNames() []string {
f.mu.Lock()
defer f.mu.Unlock()
return append([]string(nil), f.stopped...)
}
// fakeBackend drives the agent-side responses.
type fakeBackend struct {
due bool
dueErr error
startErr error
jobID string
phases []string // returned in sequence by BackupStatus; last value repeats
statusErr error
startCalls int
statusCalls int
mu sync.Mutex
}
func (b *fakeBackend) Due(context.Context) (bool, error) { return b.due, b.dueErr }
func (b *fakeBackend) StartBackup(context.Context) (string, error) {
b.mu.Lock()
b.startCalls++
b.mu.Unlock()
if b.startErr != nil {
return "", b.startErr
}
if b.jobID == "" {
b.jobID = "job-1"
}
return b.jobID, nil
}
func (b *fakeBackend) BackupStatus(context.Context) (string, error) {
if b.statusErr != nil {
return "", b.statusErr
}
b.mu.Lock()
defer b.mu.Unlock()
i := b.statusCalls
b.statusCalls++
if i >= len(b.phases) {
if len(b.phases) == 0 {
return PhaseRunning, nil
}
return b.phases[len(b.phases)-1], nil
}
return b.phases[i], nil
}
const PhaseRunning = "running" // local alias for readability in tests
func testLoop(t *testing.T, be Backend, st Stacks) *Loop {
t.Helper()
l := New(Options{
Backend: be,
Stacks: st,
MarkerPath: filepath.Join(t.TempDir(), "quiesce-state.json"),
Poll: time.Hour,
StatusPoll: time.Millisecond,
MaxQuiesce: 5 * time.Second,
Logger: log.New(io.Discard, "", 0),
})
return l
}
// Happy path: due → stop running stacks → start backup → poll to done → restart exactly those → clear marker.
func TestRunOnce_HappyPath(t *testing.T) {
be := &fakeBackend{due: true, phases: []string{"running", "running", "done"}}
st := &fakeStacks{running: []string{"nextcloud", "vaultwarden"}}
l := testLoop(t, be, st)
if err := l.runOnce(context.Background()); err != nil {
t.Fatalf("runOnce: %v", err)
}
if got := st.stoppedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" {
t.Fatalf("stopped wrong/order: %v", got)
}
if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" {
t.Fatalf("started wrong/order: %v", got)
}
if be.startCalls != 1 {
t.Fatalf("expected 1 backup, got %d", be.startCalls)
}
if _, ok := l.readMarker(); ok {
t.Fatal("marker not cleared after a successful cycle")
}
}
// Not due → nothing happens.
func TestRunOnce_NotDue(t *testing.T) {
be := &fakeBackend{due: false}
st := &fakeStacks{running: []string{"a"}}
l := testLoop(t, be, st)
if err := l.runOnce(context.Background()); err != nil {
t.Fatal(err)
}
if len(st.stoppedNames()) != 0 || be.startCalls != 0 {
t.Fatal("acted while not due")
}
}
// Backup START fails → stacks STILL restarted (guaranteed unquiesce).
func TestRunOnce_BackupStartFails_StillRestarts(t *testing.T) {
be := &fakeBackend{due: true, startErr: errString("boom")}
st := &fakeStacks{running: []string{"a", "b"}}
l := testLoop(t, be, st)
_ = l.runOnce(context.Background())
if got := st.startedNames(); len(got) != 2 {
t.Fatalf("stacks not restarted after a backup-start failure: %v", got)
}
if _, ok := l.readMarker(); ok {
t.Fatal("marker not cleared after a failed backup")
}
}
// Backup reports failed → stacks restarted.
func TestRunOnce_BackupFailedPhase_Restarts(t *testing.T) {
be := &fakeBackend{due: true, phases: []string{"running", "failed"}}
st := &fakeStacks{running: []string{"a"}}
l := testLoop(t, be, st)
_ = l.runOnce(context.Background())
if got := st.startedNames(); len(got) != 1 || got[0] != "a" {
t.Fatalf("not restarted after failed phase: %v", got)
}
}
// Max-quiesce guard: status never reaches done → stacks restarted at the bound.
func TestRunOnce_MaxQuiesceGuard(t *testing.T) {
be := &fakeBackend{due: true, phases: []string{"running"}} // never done
st := &fakeStacks{running: []string{"a", "b"}}
l := testLoop(t, be, st)
// shrink the bound + use a controllable clock so the guard fires fast
base := time.Now()
steps := 0
l.now = func() time.Time {
steps++
return base.Add(time.Duration(steps) * time.Minute) // each call advances 1m
}
l.maxQuiesce = 2 * time.Minute
done := make(chan error, 1)
go func() { done <- l.runOnce(context.Background()) }()
select {
case err := <-done:
if err != nil {
t.Fatalf("runOnce returned error: %v", err)
}
case <-time.After(2 * time.Second):
t.Fatal("runOnce did not return — max-quiesce guard did not fire")
}
if got := st.startedNames(); len(got) != 2 {
t.Fatalf("stacks not restarted at the max-quiesce bound: %v", got)
}
if _, ok := l.readMarker(); ok {
t.Fatal("marker not cleared after the guard fired")
}
}
// Crash recovery: a marker present at startup → recorded stacks restarted, marker cleared.
func TestRecover_RestartsFromMarker(t *testing.T) {
st := &fakeStacks{}
l := testLoop(t, &fakeBackend{}, st)
// simulate a crash mid-quiesce: an active marker with stopped stacks
if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"nextcloud", "immich"}, JobID: "job-x"}); err != nil {
t.Fatal(err)
}
l.Recover()
if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "immich" {
t.Fatalf("recovery did not restart the recorded stacks: %v", got)
}
if _, ok := l.readMarker(); ok {
t.Fatal("recovery did not clear the marker")
}
}
// Recover with no marker is a no-op.
func TestRecover_NoMarker(t *testing.T) {
st := &fakeStacks{}
l := testLoop(t, &fakeBackend{}, st)
l.Recover()
if len(st.startedNames()) != 0 {
t.Fatal("recovery restarted stacks with no marker present")
}
}
// Single-flight: a cycle that begins with an active marker present is a no-op (no second backup).
func TestRunOnce_SingleFlight(t *testing.T) {
be := &fakeBackend{due: true, phases: []string{"done"}}
st := &fakeStacks{running: []string{"a"}}
l := testLoop(t, be, st)
if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"a"}}); err != nil {
t.Fatal(err)
}
if err := l.runOnce(context.Background()); err != nil {
t.Fatal(err)
}
if be.startCalls != 0 {
t.Fatal("started a backup while a marker was already active")
}
}
// Only the stacks we stopped are restarted: an already-stopped stack is not in RunningAppStacks,
// so unquiesce never starts it.
func TestRunOnce_OnlyRestartsWhatWeStopped(t *testing.T) {
// "db" was already stopped before quiesce → not in running → not restarted.
be := &fakeBackend{due: true, phases: []string{"done"}}
st := &fakeStacks{running: []string{"web"}} // only web is up
l := testLoop(t, be, st)
if err := l.runOnce(context.Background()); err != nil {
t.Fatal(err)
}
for _, s := range st.startedNames() {
if s == "db" {
t.Fatal("restarted a stack that was already stopped before quiesce")
}
}
if got := st.startedNames(); len(got) != 1 || got[0] != "web" {
t.Fatalf("expected only web restarted, got %v", got)
}
}
// Marker is written BEFORE stacks are stopped (crash-safety ordering): if stop is observed, the
// marker must already exist on disk.
func TestRunOnce_MarkerWrittenBeforeStop(t *testing.T) {
st := &fakeStacks{running: []string{"a"}}
l := testLoop(t, &fakeBackend{due: true, phases: []string{"done"}}, st)
// Wrap StopStack via a stacks decorator that checks the marker file exists at stop time.
markerSeen := false
dec := &stopObserver{inner: st, onStop: func() {
if _, err := os.Stat(l.markerPath); err == nil {
markerSeen = true
}
}}
l.stacks = dec
if err := l.runOnce(context.Background()); err != nil {
t.Fatal(err)
}
if !markerSeen {
t.Fatal("marker was not on disk when the first stack was stopped (crash-safety ordering violated)")
}
}
type stopObserver struct {
inner Stacks
onStop func()
}
func (s *stopObserver) RunningAppStacks() []string { return s.inner.RunningAppStacks() }
func (s *stopObserver) StopStack(n string) error { s.onStop(); return s.inner.StopStack(n) }
func (s *stopObserver) StartStack(n string) error { return s.inner.StartStack(n) }
type errString string
func (e errString) Error() string { return string(e) }
+22
View File
@@ -223,6 +223,28 @@ func (m *Manager) DeployedStackNames() []string {
return names
}
// RunningAppStacks returns the names of deployed, NON-protected stacks that currently have
// containers up (running/starting/unhealthy/restarting) — the set the quiesce loop (slice 8B)
// stops before an app-consistent backup and restarts after. Protected infra (traefik, cloudflared,
// felhom-controller) is excluded so the controller never stops its own tunnel/proxy or itself.
// Sorted for deterministic stop/start order.
func (m *Manager) RunningAppStacks() []string {
m.mu.RLock()
defer m.mu.RUnlock()
var names []string
for name, stack := range m.stacks {
if !stack.Deployed || stack.Protected || m.cfg.IsProtectedStack(name) {
continue
}
switch stack.State {
case StateRunning, StateStarting, StateUnhealthy, StateRestarting:
names = append(names, name)
}
}
sort.Strings(names)
return names
}
// ScanStacks discovers all compose stacks in the stacks directory.
func (m *Manager) ScanStacks() error {
m.mu.Lock()