diff --git a/CHANGELOG.md b/CHANGELOG.md index a0f4f62..0233929 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,37 @@ ## Changelog +### v0.36.0 — slice 8B: app-consistent backup quiesce loop (stack-stop) (2026-06-10) + +The in-guest controller half of slice 8B (doc 03 §6/§8). Pairs with `felhom-agent` v0.11.0. An +agent-initiated vzdump is crash-consistent only (an LXC has no fsfreeze); this makes app-consistency +the controller's job — it stops its app stacks around the backup so the captured state is +clean-shutdown-consistent. + +#### Added +- **`internal/quiesce`** — the background quiesce loop: poll the agent's `GET /backup/due` → when + due, **quiesce** (stop deployed, non-protected, running stacks) → `POST /backup` → poll + `GET /backup/status` to `done`/`failed` → **unquiesce** (restart exactly the stacks it stopped). + - **Crash-safety (the centerpiece — a stranded-down app is worse than a crash-consistent backup):** + a persisted **marker** (atomic, `0600`) written **before** stopping anything; **guaranteed + unquiesce** (a deferred closure restarts the stacks on a backup error, a status-poll error, the + max-quiesce bound, or context cancellation); a **max-quiesce-duration** hard bound that restarts + the app no matter what (the backup continues on the agent); **crash recovery** at startup + (`Recover()` restarts stacks left stopped by a mid-quiesce crash, then clears the marker); and the + marker as a **single-flight** guard. +- **`agentapi`**: `BackupDue` / `StartBackup` / `BackupStatus` methods + a `post` helper. +- **`stacks.Manager.RunningAppStacks()`** — deployed, non-protected, currently-up stacks (protected + infra — traefik/cloudflared/felhom-controller — is never stopped), sorted for deterministic order. +- **`config.QuiesceConfig`** (`quiesce`: enabled, poll_interval, status_poll_interval, + max_quiesce_duration). Wired in `main.go`: `Recover()` at startup, then the loop goroutine, gated on + the local API being configured (a provisioned guest) + quiesce enabled. + +#### Tests +- happy path (stop → backup → poll done → restart exactly those, in order; marker cleared); + **backup-start failure → stacks STILL restarted**; failed phase → restarted; **max-quiesce guard → + restarted at the bound**; **crash recovery → marker stacks restarted + cleared**; single-flight (no + second backup while a marker is active); **only the stacks we stopped are restarted** (an + already-stopped stack is never started); and **marker-written-before-stop** ordering. + ### v0.35.0 — slice 8A: bootstrap.json ingestion + pinned agent local-API client (2026-06-10) The in-guest controller half of slice 8A (doc 03 §6). Pairs with `felhom-agent` v0.10.0. No diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index f09a8b7..85b6963 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -31,6 +31,7 @@ import ( "gitea.dooplex.hu/admin/felhom-controller/internal/metrics" "gitea.dooplex.hu/admin/felhom-controller/internal/monitor" "gitea.dooplex.hu/admin/felhom-controller/internal/notify" + "gitea.dooplex.hu/admin/felhom-controller/internal/quiesce" "gitea.dooplex.hu/admin/felhom-controller/internal/recovery" "gitea.dooplex.hu/admin/felhom-controller/internal/report" "gitea.dooplex.hu/admin/felhom-controller/internal/scheduler" @@ -154,6 +155,11 @@ func main() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // --- Quiesce loop (slice 8B): app-consistent backup around the agent vzdump --- + // Runs only when the local API is configured (a provisioned guest) and quiesce is enabled. + // Recover FIRST (restart any stacks left stopped by a crash mid-quiesce), then start the loop. + startQuiesceLoop(ctx, cfg, stackMgr, logger) + // --- Start CPU collector --- cpuCollector := system.NewCPUCollector(5 * time.Second) cpuCollector.Start(ctx) @@ -1301,6 +1307,67 @@ func fileExists(path string) bool { return err == nil } +// quiesceBackend adapts *agentapi.Client to quiesce.Backend (bool/string, decoupled from the +// agentapi response structs). +type quiesceBackend struct{ c *agentapi.Client } + +func (b quiesceBackend) Due(ctx context.Context) (bool, error) { + r, err := b.c.BackupDue(ctx) + return r.Due, err +} +func (b quiesceBackend) StartBackup(ctx context.Context) (string, error) { + r, err := b.c.StartBackup(ctx) + return r.JobID, err +} +func (b quiesceBackend) BackupStatus(ctx context.Context) (string, error) { + r, err := b.c.BackupStatus(ctx) + return r.Phase, err +} + +// startQuiesceLoop wires + starts the slice-8B quiesce loop when the local API is configured and +// quiesce is enabled. It Recovers (restarts stacks left stopped by a mid-quiesce crash) before +// starting the loop goroutine. Non-fatal: any misconfig disables the loop with a log line. +func startQuiesceLoop(ctx context.Context, cfg *config.Config, stackMgr *stacks.Manager, logger *log.Logger) { + if cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "" { + return // not a provisioned guest — no agent to back up against + } + if !cfg.Quiesce.QuiesceEnabled() { + logger.Printf("[INFO] [quiesce] disabled by config") + return + } + client, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint) + if err != nil { + logger.Printf("[WARN] [quiesce] disabled (agent client init failed): %v", err) + return + } + poll := parseDurationOr(cfg.Quiesce.PollInterval, 5*time.Minute) + statusPoll := parseDurationOr(cfg.Quiesce.StatusPoll, 10*time.Second) + maxQuiesce := parseDurationOr(cfg.Quiesce.MaxQuiesce, 30*time.Minute) + loop := quiesce.New(quiesce.Options{ + Backend: quiesceBackend{c: client}, + Stacks: stackMgr, + MarkerPath: filepath.Join(cfg.Paths.DataDir, "quiesce-state.json"), + Poll: poll, + StatusPoll: statusPoll, + MaxQuiesce: maxQuiesce, + Logger: logger, + }) + loop.Recover() // crash-safety: restart any stacks stranded-down by a mid-quiesce crash + go loop.Run(ctx) +} + +// parseDurationOr parses a duration string, falling back to def on empty/invalid input. +func parseDurationOr(s string, def time.Duration) time.Duration { + if s == "" { + return def + } + d, err := time.ParseDuration(s) + if err != nil || d <= 0 { + return def + } + return d +} + // probeLocalAPI proves the controller↔agent local-API channel at startup and logs this guest's // mounts (slice 8A). Non-fatal: it only runs when a local-API endpoint is configured, and any // error is logged for diagnosis without affecting the controller's boot. The leaf SHA-256 from diff --git a/controller/internal/agentapi/client.go b/controller/internal/agentapi/client.go index 4ec2aff..bdc5154 100644 --- a/controller/internal/agentapi/client.go +++ b/controller/internal/agentapi/client.go @@ -7,6 +7,7 @@ package agentapi import ( + "bytes" "context" "crypto/sha256" "crypto/tls" @@ -102,6 +103,77 @@ func (c *Client) Storage(ctx context.Context) (StorageResponse, error) { return out, nil } +// ---- slice 8B: app-consistent backup (quiesce loop) ------------------------------------- + +// DueResponse mirrors the agent's GET /backup/due payload. +type DueResponse struct { + VMID int `json:"vmid"` + Due bool `json:"due"` + Reason string `json:"reason"` +} + +// BackupResponse mirrors the agent's POST /backup payload. +type BackupResponse struct { + VMID int `json:"vmid"` + JobID string `json:"job_id"` + Phase string `json:"phase"` +} + +// StatusResponse mirrors the agent's GET /backup/status payload. +type StatusResponse struct { + VMID int `json:"vmid"` + Phase string `json:"phase"` // idle | running | done | failed + JobID string `json:"job_id"` + Error string `json:"error"` +} + +// Backup status phases (mirror the agent's vocabulary). +const ( + PhaseIdle = "idle" + PhaseRunning = "running" + PhaseDone = "done" + PhaseFailed = "failed" +) + +// BackupDue reports whether a policy-scheduled backup is due for this guest (the quiesce trigger). +func (c *Client) BackupDue(ctx context.Context) (DueResponse, error) { + var out DueResponse + body, err := c.get(ctx, "/backup/due") + if err != nil { + return out, err + } + if err := json.Unmarshal(body, &out); err != nil { + return out, fmt.Errorf("agentapi: decode /backup/due: %w", err) + } + return out, nil +} + +// StartBackup enqueues a backup of this guest (the agent vzdump) and returns the job to poll. +func (c *Client) StartBackup(ctx context.Context) (BackupResponse, error) { + var out BackupResponse + body, err := c.post(ctx, "/backup", struct{}{}) + if err != nil { + return out, err + } + if err := json.Unmarshal(body, &out); err != nil { + return out, fmt.Errorf("agentapi: decode POST /backup: %w", err) + } + return out, nil +} + +// BackupStatus reports the current/last backup job phase for this guest. +func (c *Client) BackupStatus(ctx context.Context) (StatusResponse, error) { + var out StatusResponse + body, err := c.get(ctx, "/backup/status") + if err != nil { + return out, err + } + if err := json.Unmarshal(body, &out); err != nil { + return out, fmt.Errorf("agentapi: decode /backup/status: %w", err) + } + return out, nil +} + // get issues an authenticated GET and unwraps the {ok,data,error} envelope. func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil) @@ -128,6 +200,38 @@ func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error) return env.Data, nil } +// post issues an authenticated JSON POST and unwraps the {ok,data,error} envelope. The agent +// returns 200 or 202 for accepted requests. +func (c *Client) post(ctx context.Context, path string, body any) (json.RawMessage, error) { + buf, err := json.Marshal(body) + if err != nil { + return nil, err + } + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf)) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+c.token) + req.Header.Set("Content-Type", "application/json") + resp, err := c.hc.Do(req) + if err != nil { + return nil, fmt.Errorf("agentapi: POST %s: %w", path, err) + } + defer resp.Body.Close() + raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted { + return nil, fmt.Errorf("agentapi: POST %s: HTTP %d", path, resp.StatusCode) + } + var env apiResponse + if err := json.Unmarshal(raw, &env); err != nil { + return nil, fmt.Errorf("agentapi: POST %s: bad envelope: %w", path, err) + } + if !env.OK { + return nil, fmt.Errorf("agentapi: POST %s: %s", path, env.Error) + } + return env.Data, nil +} + // normalizeFingerprint lowercases and strips ':'/' ' separators, requiring a 64-hex SHA-256. func normalizeFingerprint(fp string) (string, error) { s := strings.ToLower(strings.NewReplacer(":", "", " ", "", "\t", "").Replace(strings.TrimSpace(fp))) diff --git a/controller/internal/config/config.go b/controller/internal/config/config.go index be92809..91e1bfb 100644 --- a/controller/internal/config/config.go +++ b/controller/internal/config/config.go @@ -29,6 +29,7 @@ type Config struct { Assets AssetsConfig `yaml:"assets"` System SystemConfig `yaml:"system"` LocalAPI LocalAPIConfig `yaml:"local_api"` + Quiesce QuiesceConfig `yaml:"quiesce"` } // LocalAPIConfig is the in-guest controller's handle on the host agent's per-guest local API @@ -40,6 +41,22 @@ type LocalAPIConfig struct { Token string `yaml:"token"` // per-guest bearer; SECRET } +// QuiesceConfig tunes the slice-8B app-consistent backup loop (doc 03 §8): poll the agent's +// /backup/due, and when due stop the app stacks around the agent vzdump for a clean-shutdown +// backup. Runs only when the local API is configured (a provisioned guest). MaxQuiesce bounds the +// app downtime — the controller unquiesces no matter what once it elapses. +type QuiesceConfig struct { + Enabled *bool `yaml:"enabled"` // nil/true → on when local API configured; false → off + PollInterval string `yaml:"poll_interval"` // /backup/due check cadence (default "5m") + StatusPoll string `yaml:"status_poll_interval"` // /backup/status poll while quiesced (default "10s") + MaxQuiesce string `yaml:"max_quiesce_duration"` // hard downtime bound (default "30m") +} + +// QuiesceEnabled reports whether the quiesce loop should run (default on unless explicitly false). +func (q QuiesceConfig) QuiesceEnabled() bool { + return q.Enabled == nil || *q.Enabled +} + type SystemConfig struct { ReservedMemoryMB int `yaml:"reserved_memory_mb"` } @@ -278,6 +295,9 @@ func applyDefaults(cfg *Config) { d(&cfg.Assets.SourceURL, "https://felhom.eu") d(&cfg.Assets.SyncSchedule, "05:00") di(&cfg.System.ReservedMemoryMB, 384) + d(&cfg.Quiesce.PollInterval, "5m") + d(&cfg.Quiesce.StatusPoll, "10s") + d(&cfg.Quiesce.MaxQuiesce, "30m") } func applyEnvOverrides(cfg *Config) { diff --git a/controller/internal/quiesce/quiesce.go b/controller/internal/quiesce/quiesce.go new file mode 100644 index 0000000..a212d1d --- /dev/null +++ b/controller/internal/quiesce/quiesce.go @@ -0,0 +1,266 @@ +// Package quiesce implements the slice-8B app-consistent backup loop (doc 03 §6/§8): the +// in-guest controller polls the host agent's GET /backup/due, and when due it QUIESCES (stops its +// app stacks) → POST /backup → polls GET /backup/status to completion → UNQUIESCES (restarts +// exactly the stacks it stopped). An agent-initiated vzdump is crash-consistent only (an LXC has +// no fsfreeze); stopping the stacks first makes the captured state clean-shutdown-consistent. +// +// The correctness centerpiece is crash-safety: a stranded-down app is worse than a crash-consistent +// backup. So: a persisted marker is written BEFORE stopping anything; unquiesce is guaranteed (it +// runs even when the backup errors or times out); a max-quiesce bound restarts the app no matter +// what; and on controller startup Recover() restarts any stacks left stopped by a mid-quiesce crash. +package quiesce + +import ( + "context" + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" + "time" +) + +// Backend is the agent local-API surface the loop needs (satisfied by an adapter over +// *agentapi.Client). Kept minimal (bool/string) so the loop is testable with plain fakes. +type Backend interface { + Due(ctx context.Context) (bool, error) + StartBackup(ctx context.Context) (jobID string, err error) + BackupStatus(ctx context.Context) (phase string, err error) +} + +// Stacks is the stack-control surface (satisfied by *stacks.Manager). RunningAppStacks must return +// only deployed, non-protected, currently-up stacks (so unquiesce restarts exactly those). +type Stacks interface { + RunningAppStacks() []string + StopStack(name string) error + StartStack(name string) error +} + +// Backup status phases (mirror the agent's vocabulary). +const ( + phaseDone = "done" + phaseFailed = "failed" +) + +// Marker is the persisted quiesce state — the crash-safety + single-flight record. It is written +// (atomically, 0600) BEFORE any stack is stopped, so a controller crash mid-quiesce leaves a +// durable "these stacks were stopped, restart them" note that Recover honors at next startup. +type Marker struct { + Active bool `json:"active"` + StartedAt time.Time `json:"started_at"` + StoppedStacks []string `json:"stopped_stacks"` + JobID string `json:"job_id"` +} + +// Options configures a Loop. +type Options struct { + Backend Backend + Stacks Stacks + MarkerPath string // persisted marker (e.g. /quiesce-state.json) + Poll time.Duration // how often to check /backup/due + StatusPoll time.Duration // how often to poll /backup/status while quiesced + MaxQuiesce time.Duration // hard bound on app downtime (unquiesce no matter what) + Logger *log.Logger +} + +// Loop is the quiesce background loop. +type Loop struct { + backend Backend + stacks Stacks + markerPath string + poll time.Duration + statusPoll time.Duration + maxQuiesce time.Duration + logger *log.Logger + now func() time.Time +} + +// New builds a Loop with sane defaults for any unset duration. +func New(o Options) *Loop { + if o.Poll <= 0 { + o.Poll = 5 * time.Minute + } + if o.StatusPoll <= 0 { + o.StatusPoll = 10 * time.Second + } + if o.MaxQuiesce <= 0 { + o.MaxQuiesce = 30 * time.Minute + } + if o.Logger == nil { + o.Logger = log.Default() + } + return &Loop{ + backend: o.Backend, stacks: o.Stacks, markerPath: o.MarkerPath, + poll: o.Poll, statusPoll: o.StatusPoll, maxQuiesce: o.MaxQuiesce, + logger: o.Logger, now: time.Now, + } +} + +// Recover restarts any stacks left stopped by a controller crash mid-quiesce, then clears the +// marker. Call ONCE at startup, before Run. Idempotent — StartStack on an already-running stack is +// tolerated; an absent/inactive marker is a no-op. +func (l *Loop) Recover() { + m, ok := l.readMarker() + if !ok || !m.Active { + return + } + l.logger.Printf("[WARN] [quiesce] crash recovery: a quiesce was in progress (job %q, %d stack(s) stopped) — restarting them", + m.JobID, len(m.StoppedStacks)) + l.restartAll(m.StoppedStacks) + if err := l.clearMarker(); err != nil { + l.logger.Printf("[ERROR] [quiesce] crash recovery: clear marker: %v", err) + } +} + +// Run polls for a due backup and runs the quiesce cycle, until ctx is cancelled. +func (l *Loop) Run(ctx context.Context) { + l.logger.Printf("[INFO] [quiesce] loop started (poll %s, max-quiesce %s)", l.poll, l.maxQuiesce) + ticker := time.NewTicker(l.poll) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + l.logger.Printf("[INFO] [quiesce] loop stopping") + return + case <-ticker.C: + if err := l.runOnce(ctx); err != nil && ctx.Err() == nil { + l.logger.Printf("[ERROR] [quiesce] cycle error: %v", err) + } + } + } +} + +// runOnce performs one due-check → (if due) quiesce → backup → poll → unquiesce cycle. Unquiesce +// is guaranteed via the deferred closure: a backup error, a status-poll error, the max-quiesce +// bound, or context cancellation all still restart the stacks and clear the marker. +func (l *Loop) runOnce(ctx context.Context) error { + // Defensive single-flight: never quiesce on top of an active marker (Recover clears one left + // by a crash; within a process the single loop goroutine already serializes). + if m, ok := l.readMarker(); ok && m.Active { + l.logger.Printf("[WARN] [quiesce] a marker is already active — skipping this cycle") + return nil + } + + due, err := l.backend.Due(ctx) + if err != nil { + return fmt.Errorf("check due: %w", err) + } + if !due { + return nil + } + + running := l.stacks.RunningAppStacks() + marker := Marker{Active: true, StartedAt: l.now(), StoppedStacks: running} + if err := l.writeMarker(marker); err != nil { + return fmt.Errorf("write quiesce marker (refusing to stop stacks unprotected): %w", err) + } + + // GUARANTEED unquiesce + marker clear — runs on every exit path below. + unquiesced := false + unquiesce := func(reason string) { + if unquiesced { + return + } + unquiesced = true + l.logger.Printf("[INFO] [quiesce] unquiescing (%s): restarting %d stack(s)", reason, len(running)) + l.restartAll(running) + if err := l.clearMarker(); err != nil { + l.logger.Printf("[ERROR] [quiesce] clear marker: %v", err) + } + } + defer unquiesce("deferred") + + l.logger.Printf("[INFO] [quiesce] backup due — quiescing %d stack(s): %v", len(running), running) + for _, s := range running { + if err := l.stacks.StopStack(s); err != nil { + l.logger.Printf("[ERROR] [quiesce] stop %s: %v (continuing)", s, err) + } + } + + jobID, err := l.backend.StartBackup(ctx) + if err != nil { + unquiesce("backup start failed") + return fmt.Errorf("start backup: %w", err) + } + marker.JobID = jobID + _ = l.writeMarker(marker) // best-effort: record the job id for diagnosis + l.logger.Printf("[INFO] [quiesce] backup job %s started — polling to completion", jobID) + + deadline := l.now().Add(l.maxQuiesce) + for { + if !l.now().Before(deadline) { + l.logger.Printf("[WARN] [quiesce] max-quiesce-duration (%s) exceeded for job %s — unquiescing while the backup continues on the agent", + l.maxQuiesce, jobID) + unquiesce("max-quiesce guard") + return nil + } + phase, err := l.backend.BackupStatus(ctx) + if err != nil { + unquiesce("status poll failed") + return fmt.Errorf("poll backup status: %w", err) + } + switch phase { + case phaseDone: + l.logger.Printf("[INFO] [quiesce] backup job %s done", jobID) + unquiesce("backup done") + return nil + case phaseFailed: + l.logger.Printf("[WARN] [quiesce] backup job %s failed", jobID) + unquiesce("backup failed") + return nil + } + select { + case <-ctx.Done(): + unquiesce("controller shutting down") + return ctx.Err() + case <-time.After(l.statusPoll): + } + } +} + +func (l *Loop) restartAll(stacks []string) { + for _, s := range stacks { + if err := l.stacks.StartStack(s); err != nil { + l.logger.Printf("[ERROR] [quiesce] restart %s: %v", s, err) + } + } +} + +// ---- marker persistence (atomic, 0600) -------------------------------------------------- + +func (l *Loop) writeMarker(m Marker) error { + m.Active = true + data, err := json.MarshalIndent(m, "", " ") + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(l.markerPath), 0o755); err != nil { + return err + } + tmp := l.markerPath + ".tmp" + if err := os.WriteFile(tmp, data, 0o600); err != nil { + os.Remove(tmp) + return err + } + return os.Rename(tmp, l.markerPath) +} + +func (l *Loop) readMarker() (Marker, bool) { + data, err := os.ReadFile(l.markerPath) + if err != nil { + return Marker{}, false + } + var m Marker + if json.Unmarshal(data, &m) != nil { + return Marker{}, false + } + return m, true +} + +func (l *Loop) clearMarker() error { + err := os.Remove(l.markerPath) + if os.IsNotExist(err) { + return nil + } + return err +} diff --git a/controller/internal/quiesce/quiesce_test.go b/controller/internal/quiesce/quiesce_test.go new file mode 100644 index 0000000..478a8ab --- /dev/null +++ b/controller/internal/quiesce/quiesce_test.go @@ -0,0 +1,302 @@ +package quiesce + +import ( + "context" + "io" + "log" + "os" + "path/filepath" + "sync" + "testing" + "time" +) + +// fakeStacks records stop/start calls in order. +type fakeStacks struct { + mu sync.Mutex + running []string + stopped []string + started []string + stopErr map[string]error +} + +func (f *fakeStacks) RunningAppStacks() []string { + f.mu.Lock() + defer f.mu.Unlock() + return append([]string(nil), f.running...) +} +func (f *fakeStacks) StopStack(name string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.stopped = append(f.stopped, name) + if f.stopErr != nil { + return f.stopErr[name] + } + return nil +} +func (f *fakeStacks) StartStack(name string) error { + f.mu.Lock() + defer f.mu.Unlock() + f.started = append(f.started, name) + return nil +} +func (f *fakeStacks) startedNames() []string { + f.mu.Lock() + defer f.mu.Unlock() + return append([]string(nil), f.started...) +} +func (f *fakeStacks) stoppedNames() []string { + f.mu.Lock() + defer f.mu.Unlock() + return append([]string(nil), f.stopped...) +} + +// fakeBackend drives the agent-side responses. +type fakeBackend struct { + due bool + dueErr error + startErr error + jobID string + phases []string // returned in sequence by BackupStatus; last value repeats + statusErr error + startCalls int + statusCalls int + mu sync.Mutex +} + +func (b *fakeBackend) Due(context.Context) (bool, error) { return b.due, b.dueErr } +func (b *fakeBackend) StartBackup(context.Context) (string, error) { + b.mu.Lock() + b.startCalls++ + b.mu.Unlock() + if b.startErr != nil { + return "", b.startErr + } + if b.jobID == "" { + b.jobID = "job-1" + } + return b.jobID, nil +} +func (b *fakeBackend) BackupStatus(context.Context) (string, error) { + if b.statusErr != nil { + return "", b.statusErr + } + b.mu.Lock() + defer b.mu.Unlock() + i := b.statusCalls + b.statusCalls++ + if i >= len(b.phases) { + if len(b.phases) == 0 { + return PhaseRunning, nil + } + return b.phases[len(b.phases)-1], nil + } + return b.phases[i], nil +} + +const PhaseRunning = "running" // local alias for readability in tests + +func testLoop(t *testing.T, be Backend, st Stacks) *Loop { + t.Helper() + l := New(Options{ + Backend: be, + Stacks: st, + MarkerPath: filepath.Join(t.TempDir(), "quiesce-state.json"), + Poll: time.Hour, + StatusPoll: time.Millisecond, + MaxQuiesce: 5 * time.Second, + Logger: log.New(io.Discard, "", 0), + }) + return l +} + +// Happy path: due → stop running stacks → start backup → poll to done → restart exactly those → clear marker. +func TestRunOnce_HappyPath(t *testing.T) { + be := &fakeBackend{due: true, phases: []string{"running", "running", "done"}} + st := &fakeStacks{running: []string{"nextcloud", "vaultwarden"}} + l := testLoop(t, be, st) + + if err := l.runOnce(context.Background()); err != nil { + t.Fatalf("runOnce: %v", err) + } + if got := st.stoppedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" { + t.Fatalf("stopped wrong/order: %v", got) + } + if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "vaultwarden" { + t.Fatalf("started wrong/order: %v", got) + } + if be.startCalls != 1 { + t.Fatalf("expected 1 backup, got %d", be.startCalls) + } + if _, ok := l.readMarker(); ok { + t.Fatal("marker not cleared after a successful cycle") + } +} + +// Not due → nothing happens. +func TestRunOnce_NotDue(t *testing.T) { + be := &fakeBackend{due: false} + st := &fakeStacks{running: []string{"a"}} + l := testLoop(t, be, st) + if err := l.runOnce(context.Background()); err != nil { + t.Fatal(err) + } + if len(st.stoppedNames()) != 0 || be.startCalls != 0 { + t.Fatal("acted while not due") + } +} + +// Backup START fails → stacks STILL restarted (guaranteed unquiesce). +func TestRunOnce_BackupStartFails_StillRestarts(t *testing.T) { + be := &fakeBackend{due: true, startErr: errString("boom")} + st := &fakeStacks{running: []string{"a", "b"}} + l := testLoop(t, be, st) + + _ = l.runOnce(context.Background()) + if got := st.startedNames(); len(got) != 2 { + t.Fatalf("stacks not restarted after a backup-start failure: %v", got) + } + if _, ok := l.readMarker(); ok { + t.Fatal("marker not cleared after a failed backup") + } +} + +// Backup reports failed → stacks restarted. +func TestRunOnce_BackupFailedPhase_Restarts(t *testing.T) { + be := &fakeBackend{due: true, phases: []string{"running", "failed"}} + st := &fakeStacks{running: []string{"a"}} + l := testLoop(t, be, st) + _ = l.runOnce(context.Background()) + if got := st.startedNames(); len(got) != 1 || got[0] != "a" { + t.Fatalf("not restarted after failed phase: %v", got) + } +} + +// Max-quiesce guard: status never reaches done → stacks restarted at the bound. +func TestRunOnce_MaxQuiesceGuard(t *testing.T) { + be := &fakeBackend{due: true, phases: []string{"running"}} // never done + st := &fakeStacks{running: []string{"a", "b"}} + l := testLoop(t, be, st) + // shrink the bound + use a controllable clock so the guard fires fast + base := time.Now() + steps := 0 + l.now = func() time.Time { + steps++ + return base.Add(time.Duration(steps) * time.Minute) // each call advances 1m + } + l.maxQuiesce = 2 * time.Minute + + done := make(chan error, 1) + go func() { done <- l.runOnce(context.Background()) }() + select { + case err := <-done: + if err != nil { + t.Fatalf("runOnce returned error: %v", err) + } + case <-time.After(2 * time.Second): + t.Fatal("runOnce did not return — max-quiesce guard did not fire") + } + if got := st.startedNames(); len(got) != 2 { + t.Fatalf("stacks not restarted at the max-quiesce bound: %v", got) + } + if _, ok := l.readMarker(); ok { + t.Fatal("marker not cleared after the guard fired") + } +} + +// Crash recovery: a marker present at startup → recorded stacks restarted, marker cleared. +func TestRecover_RestartsFromMarker(t *testing.T) { + st := &fakeStacks{} + l := testLoop(t, &fakeBackend{}, st) + // simulate a crash mid-quiesce: an active marker with stopped stacks + if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"nextcloud", "immich"}, JobID: "job-x"}); err != nil { + t.Fatal(err) + } + l.Recover() + if got := st.startedNames(); len(got) != 2 || got[0] != "nextcloud" || got[1] != "immich" { + t.Fatalf("recovery did not restart the recorded stacks: %v", got) + } + if _, ok := l.readMarker(); ok { + t.Fatal("recovery did not clear the marker") + } +} + +// Recover with no marker is a no-op. +func TestRecover_NoMarker(t *testing.T) { + st := &fakeStacks{} + l := testLoop(t, &fakeBackend{}, st) + l.Recover() + if len(st.startedNames()) != 0 { + t.Fatal("recovery restarted stacks with no marker present") + } +} + +// Single-flight: a cycle that begins with an active marker present is a no-op (no second backup). +func TestRunOnce_SingleFlight(t *testing.T) { + be := &fakeBackend{due: true, phases: []string{"done"}} + st := &fakeStacks{running: []string{"a"}} + l := testLoop(t, be, st) + if err := l.writeMarker(Marker{Active: true, StoppedStacks: []string{"a"}}); err != nil { + t.Fatal(err) + } + if err := l.runOnce(context.Background()); err != nil { + t.Fatal(err) + } + if be.startCalls != 0 { + t.Fatal("started a backup while a marker was already active") + } +} + +// Only the stacks we stopped are restarted: an already-stopped stack is not in RunningAppStacks, +// so unquiesce never starts it. +func TestRunOnce_OnlyRestartsWhatWeStopped(t *testing.T) { + // "db" was already stopped before quiesce → not in running → not restarted. + be := &fakeBackend{due: true, phases: []string{"done"}} + st := &fakeStacks{running: []string{"web"}} // only web is up + l := testLoop(t, be, st) + if err := l.runOnce(context.Background()); err != nil { + t.Fatal(err) + } + for _, s := range st.startedNames() { + if s == "db" { + t.Fatal("restarted a stack that was already stopped before quiesce") + } + } + if got := st.startedNames(); len(got) != 1 || got[0] != "web" { + t.Fatalf("expected only web restarted, got %v", got) + } +} + +// Marker is written BEFORE stacks are stopped (crash-safety ordering): if stop is observed, the +// marker must already exist on disk. +func TestRunOnce_MarkerWrittenBeforeStop(t *testing.T) { + st := &fakeStacks{running: []string{"a"}} + l := testLoop(t, &fakeBackend{due: true, phases: []string{"done"}}, st) + // Wrap StopStack via a stacks decorator that checks the marker file exists at stop time. + markerSeen := false + dec := &stopObserver{inner: st, onStop: func() { + if _, err := os.Stat(l.markerPath); err == nil { + markerSeen = true + } + }} + l.stacks = dec + if err := l.runOnce(context.Background()); err != nil { + t.Fatal(err) + } + if !markerSeen { + t.Fatal("marker was not on disk when the first stack was stopped (crash-safety ordering violated)") + } +} + +type stopObserver struct { + inner Stacks + onStop func() +} + +func (s *stopObserver) RunningAppStacks() []string { return s.inner.RunningAppStacks() } +func (s *stopObserver) StopStack(n string) error { s.onStop(); return s.inner.StopStack(n) } +func (s *stopObserver) StartStack(n string) error { return s.inner.StartStack(n) } + +type errString string + +func (e errString) Error() string { return string(e) } diff --git a/controller/internal/stacks/manager.go b/controller/internal/stacks/manager.go index 71fad09..1b7fd8e 100644 --- a/controller/internal/stacks/manager.go +++ b/controller/internal/stacks/manager.go @@ -223,6 +223,28 @@ func (m *Manager) DeployedStackNames() []string { return names } +// RunningAppStacks returns the names of deployed, NON-protected stacks that currently have +// containers up (running/starting/unhealthy/restarting) — the set the quiesce loop (slice 8B) +// stops before an app-consistent backup and restarts after. Protected infra (traefik, cloudflared, +// felhom-controller) is excluded so the controller never stops its own tunnel/proxy or itself. +// Sorted for deterministic stop/start order. +func (m *Manager) RunningAppStacks() []string { + m.mu.RLock() + defer m.mu.RUnlock() + var names []string + for name, stack := range m.stacks { + if !stack.Deployed || stack.Protected || m.cfg.IsProtectedStack(name) { + continue + } + switch stack.State { + case StateRunning, StateStarting, StateUnhealthy, StateRestarting: + names = append(names, name) + } + } + sort.Strings(names) + return names +} + // ScanStacks discovers all compose stacks in the stacks directory. func (m *Manager) ScanStacks() error { m.mu.Lock()