v0.41.0: first-boot base-infra bring-up + self-heal (+ Section-G mount fix)
New internal/infra package renders traefik/cloudflared/filebrowser from config (pinned images, single source of truth; web filebrowser path delegates here). stacks.EnsureBaseStack deploys the traefik-public network + the three stacks, single-flight + idempotent + non-fatal; wired to first boot and every health tick. monitor.EffectiveProtected drops cloudflared when no tunnel token. Section-G fix lives in felhom-agent build-golden.sh (same-path stacks bind). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
package stacks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
|
||||
)
|
||||
|
||||
const traefikNetwork = "traefik-public"
|
||||
|
||||
// EnsureBaseStack renders and deploys the base-infrastructure stacks (traefik, cloudflared,
|
||||
// filebrowser) the controller needs for routing + external access. It is:
|
||||
// - single-flight: fired both at first boot and on every health tick; a TryLock ensures two
|
||||
// invocations never race on the same stack dir / run concurrent `compose up` on the same stack.
|
||||
// - idempotent: each stack is skipped when its container is already running, so the healthy-state
|
||||
// re-run is a cheap 3× `docker inspect` no-op.
|
||||
// - non-fatal by contract: returns a joined error for the caller to LOG; it must never crash the
|
||||
// controller. cloudflared is only deployed when a tunnel token is configured (LAN-only nodes
|
||||
// legitimately run without it — see the dynamic protected set in monitor.EffectiveProtected).
|
||||
//
|
||||
// Deploy order is load-bearing: traefik-public network → traefik → cloudflared → filebrowser (the
|
||||
// composes declare the network `external: true`, so it must exist first).
|
||||
func (m *Manager) EnsureBaseStack() error {
|
||||
if !m.infraMu.TryLock() {
|
||||
m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
|
||||
return nil
|
||||
}
|
||||
defer m.infraMu.Unlock()
|
||||
|
||||
if err := m.ensureTraefikNetwork(); err != nil {
|
||||
return fmt.Errorf("base-infra: %w", err) // without the network, every stack `up` fails
|
||||
}
|
||||
|
||||
base := m.cfg.Paths.StacksDir
|
||||
var errs []string
|
||||
|
||||
if err := m.ensureTraefik(filepath.Join(base, "traefik")); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("traefik: %v", err))
|
||||
}
|
||||
|
||||
if m.cfg.Infrastructure.CFTunnelToken != "" {
|
||||
if err := m.ensureCloudflared(filepath.Join(base, "cloudflared")); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("cloudflared: %v", err))
|
||||
}
|
||||
} else {
|
||||
m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
|
||||
}
|
||||
|
||||
if err := m.ensureFileBrowser(filepath.Join(base, "filebrowser")); err != nil {
|
||||
errs = append(errs, fmt.Sprintf("filebrowser: %v", err))
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return fmt.Errorf("base-infra bring-up: %s", strings.Join(errs, "; "))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) ensureTraefik(dir string) error {
|
||||
if containerRunning("traefik") {
|
||||
return nil
|
||||
}
|
||||
m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)
|
||||
if err := os.MkdirAll(filepath.Join(dir, "dynamic"), 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir dynamic: %w", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Join(dir, "certs"), 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir certs: %w", err)
|
||||
}
|
||||
// acme.json must exist as a 0600 file before traefik starts (it writes issued certs into it).
|
||||
acme := filepath.Join(dir, "acme.json")
|
||||
if _, err := os.Stat(acme); os.IsNotExist(err) {
|
||||
if err := os.WriteFile(acme, []byte{}, 0o600); err != nil {
|
||||
return fmt.Errorf("create acme.json: %w", err)
|
||||
}
|
||||
}
|
||||
if err := os.Chmod(acme, 0o600); err != nil {
|
||||
return fmt.Errorf("chmod acme.json: %w", err)
|
||||
}
|
||||
files, err := infra.RenderTraefik(infra.TraefikData{
|
||||
ACMEEmail: m.cfg.Customer.Email,
|
||||
CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeInfraFiles(dir, files); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.composeUp(dir)
|
||||
}
|
||||
|
||||
func (m *Manager) ensureCloudflared(dir string) error {
|
||||
if containerRunning("cloudflared") {
|
||||
return nil
|
||||
}
|
||||
m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)
|
||||
files, err := infra.RenderCloudflared(infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := writeInfraFiles(dir, files); err != nil {
|
||||
return err
|
||||
}
|
||||
return m.composeUp(dir)
|
||||
}
|
||||
|
||||
func (m *Manager) ensureFileBrowser(dir string) error {
|
||||
if containerRunning("filebrowser") {
|
||||
return nil
|
||||
}
|
||||
composePath := filepath.Join(dir, "docker-compose.yml")
|
||||
if _, err := os.Stat(composePath); err == nil {
|
||||
// Already provisioned but not running — bring it up WITHOUT regenerating, so the storage
|
||||
// mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`.
|
||||
m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
|
||||
return m.composeUp(dir)
|
||||
}
|
||||
m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir: %w", err)
|
||||
}
|
||||
// Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first
|
||||
// storage-path change and owns all later regeneration).
|
||||
compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
|
||||
config := infra.RenderFileBrowserConfig(nil)
|
||||
if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil {
|
||||
return fmt.Errorf("write docker-compose.yml: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil {
|
||||
return fmt.Errorf("write config.yaml: %w", err)
|
||||
}
|
||||
return m.composeUp(dir)
|
||||
}
|
||||
|
||||
// ensureTraefikNetwork creates the external traefik-public docker network if absent (idempotent;
|
||||
// tolerates a create/inspect race). Uses the docker CLI directly — it's a network op, not compose.
|
||||
func (m *Manager) ensureTraefikNetwork() error {
|
||||
if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil {
|
||||
return nil
|
||||
}
|
||||
m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork)
|
||||
out, err := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput()
|
||||
if err != nil {
|
||||
// Tolerate a race where another actor created it between our inspect and create.
|
||||
if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(out)), err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// composeUp runs `docker compose up -d` in dir (DOMAIN injected by composeExecWithEnv).
|
||||
func (m *Manager) composeUp(dir string) error {
|
||||
out, err := m.composeExecWithEnv(dir, nil, "up", "-d")
|
||||
if err != nil {
|
||||
return fmt.Errorf("compose up: %s: %w", truncateStr(strings.TrimSpace(out), 300), err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// writeInfraFiles writes each rendered file at its required mode (enforced via Chmod so an existing
|
||||
// file — e.g. a re-rendered .env — keeps 0600 even though WriteFile only honors mode on create).
|
||||
func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir: %w", err)
|
||||
}
|
||||
for name, spec := range files {
|
||||
p := filepath.Join(dir, name)
|
||||
if err := os.WriteFile(p, []byte(spec.Content), os.FileMode(spec.Mode)); err != nil {
|
||||
return fmt.Errorf("write %s: %w", name, err)
|
||||
}
|
||||
if err := os.Chmod(p, os.FileMode(spec.Mode)); err != nil {
|
||||
return fmt.Errorf("chmod %s: %w", name, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// containerRunning reports whether `docker inspect` says the named container is running. It
// queries the docker CLI directly, so it does not depend on any stack directory existing. Any
// inspect failure (unknown name, docker unavailable, ...) is reported as "not running".
func containerRunning(name string) bool {
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	raw, err := inspect.Output()
	return err == nil && strings.TrimSpace(string(raw)) == "true"
}
|
||||
@@ -0,0 +1,36 @@
|
||||
package stacks
|
||||
|
||||
import (
|
||||
"io"
|
||||
"log"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
)
|
||||
|
||||
// TestEnsureBaseStackSingleFlight proves the single-flight guard short-circuits: when infraMu is
|
||||
// already held, EnsureBaseStack returns immediately (nil) WITHOUT touching docker. We hold the lock
|
||||
// in this goroutine and call EnsureBaseStack in the same goroutine — Go mutexes are non-reentrant, so
|
||||
// TryLock fails and the function returns before any docker network/inspect call. If the guard were
|
||||
// missing, the call would shell out to docker (unavailable in unit tests) and not return nil cleanly.
|
||||
func TestEnsureBaseStackSingleFlight(t *testing.T) {
|
||||
m := &Manager{
|
||||
cfg: &config.Config{},
|
||||
logger: log.New(io.Discard, "", 0),
|
||||
}
|
||||
m.infraMu.Lock()
|
||||
defer m.infraMu.Unlock()
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() { done <- m.EnsureBaseStack() }()
|
||||
|
||||
select {
|
||||
case err := <-done:
|
||||
if err != nil {
|
||||
t.Fatalf("expected nil (single-flight no-op) while lock held, got %v", err)
|
||||
}
|
||||
case <-time.After(3 * time.Second):
|
||||
t.Fatal("EnsureBaseStack did not short-circuit while infraMu was held (single-flight guard missing?)")
|
||||
}
|
||||
}
|
||||
@@ -83,7 +83,8 @@ type Manager struct {
|
||||
composeCmd string
|
||||
stacks map[string]*Stack
|
||||
mu sync.RWMutex
|
||||
encKey []byte // AES-256 key for encrypting sensitive values in app.yaml
|
||||
encKey []byte // AES-256 key for encrypting sensitive values in app.yaml
|
||||
infraMu sync.Mutex // single-flight guard for EnsureBaseStack (base-infra bring-up/self-heal)
|
||||
}
|
||||
|
||||
// NewManager creates a new stack manager.
|
||||
|
||||
Reference in New Issue
Block a user