v0.41.0: first-boot base-infra bring-up + self-heal (+ Section-G mount fix)

New internal/infra package renders traefik/cloudflared/filebrowser from config
(pinned images, single source of truth; web filebrowser path delegates here).
stacks.EnsureBaseStack deploys the traefik-public network + the three stacks,
single-flight + idempotent + non-fatal; wired to first boot and every health
tick. monitor.EffectiveProtected drops cloudflared when no tunnel token.
Section-G fix lives in felhom-agent build-golden.sh (same-path stacks bind).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-11 14:56:42 +02:00
parent ba0e1eb04a
commit abbd9488c6
13 changed files with 873 additions and 111 deletions
+193
View File
@@ -0,0 +1,193 @@
package stacks
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
)
// traefikNetwork is the name of the docker network that ensureTraefikNetwork inspects and, when
// absent, creates before any base stack is deployed.
const traefikNetwork = "traefik-public"
// EnsureBaseStack brings up the base-infrastructure stacks (traefik, cloudflared, filebrowser)
// under m.cfg.Paths.StacksDir. Its contract:
//
//   - Single-flight: it is meant to be invoked at first boot and again on every health tick, so
//     entry is guarded by infraMu.TryLock. A caller that loses the lock logs and returns nil
//     immediately instead of queueing behind the invocation in progress.
//   - Idempotent: every per-stack helper returns early when its container is already running, so
//     a healthy node pays only for one network inspect plus one container inspect per stack.
//   - Non-fatal: per-stack failures are collected and returned as one combined error that the
//     caller is expected to log; a failing stack does not stop the stacks after it.
//
// The traefik-public network is ensured first and is the only hard prerequisite: if it cannot be
// created the function returns right away, because the stack composes declare that network as
// external. After that the order is traefik → cloudflared → filebrowser. cloudflared is attempted
// only when a tunnel token is configured; LAN-only nodes run without it.
func (m *Manager) EnsureBaseStack() error {
	if locked := m.infraMu.TryLock(); !locked {
		m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
		return nil
	}
	defer m.infraMu.Unlock()

	netErr := m.ensureTraefikNetwork()
	if netErr != nil {
		// Without the network every subsequent `compose up` would fail, so stop here.
		return fmt.Errorf("base-infra: %w", netErr)
	}

	stacksRoot := m.cfg.Paths.StacksDir
	var failures []string

	if err := m.ensureTraefik(filepath.Join(stacksRoot, "traefik")); err != nil {
		failures = append(failures, fmt.Sprintf("traefik: %v", err))
	}

	if m.cfg.Infrastructure.CFTunnelToken == "" {
		m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
	} else if err := m.ensureCloudflared(filepath.Join(stacksRoot, "cloudflared")); err != nil {
		failures = append(failures, fmt.Sprintf("cloudflared: %v", err))
	}

	if err := m.ensureFileBrowser(filepath.Join(stacksRoot, "filebrowser")); err != nil {
		failures = append(failures, fmt.Sprintf("filebrowser: %v", err))
	}

	if len(failures) == 0 {
		return nil
	}
	return fmt.Errorf("base-infra bring-up: %s", strings.Join(failures, "; "))
}
// ensureTraefik deploys the traefik stack into dir unless a container named "traefik" is already
// running, in which case it does nothing.
//
// On deploy it creates the dynamic/ and certs/ subdirectories, makes sure acme.json exists with
// mode 0600, renders the stack files from the controller config, writes them into dir, and runs
// `compose up`. The first failing step is returned and the remaining steps are skipped.
func (m *Manager) ensureTraefik(dir string) error {
	if containerRunning("traefik") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)

	dynamicDir := filepath.Join(dir, "dynamic")
	if err := os.MkdirAll(dynamicDir, 0o755); err != nil {
		return fmt.Errorf("mkdir dynamic: %w", err)
	}
	certsDir := filepath.Join(dir, "certs")
	if err := os.MkdirAll(certsDir, 0o755); err != nil {
		return fmt.Errorf("mkdir certs: %w", err)
	}

	// traefik stores issued certificates in acme.json and needs it to be present as a 0600 file
	// before it starts. Create it empty only when missing; never truncate an existing one.
	acmePath := filepath.Join(dir, "acme.json")
	_, statErr := os.Stat(acmePath)
	if os.IsNotExist(statErr) {
		if err := os.WriteFile(acmePath, nil, 0o600); err != nil {
			return fmt.Errorf("create acme.json: %w", err)
		}
	}
	// Re-assert the mode every time: a pre-existing file may carry looser permissions.
	if err := os.Chmod(acmePath, 0o600); err != nil {
		return fmt.Errorf("chmod acme.json: %w", err)
	}

	data := infra.TraefikData{
		ACMEEmail:  m.cfg.Customer.Email,
		CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
	}
	rendered, err := infra.RenderTraefik(data)
	if err != nil {
		return err
	}
	if err := writeInfraFiles(dir, rendered); err != nil {
		return err
	}
	return m.composeUp(dir)
}
// ensureCloudflared deploys the cloudflared stack into dir unless a container named "cloudflared"
// is already running. The stack files are rendered from the configured tunnel token, written into
// dir, and started with `compose up`. The caller decides whether a token is configured at all.
func (m *Manager) ensureCloudflared(dir string) error {
	if containerRunning("cloudflared") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)

	data := infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken}
	rendered, err := infra.RenderCloudflared(data)
	if err != nil {
		return err
	}
	if writeErr := writeInfraFiles(dir, rendered); writeErr != nil {
		return writeErr
	}
	return m.composeUp(dir)
}
// ensureFileBrowser makes sure the filebrowser stack in dir is up. There are three cases:
//
//   - a container named "filebrowser" is already running: nothing to do;
//   - dir already contains docker-compose.yml: the stack was provisioned earlier, so it is started
//     as-is without re-rendering, which preserves the storage mounts that
//     web.SyncFileBrowserMounts manages;
//   - otherwise: the initial files are rendered with no storage mounts, written, and the stack is
//     started.
//
// The presence of docker-compose.yml is the "already provisioned" marker, so on first render it is
// written LAST. If writing config.yaml fails, no compose file is left behind and the next
// invocation re-renders everything, rather than taking the "start without regenerating" branch
// against a directory that is missing config.yaml.
func (m *Manager) ensureFileBrowser(dir string) error {
	if containerRunning("filebrowser") {
		return nil
	}
	composePath := filepath.Join(dir, "docker-compose.yml")
	if _, err := os.Stat(composePath); err == nil {
		// Already provisioned but not running — bring it up WITHOUT regenerating, so the storage
		// mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`.
		m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
		return m.composeUp(dir)
	}
	m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("mkdir: %w", err)
	}
	// Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first
	// storage-path change and owns all later regeneration).
	compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
	config := infra.RenderFileBrowserConfig(nil)
	// Companion file first, marker file last (see the function comment).
	if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil {
		return fmt.Errorf("write config.yaml: %w", err)
	}
	if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil {
		return fmt.Errorf("write docker-compose.yml: %w", err)
	}
	return m.composeUp(dir)
}
// ensureTraefikNetwork guarantees that the external docker network named by traefikNetwork exists.
// It shells out to the docker CLI directly (this is a network operation, not a compose one):
// inspect first, create only when the inspect fails. If the create fails, a second inspect decides
// the outcome, so losing a race against another actor that created the network in the meantime
// still counts as success. Otherwise the create error is returned together with the CLI output.
func (m *Manager) ensureTraefikNetwork() error {
	networkExists := func() bool {
		return exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil
	}

	if networkExists() {
		return nil
	}

	m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork)
	cliOutput, createErr := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput()
	// Short-circuit keeps the re-inspect confined to the failure path.
	if createErr == nil || networkExists() {
		return nil
	}
	return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(cliOutput)), createErr)
}
// composeUp starts the stack located in dir by running compose with the arguments `up -d` through
// composeExecWithEnv (which, per the original note, injects DOMAIN). On failure the returned error
// wraps the underlying one and carries the whitespace-trimmed command output, capped at 300 via
// truncateStr.
func (m *Manager) composeUp(dir string) error {
	output, err := m.composeExecWithEnv(dir, nil, "up", "-d")
	if err == nil {
		return nil
	}
	trimmed := strings.TrimSpace(output)
	return fmt.Errorf("compose up: %s: %w", truncateStr(trimmed, 300), err)
}
// writeInfraFiles materialises every rendered file of a stack inside dir, creating dir first when
// needed. Each file is written and then explicitly chmod-ed to its spec'd mode: os.WriteFile only
// applies the mode when it creates the file, so the follow-up Chmod is what keeps an existing file
// (for example a re-rendered .env) at its required permissions. Processing stops at the first
// error; map iteration order means the set of files already written at that point is unspecified.
func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
	if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
		return fmt.Errorf("mkdir: %w", mkErr)
	}
	for fileName, fileSpec := range files {
		target := filepath.Join(dir, fileName)
		perm := os.FileMode(fileSpec.Mode)

		writeErr := os.WriteFile(target, []byte(fileSpec.Content), perm)
		if writeErr != nil {
			return fmt.Errorf("write %s: %w", fileName, writeErr)
		}
		chmodErr := os.Chmod(target, perm)
		if chmodErr != nil {
			return fmt.Errorf("chmod %s: %w", fileName, chmodErr)
		}
	}
	return nil
}
// containerRunning reports whether the docker daemon considers the container called name to be
// running. It queries the daemon through `docker inspect` rather than looking at any stack
// directory, so it also works before a stack has been rendered. Any failure of the command
// (unknown name, docker unavailable, non-zero exit) is reported as "not running".
func containerRunning(name string) bool {
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	raw, err := inspect.Output()
	return err == nil && strings.TrimSpace(string(raw)) == "true"
}