abbd9488c6
New internal/infra package renders traefik/cloudflared/filebrowser from config (pinned images, single source of truth; web filebrowser path delegates here). stacks.EnsureBaseStack deploys the traefik-public network + the three stacks, single-flight + idempotent + non-fatal; wired to first boot and every health tick. monitor.EffectiveProtected drops cloudflared when no tunnel token. Section-G fix lives in felhom-agent build-golden.sh (same-path stacks bind). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
194 lines
7.2 KiB
Go
194 lines
7.2 KiB
Go
package stacks
|
||
|
||
import (
|
||
"fmt"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"strings"
|
||
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
|
||
)
|
||
|
||
// traefikNetwork is the name of the docker network that ensureTraefikNetwork inspects and, when
// absent, creates.
const traefikNetwork = "traefik-public"
|
||
|
||
// EnsureBaseStack renders and deploys the base-infrastructure stacks (traefik, cloudflared,
|
||
// filebrowser) the controller needs for routing + external access. It is:
|
||
// - single-flight: fired both at first boot and on every health tick; a TryLock ensures two
|
||
// invocations never race on the same stack dir / run concurrent `compose up` on the same stack.
|
||
// - idempotent: each stack is skipped when its container is already running, so the healthy-state
|
||
// re-run is a cheap 3× `docker inspect` no-op.
|
||
// - non-fatal by contract: returns a joined error for the caller to LOG; it must never crash the
|
||
// controller. cloudflared is only deployed when a tunnel token is configured (LAN-only nodes
|
||
// legitimately run without it — see the dynamic protected set in monitor.EffectiveProtected).
|
||
//
|
||
// Deploy order is load-bearing: traefik-public network → traefik → cloudflared → filebrowser (the
|
||
// composes declare the network `external: true`, so it must exist first).
|
||
func (m *Manager) EnsureBaseStack() error {
|
||
if !m.infraMu.TryLock() {
|
||
m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
|
||
return nil
|
||
}
|
||
defer m.infraMu.Unlock()
|
||
|
||
if err := m.ensureTraefikNetwork(); err != nil {
|
||
return fmt.Errorf("base-infra: %w", err) // without the network, every stack `up` fails
|
||
}
|
||
|
||
base := m.cfg.Paths.StacksDir
|
||
var errs []string
|
||
|
||
if err := m.ensureTraefik(filepath.Join(base, "traefik")); err != nil {
|
||
errs = append(errs, fmt.Sprintf("traefik: %v", err))
|
||
}
|
||
|
||
if m.cfg.Infrastructure.CFTunnelToken != "" {
|
||
if err := m.ensureCloudflared(filepath.Join(base, "cloudflared")); err != nil {
|
||
errs = append(errs, fmt.Sprintf("cloudflared: %v", err))
|
||
}
|
||
} else {
|
||
m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
|
||
}
|
||
|
||
if err := m.ensureFileBrowser(filepath.Join(base, "filebrowser")); err != nil {
|
||
errs = append(errs, fmt.Sprintf("filebrowser: %v", err))
|
||
}
|
||
|
||
if len(errs) > 0 {
|
||
return fmt.Errorf("base-infra bring-up: %s", strings.Join(errs, "; "))
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (m *Manager) ensureTraefik(dir string) error {
|
||
if containerRunning("traefik") {
|
||
return nil
|
||
}
|
||
m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)
|
||
if err := os.MkdirAll(filepath.Join(dir, "dynamic"), 0o755); err != nil {
|
||
return fmt.Errorf("mkdir dynamic: %w", err)
|
||
}
|
||
if err := os.MkdirAll(filepath.Join(dir, "certs"), 0o755); err != nil {
|
||
return fmt.Errorf("mkdir certs: %w", err)
|
||
}
|
||
// acme.json must exist as a 0600 file before traefik starts (it writes issued certs into it).
|
||
acme := filepath.Join(dir, "acme.json")
|
||
if _, err := os.Stat(acme); os.IsNotExist(err) {
|
||
if err := os.WriteFile(acme, []byte{}, 0o600); err != nil {
|
||
return fmt.Errorf("create acme.json: %w", err)
|
||
}
|
||
}
|
||
if err := os.Chmod(acme, 0o600); err != nil {
|
||
return fmt.Errorf("chmod acme.json: %w", err)
|
||
}
|
||
files, err := infra.RenderTraefik(infra.TraefikData{
|
||
ACMEEmail: m.cfg.Customer.Email,
|
||
CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
|
||
})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if err := writeInfraFiles(dir, files); err != nil {
|
||
return err
|
||
}
|
||
return m.composeUp(dir)
|
||
}
|
||
|
||
func (m *Manager) ensureCloudflared(dir string) error {
|
||
if containerRunning("cloudflared") {
|
||
return nil
|
||
}
|
||
m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)
|
||
files, err := infra.RenderCloudflared(infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if err := writeInfraFiles(dir, files); err != nil {
|
||
return err
|
||
}
|
||
return m.composeUp(dir)
|
||
}
|
||
|
||
func (m *Manager) ensureFileBrowser(dir string) error {
|
||
if containerRunning("filebrowser") {
|
||
return nil
|
||
}
|
||
composePath := filepath.Join(dir, "docker-compose.yml")
|
||
if _, err := os.Stat(composePath); err == nil {
|
||
// Already provisioned but not running — bring it up WITHOUT regenerating, so the storage
|
||
// mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`.
|
||
m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
|
||
return m.composeUp(dir)
|
||
}
|
||
m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
|
||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||
return fmt.Errorf("mkdir: %w", err)
|
||
}
|
||
// Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first
|
||
// storage-path change and owns all later regeneration).
|
||
compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
|
||
config := infra.RenderFileBrowserConfig(nil)
|
||
if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil {
|
||
return fmt.Errorf("write docker-compose.yml: %w", err)
|
||
}
|
||
if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil {
|
||
return fmt.Errorf("write config.yaml: %w", err)
|
||
}
|
||
return m.composeUp(dir)
|
||
}
|
||
|
||
// ensureTraefikNetwork creates the external traefik-public docker network if absent (idempotent;
|
||
// tolerates a create/inspect race). Uses the docker CLI directly — it's a network op, not compose.
|
||
func (m *Manager) ensureTraefikNetwork() error {
|
||
if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil {
|
||
return nil
|
||
}
|
||
m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork)
|
||
out, err := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput()
|
||
if err != nil {
|
||
// Tolerate a race where another actor created it between our inspect and create.
|
||
if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil {
|
||
return nil
|
||
}
|
||
return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(out)), err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// composeUp runs `docker compose up -d` in dir (DOMAIN injected by composeExecWithEnv).
|
||
func (m *Manager) composeUp(dir string) error {
|
||
out, err := m.composeExecWithEnv(dir, nil, "up", "-d")
|
||
if err != nil {
|
||
return fmt.Errorf("compose up: %s: %w", truncateStr(strings.TrimSpace(out), 300), err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// writeInfraFiles writes each rendered file at its required mode (enforced via Chmod so an existing
|
||
// file — e.g. a re-rendered .env — keeps 0600 even though WriteFile only honors mode on create).
|
||
func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
|
||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||
return fmt.Errorf("mkdir: %w", err)
|
||
}
|
||
for name, spec := range files {
|
||
p := filepath.Join(dir, name)
|
||
if err := os.WriteFile(p, []byte(spec.Content), os.FileMode(spec.Mode)); err != nil {
|
||
return fmt.Errorf("write %s: %w", name, err)
|
||
}
|
||
if err := os.Chmod(p, os.FileMode(spec.Mode)); err != nil {
|
||
return fmt.Errorf("chmod %s: %w", name, err)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// containerRunning reports whether docker considers the container called name to be running. It
// queries the daemon through `docker inspect`, so it does not depend on any stack directory
// existing. Any failure of the command (unknown container, docker unavailable, ...) is reported
// as "not running".
func containerRunning(name string) bool {
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	stdout, err := inspect.Output()
	return err == nil && strings.TrimSpace(string(stdout)) == "true"
}
|