Files
felhom-controller/controller/internal/stacks/infra.go
T
admin abbd9488c6 v0.41.0: first-boot base-infra bring-up + self-heal (+ Section-G mount fix)
New internal/infra package renders traefik/cloudflared/filebrowser from config
(pinned images, single source of truth; web filebrowser path delegates here).
stacks.EnsureBaseStack deploys the traefik-public network + the three stacks,
single-flight + idempotent + non-fatal; wired to first boot and every health
tick. monitor.EffectiveProtected drops cloudflared when no tunnel token.
Section-G fix lives in felhom-agent build-golden.sh (same-path stacks bind).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-11 14:56:42 +02:00

194 lines
7.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package stacks
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
)
// traefikNetwork is the name of the docker network that ensureTraefikNetwork inspects and, if it
// is missing, creates before any base-infra stack is brought up.
const traefikNetwork = "traefik-public"
// EnsureBaseStack brings up the base-infrastructure stacks (traefik, cloudflared, filebrowser)
// under the configured stacks directory.
//
// Properties visible in this function and its helpers:
//   - single-flight: guarded by infraMu.TryLock; an invocation that cannot take the lock logs and
//     returns nil instead of waiting, so two bring-ups never work on the same stack dir at once.
//   - idempotent: every ensure* helper returns early when its container is already running.
//   - non-fatal: failures are collected and returned as one error for the caller to log. A failing
//     stack does not stop the remaining stacks from being attempted.
//
// cloudflared is only deployed when a tunnel token is configured; otherwise it is skipped with an
// informational log line.
//
// The order is significant: the traefik-public network is ensured first (and is the only step that
// aborts the whole run on failure), then traefik, cloudflared, and filebrowser.
func (m *Manager) EnsureBaseStack() error {
	if acquired := m.infraMu.TryLock(); !acquired {
		m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
		return nil
	}
	defer m.infraMu.Unlock()

	// Without the network every subsequent stack `up` fails, so bail out immediately.
	if netErr := m.ensureTraefikNetwork(); netErr != nil {
		return fmt.Errorf("base-infra: %w", netErr)
	}

	stacksRoot := m.cfg.Paths.StacksDir
	var failures []string
	// note records a per-stack failure (if any) using the given message format.
	note := func(format string, stackErr error) {
		if stackErr != nil {
			failures = append(failures, fmt.Sprintf(format, stackErr))
		}
	}

	note("traefik: %v", m.ensureTraefik(filepath.Join(stacksRoot, "traefik")))

	if m.cfg.Infrastructure.CFTunnelToken == "" {
		m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
	} else {
		note("cloudflared: %v", m.ensureCloudflared(filepath.Join(stacksRoot, "cloudflared")))
	}

	note("filebrowser: %v", m.ensureFileBrowser(filepath.Join(stacksRoot, "filebrowser")))

	if len(failures) == 0 {
		return nil
	}
	return fmt.Errorf("base-infra bring-up: %s", strings.Join(failures, "; "))
}
// ensureTraefik deploys the traefik stack into dir unless a container named "traefik" is already
// running. It prepares the dynamic/ and certs/ subdirectories and a 0600 acme.json, renders the
// stack files from the controller config, writes them, and runs compose up.
func (m *Manager) ensureTraefik(dir string) error {
	if containerRunning("traefik") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)

	dynamicDir := filepath.Join(dir, "dynamic")
	if mkErr := os.MkdirAll(dynamicDir, 0o755); mkErr != nil {
		return fmt.Errorf("mkdir dynamic: %w", mkErr)
	}
	certsDir := filepath.Join(dir, "certs")
	if mkErr := os.MkdirAll(certsDir, 0o755); mkErr != nil {
		return fmt.Errorf("mkdir certs: %w", mkErr)
	}

	// traefik stores issued certificates in acme.json; the file has to be present with mode 0600
	// before the container starts. Create it empty only when it is missing, then force the mode
	// in either case.
	acmePath := filepath.Join(dir, "acme.json")
	_, statErr := os.Stat(acmePath)
	if os.IsNotExist(statErr) {
		if createErr := os.WriteFile(acmePath, []byte{}, 0o600); createErr != nil {
			return fmt.Errorf("create acme.json: %w", createErr)
		}
	}
	if chmodErr := os.Chmod(acmePath, 0o600); chmodErr != nil {
		return fmt.Errorf("chmod acme.json: %w", chmodErr)
	}

	data := infra.TraefikData{
		ACMEEmail:  m.cfg.Customer.Email,
		CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
	}
	rendered, renderErr := infra.RenderTraefik(data)
	if renderErr != nil {
		return renderErr
	}
	if writeErr := writeInfraFiles(dir, rendered); writeErr != nil {
		return writeErr
	}
	return m.composeUp(dir)
}
// ensureCloudflared deploys the cloudflared stack into dir unless a container named "cloudflared"
// is already running. The stack files are rendered from the configured tunnel token, written to
// dir, and started with compose up.
func (m *Manager) ensureCloudflared(dir string) error {
	if containerRunning("cloudflared") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)

	data := infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken}
	rendered, renderErr := infra.RenderCloudflared(data)
	if renderErr != nil {
		return renderErr
	}
	if writeErr := writeInfraFiles(dir, rendered); writeErr != nil {
		return writeErr
	}
	return m.composeUp(dir)
}
// ensureFileBrowser makes sure the filebrowser stack in dir is running.
//
//   - Container already running: nothing to do.
//   - docker-compose.yml already present: the stack was provisioned earlier, so it is started
//     as-is. The files are NOT regenerated, which preserves the storage mounts that
//     web.SyncFileBrowserMounts manages.
//   - Otherwise: an initial compose + config pair is rendered (no storage mounts) and started.
//
// docker-compose.yml doubles as the "already provisioned" marker for the second case, so it is
// written LAST. If config.yaml were written after it and that write failed, every later run would
// find the compose file, skip regeneration, and start the stack with config.yaml still missing.
func (m *Manager) ensureFileBrowser(dir string) error {
	if containerRunning("filebrowser") {
		return nil
	}
	composePath := filepath.Join(dir, "docker-compose.yml")
	if _, err := os.Stat(composePath); err == nil {
		// Already provisioned but not running — bring it up WITHOUT regenerating, so the storage
		// mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`.
		m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
		return m.composeUp(dir)
	}
	m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("mkdir: %w", err)
	}
	// Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first
	// storage-path change and owns all later regeneration).
	compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
	config := infra.RenderFileBrowserConfig(nil)
	// config.yaml first, the docker-compose.yml sentinel last (see function comment).
	if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil {
		return fmt.Errorf("write config.yaml: %w", err)
	}
	if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil {
		return fmt.Errorf("write docker-compose.yml: %w", err)
	}
	return m.composeUp(dir)
}
// ensureTraefikNetwork makes sure the docker network named by traefikNetwork exists, creating it
// through the docker CLI when `docker network inspect` does not find it. A failed create is
// forgiven if the network turns out to exist afterwards (someone else created it in between).
func (m *Manager) ensureTraefikNetwork() error {
	// networkExists reports whether `docker network inspect` succeeds for the network.
	networkExists := func() bool {
		return exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil
	}

	if networkExists() {
		return nil
	}

	m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork)
	output, createErr := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput()
	if createErr == nil {
		return nil
	}
	// Lost a create race: the network is there now, which is all that matters.
	if networkExists() {
		return nil
	}
	return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(output)), createErr)
}
// composeUp starts the stack in dir by invoking composeExecWithEnv with the arguments `up -d` and
// no extra environment. On failure the trimmed command output, cut to 300 characters by
// truncateStr, is included in the returned error.
// NOTE(review): DOMAIN is presumably injected by composeExecWithEnv (defined elsewhere) — confirm.
func (m *Manager) composeUp(dir string) error {
	output, upErr := m.composeExecWithEnv(dir, nil, "up", "-d")
	if upErr == nil {
		return nil
	}
	detail := truncateStr(strings.TrimSpace(output), 300)
	return fmt.Errorf("compose up: %s: %w", detail, upErr)
}
// writeInfraFiles creates dir if needed and writes every rendered file into it. Each file's mode
// is applied twice: once through WriteFile (effective only when the file is created) and once
// through an explicit Chmod, so a pre-existing file that is being re-rendered also ends up with
// the required mode.
func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
	if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
		return fmt.Errorf("mkdir: %w", mkErr)
	}
	for fileName, rendered := range files {
		target := filepath.Join(dir, fileName)
		mode := os.FileMode(rendered.Mode)
		if writeErr := os.WriteFile(target, []byte(rendered.Content), mode); writeErr != nil {
			return fmt.Errorf("write %s: %w", fileName, writeErr)
		}
		if chmodErr := os.Chmod(target, mode); chmodErr != nil {
			return fmt.Errorf("chmod %s: %w", fileName, chmodErr)
		}
	}
	return nil
}
// containerRunning reports whether `docker inspect` says the named container is running. The
// question goes straight to the docker CLI, so it does not depend on any stack directory
// existing. Any inspect failure (unknown name, docker unavailable, ...) is reported as false.
func containerRunning(name string) bool {
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	raw, inspectErr := inspect.Output()
	return inspectErr == nil && strings.TrimSpace(string(raw)) == "true"
}