Files
felhom-controller/controller/internal/stacks/infra.go
T
admin 91736eb015 v0.41.1: wire the controller dashboard into traefik (felhom.<domain> routing)
EnsureBaseStack now writes a traefik file-provider route
(Host(felhom.<domain>) -> http://felhom-controller:8080) and joins the
controller to traefik-public. Done post-pull (domain known) and idempotently
(write-if-changed + skip-if-connected), so felhom.<domain> reaches the
controller. Completes the v0.41.0 base-infra bring-up.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-11 15:40:43 +02:00

255 lines
10 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package stacks
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
)
// traefikNetwork is the name of the docker network that ensureTraefikNetwork creates and that
// wireController attaches the controller container to.
const traefikNetwork = "traefik-public"
// EnsureBaseStack brings up the base-infrastructure stacks (traefik, cloudflared, filebrowser) and
// wires the controller's own dashboard route into traefik.
//
// Contract:
//   - single-flight: guarded by infraMu.TryLock; an invocation that finds the lock held logs and
//     returns nil instead of waiting, so two runs never operate on the same stack dir concurrently.
//   - idempotent: every per-stack helper returns early when its container is already running.
//   - non-fatal: per-stack failures are collected and returned as ONE combined error for the caller
//     to log. Only a failure to ensure the traefik-public network aborts early, because every
//     stack `up` depends on that network.
//   - cloudflared is deployed only when a tunnel token is configured (LAN-only nodes run without it).
//
// The step order is load-bearing: network → traefik → controller route → cloudflared → filebrowser.
func (m *Manager) EnsureBaseStack() error {
	if !m.infraMu.TryLock() {
		m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
		return nil
	}
	defer m.infraMu.Unlock()

	// Without the network, every stack `up` fails, so this is the one hard stop.
	if netErr := m.ensureTraefikNetwork(); netErr != nil {
		return fmt.Errorf("base-infra: %w", netErr)
	}

	stacksRoot := m.cfg.Paths.StacksDir
	traefikDir := filepath.Join(stacksRoot, "traefik")

	// note records a failed step as "<label>: <error>" and ignores successful ones.
	var failures []string
	note := func(label string, stepErr error) {
		if stepErr != nil {
			failures = append(failures, fmt.Sprintf("%s: %v", label, stepErr))
		}
	}

	note("traefik", m.ensureTraefik(traefikDir))

	// The controller cannot self-register with traefik through compose labels the way filebrowser
	// does, so its route is written here (where the customer domain is available) and the container
	// is joined to traefik-public so traefik can resolve felhom-controller:8080.
	note("controller-route", m.wireController(traefikDir))

	if m.cfg.Infrastructure.CFTunnelToken == "" {
		m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
	} else {
		note("cloudflared", m.ensureCloudflared(filepath.Join(stacksRoot, "cloudflared")))
	}

	note("filebrowser", m.ensureFileBrowser(filepath.Join(stacksRoot, "filebrowser")))

	if len(failures) == 0 {
		return nil
	}
	return fmt.Errorf("base-infra bring-up: %s", strings.Join(failures, "; "))
}
// ensureTraefik deploys the traefik stack into dir unless a container named "traefik" is already
// running. It creates the dynamic/ and certs/ subdirectories, makes sure acme.json exists with mode
// 0600, renders the stack files from the configured ACME email and Cloudflare API token, writes
// them, and finally runs `compose up`. The first failing step's error is returned.
func (m *Manager) ensureTraefik(dir string) error {
	if containerRunning("traefik") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)

	for _, sub := range []string{"dynamic", "certs"} {
		if err := os.MkdirAll(filepath.Join(dir, sub), 0o755); err != nil {
			return fmt.Errorf("mkdir %s: %w", sub, err)
		}
	}

	// acme.json must exist as a 0600 file before traefik starts (it writes issued certs into it).
	// The Chmod also tightens a pre-existing file that has looser permissions.
	acmePath := filepath.Join(dir, "acme.json")
	_, statErr := os.Stat(acmePath)
	if os.IsNotExist(statErr) {
		if err := os.WriteFile(acmePath, []byte{}, 0o600); err != nil {
			return fmt.Errorf("create acme.json: %w", err)
		}
	}
	if err := os.Chmod(acmePath, 0o600); err != nil {
		return fmt.Errorf("chmod acme.json: %w", err)
	}

	data := infra.TraefikData{
		ACMEEmail:  m.cfg.Customer.Email,
		CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
	}
	rendered, renderErr := infra.RenderTraefik(data)
	if renderErr != nil {
		return renderErr
	}
	if writeErr := writeInfraFiles(dir, rendered); writeErr != nil {
		return writeErr
	}
	return m.composeUp(dir)
}
// ensureCloudflared deploys the cloudflared stack into dir unless a container named "cloudflared"
// is already running. The stack files are rendered from the configured tunnel token, written to
// dir, and brought up with `compose up`. The caller decides whether a token is configured at all.
func (m *Manager) ensureCloudflared(dir string) error {
	if containerRunning("cloudflared") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)

	data := infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken}
	rendered, renderErr := infra.RenderCloudflared(data)
	if renderErr != nil {
		return renderErr
	}
	if writeErr := writeInfraFiles(dir, rendered); writeErr != nil {
		return writeErr
	}
	return m.composeUp(dir)
}
// ensureFileBrowser makes sure the filebrowser stack in dir is running.
//
//   - Container already running: nothing to do.
//   - docker-compose.yml already present: the stack counts as provisioned and is only started with
//     `compose up`; nothing is regenerated, so the storage mounts that web.SyncFileBrowserMounts
//     manages are preserved.
//   - Otherwise: first-time provisioning. config.yaml and docker-compose.yml are rendered with no
//     storage mounts (web.SyncFileBrowserMounts fills them in later and owns all regeneration) and
//     the stack is brought up.
//
// docker-compose.yml doubles as the "provisioned" marker, so it is written LAST. A failed
// config.yaml write therefore cannot leave behind a compose file that makes every later run skip
// provisioning and start the stack without its config.
func (m *Manager) ensureFileBrowser(dir string) error {
	if containerRunning("filebrowser") {
		return nil
	}

	composePath := filepath.Join(dir, "docker-compose.yml")
	if _, err := os.Stat(composePath); err == nil {
		// Already provisioned but not running — bring it up WITHOUT regenerating.
		m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
		return m.composeUp(dir)
	}

	m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("mkdir: %w", err)
	}

	// Initial render: no storage mounts yet.
	compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
	config := infra.RenderFileBrowserConfig(nil)

	// Config first, marker (compose) last — see the function comment.
	if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil {
		return fmt.Errorf("write config.yaml: %w", err)
	}
	if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil {
		return fmt.Errorf("write docker-compose.yml: %w", err)
	}
	return m.composeUp(dir)
}
// controllerContainer is the fixed name of the in-guest controller container (set by the golden
// bootstrap `docker run --name`). It is the name passed to `docker inspect` and
// `docker network connect` when wiring the controller into traefik-public, and the hostname
// traefik resolves once both containers share that network.
const controllerContainer = "felhom-controller"
// wireController exposes the controller dashboard through traefik in two idempotent steps:
//
//  1. Write the file-provider route (Host(felhom.<domain>) → http://felhom-controller:8080) to
//     <traefikDir>/dynamic/controller.yml, but only when the file is missing, unreadable, or its
//     content differs, so an unchanged route does not trigger a traefik reload on every run.
//  2. Connect the controller container to traefik-public unless it is already attached. An
//     "already exists" response from docker is tolerated.
//
// An empty customer domain is a logged no-op, not an error.
func (m *Manager) wireController(traefikDir string) error {
	domain := m.cfg.Customer.Domain
	if domain == "" {
		m.logger.Printf("[WARN] [infra] controller route skipped — no customer domain configured")
		return nil
	}

	dynamicDir := filepath.Join(traefikDir, "dynamic")
	if err := os.MkdirAll(dynamicDir, 0o755); err != nil {
		return fmt.Errorf("mkdir dynamic: %w", err)
	}

	routeFile := filepath.Join(dynamicDir, "controller.yml")
	desired := infra.RenderControllerRoute(domain)
	existing, readErr := os.ReadFile(routeFile)
	upToDate := readErr == nil && string(existing) == desired
	if !upToDate {
		if err := os.WriteFile(routeFile, []byte(desired), 0o644); err != nil {
			return fmt.Errorf("write controller route: %w", err)
		}
		m.logger.Printf("[INFO] [infra] wrote controller route → %s (Host felhom.%s → felhom-controller:8080)", routeFile, domain)
	}

	if containerOnNetwork(controllerContainer, traefikNetwork) {
		return nil
	}
	connect := exec.Command("docker", "network", "connect", traefikNetwork, controllerContainer)
	output, connErr := connect.CombinedOutput()
	if connErr != nil && !strings.Contains(string(output), "already exists") {
		return fmt.Errorf("network connect %s: %s: %w", controllerContainer, strings.TrimSpace(string(output)), connErr)
	}
	m.logger.Printf("[INFO] [infra] connected %s to %s", controllerContainer, traefikNetwork)
	return nil
}
// containerOnNetwork reports whether the named container is attached to the given docker network.
//
// It runs `docker inspect` with a template that indexes .NetworkSettings.Networks by the network
// name. Any inspect failure (container missing, docker unavailable) is reported as "not attached".
//
// A missing map key does not render as an empty string: the template `index` function yields the
// element type's zero value, which text/template prints as "<nil>" for pointer/interface elements
// ("<no value>" is what an invalid value prints as). All three spellings are treated as absent so
// an unattached container is never mistaken for an attached one.
func containerOnNetwork(name, network string) bool {
	format := fmt.Sprintf("{{index .NetworkSettings.Networks %q}}", network)
	out, err := exec.Command("docker", "inspect", "--format", format, name).Output()
	if err != nil {
		return false
	}
	switch strings.TrimSpace(string(out)) {
	case "", "<no value>", "<nil>":
		return false
	}
	return true
}
// ensureTraefikNetwork makes sure the traefik-public docker network exists, creating it when
// `docker network inspect` cannot find it. It is idempotent and tolerates a create/inspect race: if
// the create fails but the network is visible afterwards, that still counts as success. The docker
// CLI is used directly because this is a network operation, not a compose one.
func (m *Manager) ensureTraefikNetwork() error {
	// networkExists probes the daemon; a zero exit status from `network inspect` means it is there.
	networkExists := func() bool {
		return exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil
	}

	if networkExists() {
		return nil
	}

	m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork)
	output, createErr := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput()
	if createErr == nil {
		return nil
	}
	// Another actor may have created it between our inspect and our create.
	if networkExists() {
		return nil
	}
	return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(output)), createErr)
}
// composeUp runs `docker compose up -d` for the stack in dir through composeExecWithEnv (which,
// per the original note, injects DOMAIN). On failure the returned error wraps the underlying one
// and carries the trimmed command output, passed through truncateStr with a limit of 300.
func (m *Manager) composeUp(dir string) error {
	output, upErr := m.composeExecWithEnv(dir, nil, "up", "-d")
	if upErr == nil {
		return nil
	}
	detail := truncateStr(strings.TrimSpace(output), 300)
	return fmt.Errorf("compose up: %s: %w", detail, upErr)
}
// writeInfraFiles creates dir if needed and writes every rendered file into it, keyed by its name
// relative to dir. Each file is written with its spec's mode and then explicitly chmod-ed to that
// mode: WriteFile only applies the mode when it creates the file, so the Chmod is what keeps an
// already-existing file (e.g. a re-rendered .env) at its required permissions. The first failure
// aborts the loop and is returned.
func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("mkdir: %w", err)
	}
	for fileName, fileSpec := range files {
		target := filepath.Join(dir, fileName)
		mode := os.FileMode(fileSpec.Mode)
		if err := os.WriteFile(target, []byte(fileSpec.Content), mode); err != nil {
			return fmt.Errorf("write %s: %w", fileName, err)
		}
		if err := os.Chmod(target, mode); err != nil {
			return fmt.Errorf("chmod %s: %w", fileName, err)
		}
	}
	return nil
}
// containerRunning reports whether a container with the given name is currently running. It asks
// the docker daemon directly via `docker inspect` (so it works before any stack dir exists). Any
// inspect failure, such as an unknown container, counts as "not running".
func containerRunning(name string) bool {
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	raw, err := inspect.Output()
	return err == nil && strings.TrimSpace(string(raw)) == "true"
}