e61e7dd8fc
Empirically (staging on 9201): traefik v3 issues a cert from a router-level tls.domains but NOT from the entrypoint http.tls.domains. So the wildcard moves to RenderControllerRoute (the always-present anchor): when DNS-01 ACME is configured it carries tls.certResolver+domains *.<domain>+apex, and every other router serves that wildcard by SNI (no per-app labels). Reverts v0.42.0's dead entrypoint-domains + TraefikData.Domain. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
263 lines
11 KiB
Go
263 lines
11 KiB
Go
package stacks
|
||
|
||
import (
|
||
"fmt"
|
||
"os"
|
||
"os/exec"
|
||
"path/filepath"
|
||
"strings"
|
||
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
|
||
)
|
||
|
||
// traefikNetwork is the name of the docker network that ensureTraefikNetwork creates when it is
// missing and that wireController attaches the controller container to.
const traefikNetwork = "traefik-public"
|
||
|
||
// EnsureBaseStack renders and deploys the base-infrastructure stacks (traefik, cloudflared,
|
||
// filebrowser) the controller needs for routing + external access. It is:
|
||
// - single-flight: fired both at first boot and on every health tick; a TryLock ensures two
|
||
// invocations never race on the same stack dir / run concurrent `compose up` on the same stack.
|
||
// - idempotent: each stack is skipped when its container is already running, so the healthy-state
|
||
// re-run is a cheap 3× `docker inspect` no-op.
|
||
// - non-fatal by contract: returns a joined error for the caller to LOG; it must never crash the
|
||
// controller. cloudflared is only deployed when a tunnel token is configured (LAN-only nodes
|
||
// legitimately run without it — see the dynamic protected set in monitor.EffectiveProtected).
|
||
//
|
||
// Deploy order is load-bearing: traefik-public network → traefik → cloudflared → filebrowser (the
|
||
// composes declare the network `external: true`, so it must exist first).
|
||
func (m *Manager) EnsureBaseStack() error {
|
||
if !m.infraMu.TryLock() {
|
||
m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
|
||
return nil
|
||
}
|
||
defer m.infraMu.Unlock()
|
||
|
||
if err := m.ensureTraefikNetwork(); err != nil {
|
||
return fmt.Errorf("base-infra: %w", err) // without the network, every stack `up` fails
|
||
}
|
||
|
||
base := m.cfg.Paths.StacksDir
|
||
traefikDir := filepath.Join(base, "traefik")
|
||
var errs []string
|
||
|
||
if err := m.ensureTraefik(traefikDir); err != nil {
|
||
errs = append(errs, fmt.Sprintf("traefik: %v", err))
|
||
}
|
||
|
||
// Wire the controller's OWN dashboard route into traefik. Unlike filebrowser (which self-registers
|
||
// via Docker labels + network membership baked into its compose), the controller is started by the
|
||
// golden bootstrap before traefik-public exists and the v2 bootstrap carries no domain — so it can't
|
||
// self-label. We do it here, post-pull, where the domain is known: drop a file-provider route and
|
||
// join the controller to traefik-public so traefik can resolve felhom-controller:8080.
|
||
if err := m.wireController(traefikDir); err != nil {
|
||
errs = append(errs, fmt.Sprintf("controller-route: %v", err))
|
||
}
|
||
|
||
if m.cfg.Infrastructure.CFTunnelToken != "" {
|
||
if err := m.ensureCloudflared(filepath.Join(base, "cloudflared")); err != nil {
|
||
errs = append(errs, fmt.Sprintf("cloudflared: %v", err))
|
||
}
|
||
} else {
|
||
m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
|
||
}
|
||
|
||
if err := m.ensureFileBrowser(filepath.Join(base, "filebrowser")); err != nil {
|
||
errs = append(errs, fmt.Sprintf("filebrowser: %v", err))
|
||
}
|
||
|
||
if len(errs) > 0 {
|
||
return fmt.Errorf("base-infra bring-up: %s", strings.Join(errs, "; "))
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (m *Manager) ensureTraefik(dir string) error {
|
||
if containerRunning("traefik") {
|
||
return nil
|
||
}
|
||
m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)
|
||
if err := os.MkdirAll(filepath.Join(dir, "dynamic"), 0o755); err != nil {
|
||
return fmt.Errorf("mkdir dynamic: %w", err)
|
||
}
|
||
if err := os.MkdirAll(filepath.Join(dir, "certs"), 0o755); err != nil {
|
||
return fmt.Errorf("mkdir certs: %w", err)
|
||
}
|
||
// acme.json must exist as a 0600 file before traefik starts (it writes issued certs into it).
|
||
acme := filepath.Join(dir, "acme.json")
|
||
if _, err := os.Stat(acme); os.IsNotExist(err) {
|
||
if err := os.WriteFile(acme, []byte{}, 0o600); err != nil {
|
||
return fmt.Errorf("create acme.json: %w", err)
|
||
}
|
||
}
|
||
if err := os.Chmod(acme, 0o600); err != nil {
|
||
return fmt.Errorf("chmod acme.json: %w", err)
|
||
}
|
||
files, err := infra.RenderTraefik(infra.TraefikData{
|
||
ACMEEmail: m.cfg.Customer.Email,
|
||
CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
|
||
})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if err := writeInfraFiles(dir, files); err != nil {
|
||
return err
|
||
}
|
||
return m.composeUp(dir)
|
||
}
|
||
|
||
func (m *Manager) ensureCloudflared(dir string) error {
|
||
if containerRunning("cloudflared") {
|
||
return nil
|
||
}
|
||
m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)
|
||
files, err := infra.RenderCloudflared(infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if err := writeInfraFiles(dir, files); err != nil {
|
||
return err
|
||
}
|
||
return m.composeUp(dir)
|
||
}
|
||
|
||
func (m *Manager) ensureFileBrowser(dir string) error {
|
||
if containerRunning("filebrowser") {
|
||
return nil
|
||
}
|
||
composePath := filepath.Join(dir, "docker-compose.yml")
|
||
if _, err := os.Stat(composePath); err == nil {
|
||
// Already provisioned but not running — bring it up WITHOUT regenerating, so the storage
|
||
// mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`.
|
||
m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
|
||
return m.composeUp(dir)
|
||
}
|
||
m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
|
||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||
return fmt.Errorf("mkdir: %w", err)
|
||
}
|
||
// Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first
|
||
// storage-path change and owns all later regeneration).
|
||
compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
|
||
config := infra.RenderFileBrowserConfig(nil)
|
||
if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil {
|
||
return fmt.Errorf("write docker-compose.yml: %w", err)
|
||
}
|
||
if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil {
|
||
return fmt.Errorf("write config.yaml: %w", err)
|
||
}
|
||
return m.composeUp(dir)
|
||
}
|
||
|
||
// controllerContainer is the fixed name of the in-guest controller container (set by the golden
|
||
// bootstrap `docker run --name`). traefik resolves it by this name once both share traefik-public.
|
||
// wireController hands this name to `docker inspect` (via containerOnNetwork) and to
// `docker network connect`, so it is used as the docker object name of the controller.
const controllerContainer = "felhom-controller"
|
||
|
||
// wireController makes the controller dashboard reachable through traefik: it writes the file-provider
|
||
// route (Host(felhom.<domain>) → http://felhom-controller:8080) and connects the controller container
|
||
// to traefik-public. Both are idempotent — the route is written only when its content changes (so the
|
||
// traefik file watcher doesn't reload every health tick), and the network connect is skipped when the
|
||
// controller is already attached. Domain is required (it comes from the hub pull); a missing domain is
|
||
// a no-op (logged) rather than an error.
|
||
func (m *Manager) wireController(traefikDir string) error {
|
||
domain := m.cfg.Customer.Domain
|
||
if domain == "" {
|
||
m.logger.Printf("[WARN] [infra] controller route skipped — no customer domain configured")
|
||
return nil
|
||
}
|
||
|
||
dynDir := filepath.Join(traefikDir, "dynamic")
|
||
if err := os.MkdirAll(dynDir, 0o755); err != nil {
|
||
return fmt.Errorf("mkdir dynamic: %w", err)
|
||
}
|
||
routePath := filepath.Join(dynDir, "controller.yml")
|
||
// DNS-01 ACME configured (CF token + email) → this route anchors wildcard proactive issuance.
|
||
wildcardTLS := m.cfg.Infrastructure.CFAPIToken != "" && m.cfg.Customer.Email != ""
|
||
want := infra.RenderControllerRoute(domain, wildcardTLS)
|
||
if cur, err := os.ReadFile(routePath); err != nil || string(cur) != want {
|
||
if err := os.WriteFile(routePath, []byte(want), 0o644); err != nil {
|
||
return fmt.Errorf("write controller route: %w", err)
|
||
}
|
||
m.logger.Printf("[INFO] [infra] wrote controller route → %s (Host felhom.%s → felhom-controller:8080)", routePath, domain)
|
||
}
|
||
|
||
if !containerOnNetwork(controllerContainer, traefikNetwork) {
|
||
out, err := exec.Command("docker", "network", "connect", traefikNetwork, controllerContainer).CombinedOutput()
|
||
if err != nil && !strings.Contains(string(out), "already exists") {
|
||
return fmt.Errorf("network connect %s: %s: %w", controllerContainer, strings.TrimSpace(string(out)), err)
|
||
}
|
||
m.logger.Printf("[INFO] [infra] connected %s to %s", controllerContainer, traefikNetwork)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// containerOnNetwork reports whether the container called name is attached to the docker
// network called network. A failing `docker inspect` is reported as false.
//
// The template prints every attached network name on its own line and the names are compared
// exactly. `{{index .Networks "name"}}` is deliberately avoided: for an absent key it prints
// "<nil>", a non-empty string that would be misread as "already attached".
func containerOnNetwork(name, network string) bool {
	const listNetworks = "{{range $k, $_ := .NetworkSettings.Networks}}{{$k}}\n{{end}}"

	raw, err := exec.Command("docker", "inspect", "--format", listNetworks, name).Output()
	if err != nil {
		return false
	}

	attached := strings.Fields(string(raw))
	for i := range attached {
		if attached[i] == network {
			return true
		}
	}
	return false
}
|
||
|
||
// ensureTraefikNetwork creates the external traefik-public docker network if absent (idempotent;
|
||
// tolerates a create/inspect race). Uses the docker CLI directly — it's a network op, not compose.
|
||
func (m *Manager) ensureTraefikNetwork() error {
|
||
if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil {
|
||
return nil
|
||
}
|
||
m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork)
|
||
out, err := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput()
|
||
if err != nil {
|
||
// Tolerate a race where another actor created it between our inspect and create.
|
||
if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil {
|
||
return nil
|
||
}
|
||
return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(out)), err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// composeUp runs `docker compose up -d` in dir (DOMAIN injected by composeExecWithEnv).
|
||
func (m *Manager) composeUp(dir string) error {
|
||
out, err := m.composeExecWithEnv(dir, nil, "up", "-d")
|
||
if err != nil {
|
||
return fmt.Errorf("compose up: %s: %w", truncateStr(strings.TrimSpace(out), 300), err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// writeInfraFiles writes each rendered file at its required mode (enforced via Chmod so an existing
|
||
// file — e.g. a re-rendered .env — keeps 0600 even though WriteFile only honors mode on create).
|
||
func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
|
||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||
return fmt.Errorf("mkdir: %w", err)
|
||
}
|
||
for name, spec := range files {
|
||
p := filepath.Join(dir, name)
|
||
if err := os.WriteFile(p, []byte(spec.Content), os.FileMode(spec.Mode)); err != nil {
|
||
return fmt.Errorf("write %s: %w", name, err)
|
||
}
|
||
if err := os.Chmod(p, os.FileMode(spec.Mode)); err != nil {
|
||
return fmt.Errorf("chmod %s: %w", name, err)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// containerRunning reports whether `docker inspect` shows a container called name in the
// running state. It queries the daemon directly, so it works before the stack dir exists.
// Any inspect failure (including an unknown name) is reported as false.
func containerRunning(name string) bool {
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	raw, err := inspect.Output()
	return err == nil && strings.TrimSpace(string(raw)) == "true"
}
|