package stacks

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"

	"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
)

// traefikNetwork is the docker network the base stacks and the controller are attached to.
const traefikNetwork = "traefik-public"

// EnsureBaseStack renders and deploys the base-infrastructure stacks (traefik, cloudflared,
// filebrowser) that the controller relies on for routing and external access.
//
// Contract:
//   - single-flight: it is fired at first boot and again on every health tick; a TryLock makes a
//     second concurrent invocation return immediately, so two runs never touch the same stack dir
//     or run `compose up` on the same stack at once.
//   - idempotent: a stack whose container is already running is skipped, so a healthy re-run costs
//     only 3× `docker inspect`.
//   - non-fatal: per-stack failures are collected into one error that the caller is expected to
//     LOG; it must never crash the controller.
//
// cloudflared is deployed only when a tunnel token is configured (LAN-only nodes legitimately run
// without it — see the dynamic protected set in monitor.EffectiveProtected).
//
// Deploy order is load-bearing: traefik-public network → traefik → cloudflared → filebrowser. The
// composes declare the network `external: true`, so it has to exist before any of them come up.
func (m *Manager) EnsureBaseStack() error {
	if acquired := m.infraMu.TryLock(); !acquired {
		m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation")
		return nil
	}
	defer m.infraMu.Unlock()

	// The only hard stop: without the network, every stack `up` fails.
	if err := m.ensureTraefikNetwork(); err != nil {
		return fmt.Errorf("base-infra: %w", err)
	}

	var (
		stacksRoot = m.cfg.Paths.StacksDir
		traefikDir = filepath.Join(stacksRoot, "traefik")
		failures   []string
	)

	err := m.ensureTraefik(traefikDir)
	if err != nil {
		failures = append(failures, fmt.Sprintf("traefik: %v", err))
	}

	// Wire the controller's OWN dashboard route into traefik. filebrowser self-registers via
	// Docker labels + network membership baked into its compose, but the controller cannot: the
	// golden bootstrap starts it before traefik-public exists, and the v2 bootstrap carries no
	// domain. So it is done here, post-pull, where the domain is known: drop a file-provider route
	// and join the controller to traefik-public so traefik can resolve felhom-controller:8080.
	err = m.wireController(traefikDir)
	if err != nil {
		failures = append(failures, fmt.Sprintf("controller-route: %v", err))
	}

	if m.cfg.Infrastructure.CFTunnelToken == "" {
		m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)")
	} else {
		err = m.ensureCloudflared(filepath.Join(stacksRoot, "cloudflared"))
		if err != nil {
			failures = append(failures, fmt.Sprintf("cloudflared: %v", err))
		}
	}

	err = m.ensureFileBrowser(filepath.Join(stacksRoot, "filebrowser"))
	if err != nil {
		failures = append(failures, fmt.Sprintf("filebrowser: %v", err))
	}

	if len(failures) == 0 {
		return nil
	}
	return fmt.Errorf("base-infra bring-up: %s", strings.Join(failures, "; "))
}

// ensureTraefik deploys the traefik stack into dir unless a container named "traefik" is already
// running. It prepares the dynamic/ and certs/ directories and the acme.json file, writes the
// rendered stack files, and finishes with `compose up`.
func (m *Manager) ensureTraefik(dir string) error {
	if containerRunning("traefik") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir)

	dynamicDir := filepath.Join(dir, "dynamic")
	if err := os.MkdirAll(dynamicDir, 0o755); err != nil {
		return fmt.Errorf("mkdir dynamic: %w", err)
	}
	certsDir := filepath.Join(dir, "certs")
	if err := os.MkdirAll(certsDir, 0o755); err != nil {
		return fmt.Errorf("mkdir certs: %w", err)
	}

	// acme.json must exist as a 0600 file before traefik starts (it writes issued certs into it).
	acmePath := filepath.Join(dir, "acme.json")
	_, statErr := os.Stat(acmePath)
	if os.IsNotExist(statErr) {
		if err := os.WriteFile(acmePath, []byte{}, 0o600); err != nil {
			return fmt.Errorf("create acme.json: %w", err)
		}
	}
	// Applied unconditionally, so a pre-existing file is also forced to 0600.
	if err := os.Chmod(acmePath, 0o600); err != nil {
		return fmt.Errorf("chmod acme.json: %w", err)
	}

	data := infra.TraefikData{
		ACMEEmail:  m.cfg.Customer.Email,
		CFAPIToken: m.cfg.Infrastructure.CFAPIToken,
	}
	rendered, err := infra.RenderTraefik(data)
	if err != nil {
		return err
	}
	if err := writeInfraFiles(dir, rendered); err != nil {
		return err
	}
	return m.composeUp(dir)
}

// ensureCloudflared deploys the cloudflared stack into dir unless a container named "cloudflared"
// is already running. The configured tunnel token is handed to the renderer.
func (m *Manager) ensureCloudflared(dir string) error {
	if containerRunning("cloudflared") {
		return nil
	}
	m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir)

	data := infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken}
	rendered, err := infra.RenderCloudflared(data)
	if err != nil {
		return err
	}
	if err := writeInfraFiles(dir, rendered); err != nil {
		return err
	}
	return m.composeUp(dir)
}

// ensureFileBrowser brings up the filebrowser stack in dir unless a container named "filebrowser"
// is already running. An existing docker-compose.yml is started as-is; only a missing one triggers
// the initial render.
func (m *Manager) ensureFileBrowser(dir string) error {
	if containerRunning("filebrowser") {
		return nil
	}

	composeFile := filepath.Join(dir, "docker-compose.yml")
	if _, statErr := os.Stat(composeFile); statErr == nil {
		// Already provisioned but not running — bring it up WITHOUT regenerating, so the storage
		// mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`.
		m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating")
		return m.composeUp(dir)
	}

	m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir)
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("mkdir: %w", err)
	}

	// Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first
	// storage-path change and owns all later regeneration).
	composeBody := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil)
	configBody := infra.RenderFileBrowserConfig(nil)

	if err := os.WriteFile(composeFile, []byte(composeBody), 0o644); err != nil {
		return fmt.Errorf("write docker-compose.yml: %w", err)
	}
	configFile := filepath.Join(dir, "config.yaml")
	if err := os.WriteFile(configFile, []byte(configBody), 0o644); err != nil {
		return fmt.Errorf("write config.yaml: %w", err)
	}
	return m.composeUp(dir)
}

// controllerContainer is the fixed name of the in-guest controller container (set by the golden
// bootstrap `docker run --name`). traefik resolves it by this name once both share traefik-public.
const controllerContainer = "felhom-controller"

// wireController makes the controller dashboard reachable through traefik. It writes the
// file-provider route (Host(felhom.<domain>) → http://felhom-controller:8080) and connects the
// controller container to traefik-public.
//
// Both steps are idempotent: the route file is rewritten only when its content differs (so the
// traefik file watcher doesn't reload every health tick), and the network connect is skipped when
// the controller is already attached. The domain comes from the hub pull; a missing domain is a
// logged no-op rather than an error.
func (m *Manager) wireController(traefikDir string) error {
	domain := m.cfg.Customer.Domain
	if domain == "" {
		m.logger.Printf("[WARN] [infra] controller route skipped — no customer domain configured")
		return nil
	}

	dynamicDir := filepath.Join(traefikDir, "dynamic")
	if err := os.MkdirAll(dynamicDir, 0o755); err != nil {
		return fmt.Errorf("mkdir dynamic: %w", err)
	}
	routeFile := filepath.Join(dynamicDir, "controller.yml")

	// DNS-01 ACME configured (CF token + email) → this route anchors wildcard proactive issuance.
	haveToken := m.cfg.Infrastructure.CFAPIToken != ""
	haveEmail := m.cfg.Customer.Email != ""
	desired := infra.RenderControllerRoute(domain, haveToken && haveEmail)

	// An unreadable (e.g. not yet created) route file is treated the same as stale content.
	existing, readErr := os.ReadFile(routeFile)
	if readErr != nil || string(existing) != desired {
		if err := os.WriteFile(routeFile, []byte(desired), 0o644); err != nil {
			return fmt.Errorf("write controller route: %w", err)
		}
		m.logger.Printf("[INFO] [infra] wrote controller route → %s (Host felhom.%s → felhom-controller:8080)", routeFile, domain)
	}

	if containerOnNetwork(controllerContainer, traefikNetwork) {
		return nil
	}
	out, err := exec.Command("docker", "network", "connect", traefikNetwork, controllerContainer).CombinedOutput()
	// An "already exists" reply from docker is tolerated rather than reported as a failure.
	if err != nil && !strings.Contains(string(out), "already exists") {
		return fmt.Errorf("network connect %s: %s: %w", controllerContainer, strings.TrimSpace(string(out)), err)
	}
	m.logger.Printf("[INFO] [infra] connected %s to %s", controllerContainer, traefikNetwork)
	return nil
}

// containerOnNetwork reports whether the named container is attached to the given docker network.
// It lists the container's network names and matches exactly — NOT `{{index .Networks "name"}}`,
// which prints a non-empty placeholder for an absent key (presumably "<no value>" — verify) and
// would therefore falsely read as "already attached". Any inspect failure counts as not attached.
func containerOnNetwork(name, network string) bool {
	const listNetworks = "{{range $k, $_ := .NetworkSettings.Networks}}{{$k}}\n{{end}}"

	raw, err := exec.Command("docker", "inspect", "--format", listNetworks, name).Output()
	if err != nil {
		return false
	}
	for _, attached := range strings.Fields(string(raw)) {
		if attached == network {
			return true
		}
	}
	return false
}

// ensureTraefikNetwork creates the external traefik-public docker network if it is absent. It is
// idempotent and tolerates a create/inspect race. It calls the docker CLI directly because this is
// a network operation, not a compose one.
func (m *Manager) ensureTraefikNetwork() error { if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil { return nil } m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork) out, err := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput() if err != nil { // Tolerate a race where another actor created it between our inspect and create. if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil { return nil } return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(out)), err) } return nil } // composeUp runs `docker compose up -d` in dir (DOMAIN injected by composeExecWithEnv). func (m *Manager) composeUp(dir string) error { out, err := m.composeExecWithEnv(dir, nil, "up", "-d") if err != nil { return fmt.Errorf("compose up: %s: %w", truncateStr(strings.TrimSpace(out), 300), err) } return nil } // writeInfraFiles writes each rendered file at its required mode (enforced via Chmod so an existing // file — e.g. a re-rendered .env — keeps 0600 even though WriteFile only honors mode on create). func writeInfraFiles(dir string, files map[string]infra.FileSpec) error { if err := os.MkdirAll(dir, 0o755); err != nil { return fmt.Errorf("mkdir: %w", err) } for name, spec := range files { p := filepath.Join(dir, name) if err := os.WriteFile(p, []byte(spec.Content), os.FileMode(spec.Mode)); err != nil { return fmt.Errorf("write %s: %w", name, err) } if err := os.Chmod(p, os.FileMode(spec.Mode)); err != nil { return fmt.Errorf("chmod %s: %w", name, err) } } return nil } // containerRunning reports whether a container with the given name is currently running. It asks the // daemon directly (works before the stack dir exists), mirroring monitor.checkProtectedContainers. 
func containerRunning(name string) bool {
	// Ask docker for just the State.Running flag of the named object.
	inspect := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
	raw, err := inspect.Output()

	// Any inspect failure is reported as "not running".
	return err == nil && strings.TrimSpace(string(raw)) == "true"
}