diff --git a/CHANGELOG.md b/CHANGELOG.md index 30ac1a8..0253a18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,41 @@ ## Changelog +### v0.41.0 — first-boot base-infrastructure bring-up + self-heal (+ Section-G mount fix) (2026-06-11) + +Lockstep with `felhom-agent` v0.20.0 + a golden rebake. A freshly-onboarded controller came up ONLINE +but **Health = FAIL: protected containers not running — traefik, cloudflared, filebrowser**: nothing +ever deployed the base stack on a Proxmox bootstrap (it was only ever created by the bare-metal +`scripts/docker-setup.sh`), and the health loop only *detected* the gap. This release makes the +controller stand up its own base infrastructure. + +- **New `internal/infra` package** — pure renderers (`//go:embed` templates lifted verbatim from + `scripts/docker-setup.sh`) for traefik (`traefik.yml` + compose + a 0600 `.env` carrying the CF DNS + token only when set), cloudflared (compose; `TUNNEL_TOKEN`), and filebrowser (compose + `config.yaml`). + **Image tags are PINNED here as the single source of truth** — `traefik:v3.6.7`, + `cloudflare/cloudflared:2026.6.0`, `gtstef/filebrowser:1.3.3-stable` (no `:latest`). The web + FileBrowser sync path now **delegates** to `infra` so the pins can never diverge. +- **`stacks.Manager.EnsureBaseStack`** (`internal/stacks/infra.go`) — creates the `traefik-public` + network, then deploys traefik → cloudflared → filebrowser under `${stacks_dir}/`. **Single-flight** + (TryLock — it's fired from both first-boot and every health tick), **idempotent** (skips a stack whose + container is already running), **non-fatal** (logs, never crashes). cloudflared is deployed only when a + tunnel token is configured; filebrowser is not overwritten if its compose already exists (preserves the + storage mounts the web sync path manages). 
+- **Triggers** (`cmd/controller/main.go`): first-boot bring-up after stack init (goroutine, non-fatal); + self-heal calls `EnsureBaseStack` unconditionally on every `system-health` tick (decoupled from the + issue strings — safe because of the single-flight + idempotency). +- **Dynamic protected set** (`monitor.EffectiveProtected`): cloudflared counts as a protected container + only when a tunnel token is configured, so a LAN-only node doesn't report FAIL forever for a stack it + intentionally skips. Detection and the bring-up condition agree. +- **Section-G fix (in `felhom-agent` build-golden.sh):** the controller writes compose stacks under + `/opt/docker/stacks` inside its container, but the bootstrap `docker run` never bind-mounted that path, + so the guest daemon resolved every relative bind source on the guest filesystem (empty dirs) — breaking + **all** bind-mounted stacks (base infra + customer apps). Fixed with a same-path host bind + (`-v /opt/docker/stacks:/opt/docker/stacks`). Empirically confirmed on guest 9201 (probe printed + `cat: read error: Is a directory` before, `hello-from-controller` after). +- Tests: non-hollow `infra` render tests (customer params present, no `:latest` survives, both ACME/CF + branches render, `.env` 0600, rendered YAML parses), `EnsureBaseStack` single-flight, and + `EffectiveProtected`. + ### v0.40.0 — bootstrap pull+merge onboarding (controller pulls its config from the hub) (2026-06-11) Lockstep with `felhom-agent` v0.19.0. 
Fixes the onboarding 401: a freshly provisioned guest used to diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index 3b7c2f4..6ef69e6 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -153,6 +153,17 @@ func main() { // Migrate existing plaintext passwords to encrypted stackMgr.MigrateEncryption() + // --- First-boot base-infrastructure bring-up --- + // We are guaranteed configured here (setup.NeedsSetup returned false above), so deploy the base + // stack (traefik-public network → traefik → cloudflared → filebrowser) the controller needs for + // routing + external access. Runs in a goroutine so a slow first-boot image pull never delays the + // web server; non-fatal (idempotent + single-flight, the health loop re-attempts each tick). + go func() { + if err := stackMgr.EnsureBaseStack(); err != nil { + logger.Printf("[WARN] [infra] first-boot base-stack bring-up: %v", err) + } + }() + // --- Initialize catalog syncer --- syncer := catalogsync.New(cfg, logger, stackMgr.ScanStacks, func(updated []string) { stackMgr.InjectMissingFields(updated) @@ -258,6 +269,15 @@ func main() { } sched.Every("system-health", healthInterval, func(ctx context.Context) error { healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger) + // Self-heal the base stack: call unconditionally every tick. EnsureBaseStack is single-flight + // + idempotent (skips running stacks ⇒ a cheap 3× docker-inspect no-op when healthy), so there + // is no need to couple to the health-report issue strings. Runs in a goroutine — never blocks + // or fails the health job. 
+ go func() { + if err := stackMgr.EnsureBaseStack(); err != nil { + logger.Printf("[WARN] [infra] self-heal base-stack bring-up: %v", err) + } + }() // Refresh dashboard alerts from health report updateAvailable := false latestVersion := "" diff --git a/controller/internal/infra/infra.go b/controller/internal/infra/infra.go new file mode 100644 index 0000000..eb1b535 --- /dev/null +++ b/controller/internal/infra/infra.go @@ -0,0 +1,221 @@ +// Package infra renders the base-infrastructure stacks (traefik, cloudflared, filebrowser) from the +// controller's config. It is PURE: templates in, file contents out — no docker, no filesystem, no IO. +// The orchestration (write the files, create the network, compose-up) lives in +// internal/stacks/infra.go (EnsureBaseStack), which owns the side effects. +// +// The templates are lifted verbatim from scripts/docker-setup.sh (the bare-metal installer, the +// historical source of truth for these stacks); bash `${VAR}` became Go template `{{.Field}}` and the +// heredoc conditionals became `{{if}}`. Image tags are PINNED here as the single source of truth — the +// web FileBrowser sync path (internal/web/handlers.go) delegates here so the pins can never diverge. +package infra + +import ( + "embed" + "fmt" + "path/filepath" + "strings" + "text/template" + + "gitea.dooplex.hu/admin/felhom-controller/internal/settings" +) + +// Pinned image tags — NEVER ":latest" (a floating tag breaks reproducible golden bakes and lets the +// deployed version drift). Verified to resolve on Docker Hub before baking. +const ( + TraefikImage = "traefik:v3.6.7" + CloudflaredImage = "cloudflare/cloudflared:2026.6.0" + FileBrowserImage = "gtstef/filebrowser:1.3.3-stable" +) + +//go:embed templates/*.tmpl +var templateFS embed.FS + +var tmpl = template.Must(template.New("infra").ParseFS(templateFS, "templates/*.tmpl")) + +// FileSpec is one rendered file: its content and the mode it must be written with. 
The mode matters — +// the traefik .env carries the Cloudflare API token (0600), the rest are world-readable config (0644). +type FileSpec struct { + Content string + Mode uint32 // os.FileMode bits (e.g. 0o600); uint32 keeps this package IO-free +} + +// TraefikData is the per-customer input for the traefik stack. ACMEEmail empty → no Let's Encrypt +// (traefik serves self-signed); CFAPIToken empty → HTTP-01 instead of Cloudflare DNS-01, and no .env. +type TraefikData struct { + ACMEEmail string + CFAPIToken string +} + +type traefikTmpl struct { + TraefikData + Image string +} + +// CloudflaredData is the per-customer input for the cloudflared stack (just the tunnel token). +type CloudflaredData struct { + CFTunnelToken string +} + +type cloudflaredTmpl struct { + CloudflaredData + Image string +} + +func render(name string, data any) (string, error) { + var b strings.Builder + if err := tmpl.ExecuteTemplate(&b, name, data); err != nil { + return "", fmt.Errorf("render %s: %w", name, err) + } + return b.String(), nil +} + +// RenderTraefik returns the traefik stack files: traefik.yml (static config), docker-compose.yml, and +// — only when a Cloudflare API token is set — a 0600 .env carrying CF_DNS_API_TOKEN (kept out of the +// compose file). The orchestrator additionally creates dynamic/, certs/ and an empty 0600 acme.json. 
+func RenderTraefik(d TraefikData) (map[string]FileSpec, error) { + td := traefikTmpl{TraefikData: d, Image: TraefikImage} + yml, err := render("traefik.yml.tmpl", td) + if err != nil { + return nil, err + } + compose, err := render("traefik-compose.yml.tmpl", td) + if err != nil { + return nil, err + } + files := map[string]FileSpec{ + "traefik.yml": {Content: yml, Mode: 0o644}, + "docker-compose.yml": {Content: compose, Mode: 0o644}, + } + if d.CFAPIToken != "" { + env := fmt.Sprintf("# Cloudflare API token for Let's Encrypt DNS-01 challenge (Zone:DNS:Edit).\n"+ + "# Managed by felhom-controller — do not edit.\nCF_DNS_API_TOKEN=%s\n", d.CFAPIToken) + files[".env"] = FileSpec{Content: env, Mode: 0o600} + } + return files, nil +} + +// RenderCloudflared returns the cloudflared stack files (compose only — no bind mounts; the tunnel +// token is the entire config). Caller deploys this only when a tunnel token is configured. +func RenderCloudflared(d CloudflaredData) (map[string]FileSpec, error) { + cd := cloudflaredTmpl{CloudflaredData: d, Image: CloudflaredImage} + compose, err := render("cloudflared-compose.yml.tmpl", cd) + if err != nil { + return nil, err + } + return map[string]FileSpec{ + "docker-compose.yml": {Content: compose, Mode: 0o644}, + }, nil +} + +// RenderFileBrowserCompose returns FileBrowser's docker-compose.yml for the given domain and storage +// volume-mount lines. Ported verbatim from internal/web/handlers.go (the single source of truth now +// lives here so the pinned image can't diverge between bring-up and the web storage-sync path). +func RenderFileBrowserCompose(domain string, storageMounts []string) string { + storageSection := "" + if len(storageMounts) > 0 { + storageSection = "\n # Storage paths (auto-generated by felhom-controller)\n" + + strings.Join(storageMounts, "\n") + } + + return fmt.Sprintf(`# FileBrowser Quantum — Infrastructure file manager +# Domain: files.%s +# Managed by felhom-controller. 
WARNING: Volume mounts are auto-generated; manual edits are overwritten. + +services: + filebrowser: + image: %s + container_name: filebrowser + restart: unless-stopped + environment: + - TZ=Europe/Budapest + - FILEBROWSER_CONFIG=/home/filebrowser/config.yaml + volumes: + - filebrowser_data:/home/filebrowser/data + - ./config.yaml:/home/filebrowser/config.yaml:ro%s + networks: + - traefik-public + deploy: + resources: + limits: + memory: 256M + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:80/"] + interval: 30s + timeout: 5s + retries: 3 + start_period: 15s + labels: + - "traefik.enable=true" + - "traefik.http.routers.filebrowser.rule=Host(`+"`"+`files.%s`+"`"+`)" + - "traefik.http.routers.filebrowser.entrypoints=websecure" + - "traefik.http.routers.filebrowser.tls=true" + - "traefik.http.services.filebrowser.loadbalancer.server.port=80" + - "traefik.docker.network=traefik-public" + +volumes: + filebrowser_data: + +networks: + traefik-public: + external: true +`, domain, FileBrowserImage, storageSection, domain) +} + +// RenderFileBrowserConfig returns a FileBrowser Quantum config.yaml with one source per registered +// storage path (each a named sidebar entry). Empty paths → a single default /srv source. Ported +// verbatim from internal/web/handlers.go. +func RenderFileBrowserConfig(paths []settings.StoragePath) string { + var sources string + if len(paths) == 0 { + sources = ` - path: "/srv" +` + } else { + for _, sp := range paths { + mountName := filepath.Base(sp.Path) + label := sp.Label + if label == "" { + label = mountName + } + sources += fmt.Sprintf(" - path: \"/srv/%s\"\n name: %q\n config:\n defaultEnabled: true\n", mountName, label) + } + } + + return fmt.Sprintf(`# FileBrowser Quantum — managed by felhom-controller +# WARNING: This file is auto-generated. Manual edits will be overwritten. 
+ +server: + port: 80 + baseURL: "/" + database: "/home/filebrowser/data/database.db" + logging: + - levels: "info|warning|error" + sources: +%suserDefaults: + stickySidebar: true + darkMode: true + viewMode: "normal" + showHidden: false + dateFormat: false + gallerySize: 3 + themeColor: "var(--blue)" + preview: + disableHideSidebar: false + highQuality: true + image: true + video: true + motionVideoPreview: true + office: true + popup: true + autoplayMedia: true + folder: true + permissions: + api: false + admin: false + modify: false + share: false + realtime: false + delete: false + create: false + download: true +`, sources) +} diff --git a/controller/internal/infra/infra_test.go b/controller/internal/infra/infra_test.go new file mode 100644 index 0000000..cdd2c79 --- /dev/null +++ b/controller/internal/infra/infra_test.go @@ -0,0 +1,191 @@ +package infra + +import ( + "strings" + "testing" + + "gitea.dooplex.hu/admin/felhom-controller/internal/settings" + "gopkg.in/yaml.v3" +) + +// TestRenderedYAMLParses guards against template-whitespace indentation bugs: every rendered +// compose/config/static-config must be well-formed YAML across the token matrix. +func TestRenderedYAMLParses(t *testing.T) { + for i, s := range allRendered(t) { + var v any + if err := yaml.Unmarshal([]byte(s), &v); err != nil { + t.Fatalf("rendered output #%d is not valid YAML: %v\n---\n%s", i, err, s) + } + } +} + +// allComposeStrings renders every compose/config we emit, across the token/token-less matrix, so a +// single ":latest" anywhere is caught. 
+func allRendered(t *testing.T) []string { + t.Helper() + var out []string + for _, td := range []TraefikData{ + {ACMEEmail: "admin@example.com", CFAPIToken: "cf-api-tok"}, + {ACMEEmail: "admin@example.com"}, // email, no CF token → HTTP-01 + {}, // token-less / LAN-only + } { + files, err := RenderTraefik(td) + if err != nil { + t.Fatalf("RenderTraefik(%+v): %v", td, err) + } + for _, f := range files { + out = append(out, f.Content) + } + } + cf, err := RenderCloudflared(CloudflaredData{CFTunnelToken: "tunnel-tok"}) + if err != nil { + t.Fatalf("RenderCloudflared: %v", err) + } + for _, f := range cf { + out = append(out, f.Content) + } + out = append(out, RenderFileBrowserCompose("example.com", nil)) + out = append(out, RenderFileBrowserCompose("example.com", []string{" - /mnt/hdd_1:/srv/hdd_1"})) + out = append(out, RenderFileBrowserConfig(nil)) + return out +} + +func TestNoLatestTagSurvives(t *testing.T) { + for _, c := range []string{TraefikImage, CloudflaredImage, FileBrowserImage} { + if strings.HasSuffix(c, ":latest") || !strings.Contains(c, ":") { + t.Fatalf("image constant is not pinned: %q", c) + } + } + for _, s := range allRendered(t) { + if strings.Contains(s, ":latest") { + t.Fatalf(":latest survived in rendered output:\n%s", s) + } + } +} + +func TestTraefikWithCloudflareToken(t *testing.T) { + files, err := RenderTraefik(TraefikData{ACMEEmail: "admin@example.com", CFAPIToken: "cf-api-tok"}) + if err != nil { + t.Fatal(err) + } + yml := files["traefik.yml"].Content + if !strings.Contains(yml, "certResolver: letsencrypt") { + t.Error("expected certResolver on websecure when ACME email set") + } + if !strings.Contains(yml, "dnsChallenge") || !strings.Contains(yml, "provider: cloudflare") { + t.Error("expected Cloudflare DNS-01 challenge when CF API token set") + } + if strings.Contains(yml, "httpChallenge") { + t.Error("HTTP-01 must NOT appear when a CF API token is set") + } + if !strings.Contains(yml, "email: admin@example.com") { + t.Error("ACME 
email must appear in the cert-resolver block") + } + + compose := files["docker-compose.yml"].Content + if !strings.Contains(compose, "env_file") { + t.Error("expected env_file in traefik compose when CF API token set") + } + if !strings.Contains(compose, TraefikImage) { + t.Errorf("expected pinned traefik image %q in compose", TraefikImage) + } + + env, ok := files[".env"] + if !ok { + t.Fatal("expected a .env file when CF API token is set") + } + if !strings.Contains(env.Content, "CF_DNS_API_TOKEN=cf-api-tok") { + t.Errorf("CF API token not wired into .env: %q", env.Content) + } + if env.Mode != 0o600 { + t.Errorf(".env must be 0600 (carries the CF token), got %o", env.Mode) + } + if files["traefik.yml"].Mode != 0o644 || files["docker-compose.yml"].Mode != 0o644 { + t.Error("traefik.yml and docker-compose.yml must be 0644") + } +} + +func TestTraefikEmailNoCloudflareToken(t *testing.T) { + files, err := RenderTraefik(TraefikData{ACMEEmail: "admin@example.com"}) + if err != nil { + t.Fatal(err) + } + yml := files["traefik.yml"].Content + if !strings.Contains(yml, "httpChallenge") { + t.Error("expected HTTP-01 challenge when email set but no CF token") + } + if strings.Contains(yml, "dnsChallenge") { + t.Error("DNS-01 must NOT appear without a CF token") + } + if _, ok := files[".env"]; ok { + t.Error("no .env should be emitted without a CF API token") + } +} + +func TestTraefikTokenless(t *testing.T) { + files, err := RenderTraefik(TraefikData{}) + if err != nil { + t.Fatal(err) + } + yml := files["traefik.yml"].Content + if strings.Contains(yml, "certificatesResolvers") || strings.Contains(yml, "certResolver") { + t.Error("token-less node must emit no cert resolver (traefik serves self-signed)") + } + compose := files["docker-compose.yml"].Content + if strings.Contains(compose, "env_file") { + t.Error("token-less compose must not reference env_file") + } + if _, ok := files[".env"]; ok { + t.Error("token-less node must emit no .env") + } + // Structural 
difference vs the with-token case is the whole point: the resolver section is absent. + withTok, _ := RenderTraefik(TraefikData{ACMEEmail: "admin@example.com", CFAPIToken: "x"}) + if withTok["traefik.yml"].Content == yml { + t.Error("token-less and with-token traefik.yml must differ structurally") + } +} + +func TestCloudflaredRender(t *testing.T) { + files, err := RenderCloudflared(CloudflaredData{CFTunnelToken: "tunnel-tok-123"}) + if err != nil { + t.Fatal(err) + } + compose := files["docker-compose.yml"].Content + if !strings.Contains(compose, "TUNNEL_TOKEN=tunnel-tok-123") { + t.Errorf("tunnel token not wired into cloudflared env: %q", compose) + } + if !strings.Contains(compose, CloudflaredImage) { + t.Errorf("expected pinned cloudflared image %q", CloudflaredImage) + } + if !strings.Contains(compose, "command: tunnel run") { + t.Error("expected `command: tunnel run`") + } +} + +func TestFileBrowserRender(t *testing.T) { + compose := RenderFileBrowserCompose("demo-felhom.eu", nil) + if !strings.Contains(compose, "Host(`files.demo-felhom.eu`)") { + t.Errorf("domain not wired into filebrowser routing label: %q", compose) + } + if !strings.Contains(compose, FileBrowserImage) { + t.Errorf("expected pinned filebrowser image %q", FileBrowserImage) + } + + // Default config (no storage paths) → a single /srv source. + def := RenderFileBrowserConfig(nil) + if !strings.Contains(def, `- path: "/srv"`) { + t.Errorf("empty config must default to a /srv source: %q", def) + } + + // With paths → a named per-drive source. + withPaths := RenderFileBrowserConfig([]settings.StoragePath{{Path: "/mnt/hdd_1", Label: "Media"}}) + if !strings.Contains(withPaths, `- path: "/srv/hdd_1"`) || !strings.Contains(withPaths, `name: "Media"`) { + t.Errorf("storage path not wired into filebrowser config: %q", withPaths) + } + + // Storage mounts wire into the compose volumes section. 
+ withMounts := RenderFileBrowserCompose("demo-felhom.eu", []string{" - /mnt/hdd_1:/srv/hdd_1"}) + if !strings.Contains(withMounts, "/mnt/hdd_1:/srv/hdd_1") { + t.Errorf("storage mount not wired into filebrowser compose: %q", withMounts) + } +} diff --git a/controller/internal/infra/templates/cloudflared-compose.yml.tmpl b/controller/internal/infra/templates/cloudflared-compose.yml.tmpl new file mode 100644 index 0000000..a289659 --- /dev/null +++ b/controller/internal/infra/templates/cloudflared-compose.yml.tmpl @@ -0,0 +1,22 @@ +# Cloudflare Tunnel — external access connector — managed by felhom-controller (base-infra bring-up). +# Routes are configured in the Cloudflare dashboard (Zero Trust > Networks > Tunnels > Public Hostname); +# the tunnel connects Cloudflare's edge to Traefik, which handles TLS + routing internally. +services: + cloudflared: + image: {{.Image}} + container_name: cloudflared + restart: unless-stopped + command: tunnel run + environment: + - TUNNEL_TOKEN={{.CFTunnelToken}} + dns: + - 1.1.1.1 + - 8.8.8.8 + security_opt: + - no-new-privileges:true + networks: + - traefik-public + +networks: + traefik-public: + external: true diff --git a/controller/internal/infra/templates/traefik-compose.yml.tmpl b/controller/internal/infra/templates/traefik-compose.yml.tmpl new file mode 100644 index 0000000..0509993 --- /dev/null +++ b/controller/internal/infra/templates/traefik-compose.yml.tmpl @@ -0,0 +1,30 @@ +# Traefik Reverse Proxy — managed by felhom-controller (base-infra bring-up). 
+services: + traefik: + image: {{.Image}} + container_name: traefik + restart: unless-stopped + dns: + - 1.1.1.1 + - 8.8.8.8 + security_opt: + - no-new-privileges:true + ports: + - "80:80" + - "443:443" +{{- if .CFAPIToken}} + env_file: + - .env +{{- end}} + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + - ./traefik.yml:/etc/traefik/traefik.yml:ro + - ./dynamic:/etc/traefik/dynamic:ro + - ./acme.json:/etc/traefik/acme.json + - ./certs:/etc/traefik/certs:ro + networks: + - traefik-public + +networks: + traefik-public: + external: true diff --git a/controller/internal/infra/templates/traefik.yml.tmpl b/controller/internal/infra/templates/traefik.yml.tmpl new file mode 100644 index 0000000..fd00199 --- /dev/null +++ b/controller/internal/infra/templates/traefik.yml.tmpl @@ -0,0 +1,54 @@ +# Traefik Static Configuration +# Generated by felhom-controller (base-infra bring-up). Do not edit — regenerated on bring-up. + +api: + dashboard: true + insecure: false + +entryPoints: + web: + address: ":80" + http: + redirections: + entryPoint: + to: websecure + scheme: https + websecure: + address: ":443" +{{- if .ACMEEmail}} + http: + tls: + certResolver: letsencrypt +{{- end}} + +providers: + docker: + endpoint: "unix:///var/run/docker.sock" + exposedByDefault: false + network: traefik-public + file: + directory: /etc/traefik/dynamic + watch: true + +log: + level: INFO + +accessLog: {} +{{- if .ACMEEmail}} + +certificatesResolvers: + letsencrypt: + acme: + email: {{.ACMEEmail}} + storage: /etc/traefik/acme.json +{{- if .CFAPIToken}} + dnsChallenge: + provider: cloudflare + resolvers: + - "1.1.1.1:53" + - "8.8.8.8:53" +{{- else}} + httpChallenge: + entryPoint: web +{{- end}} +{{- end}} diff --git a/controller/internal/monitor/effective_protected_test.go b/controller/internal/monitor/effective_protected_test.go new file mode 100644 index 0000000..b0788e2 --- /dev/null +++ b/controller/internal/monitor/effective_protected_test.go @@ -0,0 +1,39 @@ +package monitor + 
+import ( + "testing" + + "gitea.dooplex.hu/admin/felhom-controller/internal/config" +) + +func contains(ss []string, want string) bool { + for _, s := range ss { + if s == want { + return true + } + } + return false +} + +// EffectiveProtected must drop cloudflared when no tunnel token is configured (LAN-only node), so the +// health loop doesn't report it missing forever — but keep it when a token IS configured. +func TestEffectiveProtectedDropsCloudflaredWithoutToken(t *testing.T) { + base := config.StacksConfig{Protected: []string{"traefik", "cloudflared", "felhom-controller", "filebrowser"}} + + cfgNoTok := &config.Config{Stacks: base} + got := EffectiveProtected(cfgNoTok) + if contains(got, "cloudflared") { + t.Errorf("cloudflared must be dropped when no tunnel token: %v", got) + } + for _, must := range []string{"traefik", "felhom-controller", "filebrowser"} { + if !contains(got, must) { + t.Errorf("%s must remain protected: %v", must, got) + } + } + + cfgTok := &config.Config{Stacks: base} + cfgTok.Infrastructure.CFTunnelToken = "tok" + if !contains(EffectiveProtected(cfgTok), "cloudflared") { + t.Error("cloudflared must remain protected when a tunnel token is configured") + } +} diff --git a/controller/internal/monitor/healthcheck.go b/controller/internal/monitor/healthcheck.go index ea3e136..eeaba76 100644 --- a/controller/internal/monitor/healthcheck.go +++ b/controller/internal/monitor/healthcheck.go @@ -152,11 +152,13 @@ func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, stora } } - // 6. Protected containers + // 6. Protected containers (effective set: cloudflared only counts when a tunnel token is + // configured, so a LAN-only node doesn't report FAIL forever for a stack it intentionally skips). 
+ protected := EffectiveProtected(cfg) if debug { - logger.Printf("[DEBUG] [monitor] Checking %d protected containers: %v", len(cfg.Stacks.Protected), cfg.Stacks.Protected) + logger.Printf("[DEBUG] [monitor] Checking %d protected containers: %v", len(protected), protected) } - missingProtected := checkProtectedContainers(cfg.Stacks.Protected) + missingProtected := checkProtectedContainers(protected) for _, name := range missingProtected { report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name)) } @@ -237,6 +239,22 @@ func checkDocker() error { return nil } +// EffectiveProtected returns the protected-container set that actually applies to this node. It is +// the configured cfg.Stacks.Protected minus stacks that are intentionally not deployed here: +// cloudflared is dropped when no tunnel token is configured (a LAN-only node legitimately runs +// without it, so it must not be reported as a missing protected container forever). The bring-up +// (stacks.EnsureBaseStack) applies the same cloudflared condition, so detection and deployment agree. 
+func EffectiveProtected(cfg *config.Config) []string { + out := make([]string, 0, len(cfg.Stacks.Protected)) + for _, name := range cfg.Stacks.Protected { + if name == "cloudflared" && cfg.Infrastructure.CFTunnelToken == "" { + continue + } + out = append(out, name) + } + return out +} + func checkProtectedContainers(protected []string) []string { var missing []string for _, name := range protected { diff --git a/controller/internal/stacks/infra.go b/controller/internal/stacks/infra.go new file mode 100644 index 0000000..493d570 --- /dev/null +++ b/controller/internal/stacks/infra.go @@ -0,0 +1,193 @@ +package stacks + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "gitea.dooplex.hu/admin/felhom-controller/internal/infra" +) + +const traefikNetwork = "traefik-public" + +// EnsureBaseStack renders and deploys the base-infrastructure stacks (traefik, cloudflared, +// filebrowser) the controller needs for routing + external access. It is: +// - single-flight: fired both at first boot and on every health tick; a TryLock ensures two +// invocations never race on the same stack dir / run concurrent `compose up` on the same stack. +// - idempotent: each stack is skipped when its container is already running, so the healthy-state +// re-run is a cheap 3× `docker inspect` no-op. +// - non-fatal by contract: returns a joined error for the caller to LOG; it must never crash the +// controller. cloudflared is only deployed when a tunnel token is configured (LAN-only nodes +// legitimately run without it — see the dynamic protected set in monitor.EffectiveProtected). +// +// Deploy order is load-bearing: traefik-public network → traefik → cloudflared → filebrowser (the +// composes declare the network `external: true`, so it must exist first). 
+func (m *Manager) EnsureBaseStack() error { + if !m.infraMu.TryLock() { + m.logger.Printf("[INFO] [infra] EnsureBaseStack already in progress — skipping this invocation") + return nil + } + defer m.infraMu.Unlock() + + if err := m.ensureTraefikNetwork(); err != nil { + return fmt.Errorf("base-infra: %w", err) // without the network, every stack `up` fails + } + + base := m.cfg.Paths.StacksDir + var errs []string + + if err := m.ensureTraefik(filepath.Join(base, "traefik")); err != nil { + errs = append(errs, fmt.Sprintf("traefik: %v", err)) + } + + if m.cfg.Infrastructure.CFTunnelToken != "" { + if err := m.ensureCloudflared(filepath.Join(base, "cloudflared")); err != nil { + errs = append(errs, fmt.Sprintf("cloudflared: %v", err)) + } + } else { + m.logger.Printf("[INFO] [infra] cloudflared skipped — no cf_tunnel_token configured (LAN-only node)") + } + + if err := m.ensureFileBrowser(filepath.Join(base, "filebrowser")); err != nil { + errs = append(errs, fmt.Sprintf("filebrowser: %v", err)) + } + + if len(errs) > 0 { + return fmt.Errorf("base-infra bring-up: %s", strings.Join(errs, "; ")) + } + return nil +} + +func (m *Manager) ensureTraefik(dir string) error { + if containerRunning("traefik") { + return nil + } + m.logger.Printf("[INFO] [infra] deploying traefik → %s", dir) + if err := os.MkdirAll(filepath.Join(dir, "dynamic"), 0o755); err != nil { + return fmt.Errorf("mkdir dynamic: %w", err) + } + if err := os.MkdirAll(filepath.Join(dir, "certs"), 0o755); err != nil { + return fmt.Errorf("mkdir certs: %w", err) + } + // acme.json must exist as a 0600 file before traefik starts (it writes issued certs into it). 
+ acme := filepath.Join(dir, "acme.json") + if _, err := os.Stat(acme); os.IsNotExist(err) { + if err := os.WriteFile(acme, []byte{}, 0o600); err != nil { + return fmt.Errorf("create acme.json: %w", err) + } + } + if err := os.Chmod(acme, 0o600); err != nil { + return fmt.Errorf("chmod acme.json: %w", err) + } + files, err := infra.RenderTraefik(infra.TraefikData{ + ACMEEmail: m.cfg.Customer.Email, + CFAPIToken: m.cfg.Infrastructure.CFAPIToken, + }) + if err != nil { + return err + } + if err := writeInfraFiles(dir, files); err != nil { + return err + } + return m.composeUp(dir) +} + +func (m *Manager) ensureCloudflared(dir string) error { + if containerRunning("cloudflared") { + return nil + } + m.logger.Printf("[INFO] [infra] deploying cloudflared → %s", dir) + files, err := infra.RenderCloudflared(infra.CloudflaredData{CFTunnelToken: m.cfg.Infrastructure.CFTunnelToken}) + if err != nil { + return err + } + if err := writeInfraFiles(dir, files); err != nil { + return err + } + return m.composeUp(dir) +} + +func (m *Manager) ensureFileBrowser(dir string) error { + if containerRunning("filebrowser") { + return nil + } + composePath := filepath.Join(dir, "docker-compose.yml") + if _, err := os.Stat(composePath); err == nil { + // Already provisioned but not running — bring it up WITHOUT regenerating, so the storage + // mounts that web.SyncFileBrowserMounts manages are preserved. Just `compose up -d`. + m.logger.Printf("[INFO] [infra] filebrowser compose exists — starting without regenerating") + return m.composeUp(dir) + } + m.logger.Printf("[INFO] [infra] deploying filebrowser → %s", dir) + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("mkdir: %w", err) + } + // Initial render: no storage mounts yet (web.SyncFileBrowserMounts fills them in on the first + // storage-path change and owns all later regeneration). 
+ compose := infra.RenderFileBrowserCompose(m.cfg.Customer.Domain, nil) + config := infra.RenderFileBrowserConfig(nil) + if err := os.WriteFile(composePath, []byte(compose), 0o644); err != nil { + return fmt.Errorf("write docker-compose.yml: %w", err) + } + if err := os.WriteFile(filepath.Join(dir, "config.yaml"), []byte(config), 0o644); err != nil { + return fmt.Errorf("write config.yaml: %w", err) + } + return m.composeUp(dir) +} + +// ensureTraefikNetwork creates the external traefik-public docker network if absent (idempotent; +// tolerates a create/inspect race). Uses the docker CLI directly — it's a network op, not compose. +func (m *Manager) ensureTraefikNetwork() error { + if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil { + return nil + } + m.logger.Printf("[INFO] [infra] creating docker network %s", traefikNetwork) + out, err := exec.Command("docker", "network", "create", traefikNetwork).CombinedOutput() + if err != nil { + // Tolerate a race where another actor created it between our inspect and create. + if exec.Command("docker", "network", "inspect", traefikNetwork).Run() == nil { + return nil + } + return fmt.Errorf("network create %s: %s: %w", traefikNetwork, strings.TrimSpace(string(out)), err) + } + return nil +} + +// composeUp runs `docker compose up -d` in dir (DOMAIN injected by composeExecWithEnv). +func (m *Manager) composeUp(dir string) error { + out, err := m.composeExecWithEnv(dir, nil, "up", "-d") + if err != nil { + return fmt.Errorf("compose up: %s: %w", truncateStr(strings.TrimSpace(out), 300), err) + } + return nil +} + +// writeInfraFiles writes each rendered file at its required mode (enforced via Chmod so an existing +// file — e.g. a re-rendered .env — keeps 0600 even though WriteFile only honors mode on create). 
+func writeInfraFiles(dir string, files map[string]infra.FileSpec) error {
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return fmt.Errorf("mkdir: %w", err)
+	}
+	for name, spec := range files {
+		p := filepath.Join(dir, name)
+		mode := os.FileMode(spec.Mode)
+		// Tighten an EXISTING file before rewriting it: WriteFile truncates and refills such a file
+		// under its current permissions, so a re-rendered secret (.env) would otherwise be readable
+		// under stale, looser bits until the final Chmod runs. Owner-write is kept for this step so
+		// the rewrite itself cannot be blocked by a read-only target mode. A missing file is fine —
+		// WriteFile creates it with the requested mode.
+		if err := os.Chmod(p, mode|0o200); err != nil && !os.IsNotExist(err) {
+			return fmt.Errorf("chmod %s: %w", name, err)
+		}
+		if err := os.WriteFile(p, []byte(spec.Content), mode); err != nil {
+			return fmt.Errorf("write %s: %w", name, err)
+		}
+		// Enforce the exact mode afterwards: on create, WriteFile's perm is filtered by the umask.
+		if err := os.Chmod(p, mode); err != nil {
+			return fmt.Errorf("chmod %s: %w", name, err)
+		}
+	}
+	return nil
+}
+
+// containerRunning reports whether a container with the given name is currently running. It asks the
+// daemon directly (works before the stack dir exists), mirroring monitor.checkProtectedContainers.
+// Any `docker inspect` failure (unknown container, docker CLI unavailable) is reported as false.
+func containerRunning(name string) bool {
+	out, err := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name).Output()
+	if err != nil {
+		return false
+	}
+	return strings.TrimSpace(string(out)) == "true"
+}
diff --git a/controller/internal/stacks/infra_test.go b/controller/internal/stacks/infra_test.go
new file mode 100644
index 0000000..a25faa2
--- /dev/null
+++ b/controller/internal/stacks/infra_test.go
@@ -0,0 +1,36 @@
+package stacks
+
+import (
+	"io"
+	"log"
+	"testing"
+	"time"
+
+	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
+)
+
+// TestEnsureBaseStackSingleFlight proves the single-flight guard short-circuits: when infraMu is
+// already held, EnsureBaseStack returns immediately (nil) WITHOUT touching docker. The test holds
+// the lock and calls EnsureBaseStack from a second goroutine, bounded by a timeout: with the guard,
+// TryLock fails and the call returns before any docker network/inspect call; a guard that blocked
+// on the lock would trip the timeout. Without any guard the call would shell out to docker.
+func TestEnsureBaseStackSingleFlight(t *testing.T) {
+	m := &Manager{
+		cfg:    &config.Config{},
+		logger: log.New(io.Discard, "", 0), // discard log output during the test
+	}
+	m.infraMu.Lock() // hold the guard for the whole test so EnsureBaseStack cannot acquire it
+	defer m.infraMu.Unlock()
+
+	done := make(chan error, 1) // buffered: the goroutine can still send after the test timed out
+	go func() { done <- m.EnsureBaseStack() }()
+
+	select {
+	case err := <-done:
+		if err != nil {
+			t.Fatalf("expected nil (single-flight no-op) while lock held, got %v", err)
+		}
+	case <-time.After(3 * time.Second): // no result in time: the call did not short-circuit
+		t.Fatal("EnsureBaseStack did not short-circuit while infraMu was held (single-flight guard missing?)")
+	}
+}
diff --git a/controller/internal/stacks/manager.go b/controller/internal/stacks/manager.go
index 1b7fd8e..087922a 100644
--- a/controller/internal/stacks/manager.go
+++ b/controller/internal/stacks/manager.go
@@ -83,7 +83,8 @@ type Manager struct {
 	composeCmd string
 	stacks     map[string]*Stack
 	mu         sync.RWMutex
-	encKey     []byte // AES-256 key for encrypting sensitive values in app.yaml
+	encKey     []byte     // AES-256 key for encrypting sensitive values in app.yaml
+	infraMu    sync.Mutex // single-flight guard for EnsureBaseStack (base-infra bring-up/self-heal)
 }
 
 // NewManager creates a new stack manager.
diff --git a/controller/internal/web/handlers.go b/controller/internal/web/handlers.go
index e161f2f..af221bf 100644
--- a/controller/internal/web/handlers.go
+++ b/controller/internal/web/handlers.go
@@ -13,6 +13,7 @@ import (
 
 	"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
 	"gitea.dooplex.hu/admin/felhom-controller/internal/crypto"
+	"gitea.dooplex.hu/admin/felhom-controller/internal/infra"
 	"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
 	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
 	"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
@@ -1378,115 +1379,15 @@ func (s *Server) syncFileBrowserMounts(resetDBOnChange bool) {
 	}
 }
 
-// generateFileBrowserCompose returns a FileBrowser docker-compose.yml string
-// with the given domain and storage volume mount lines.
+// generateFileBrowserCompose returns the FileBrowser docker-compose.yml text for the given domain +// and storage volume-mount lines. It is a thin wrapper that passes both arguments unchanged to +// infra.RenderFileBrowserCompose, so this path and the base-infra bring-up share one renderer. func generateFileBrowserCompose(domain string, storageMounts []string) string { - storageSection := "" - if len(storageMounts) > 0 { - storageSection = "\n # Storage paths (auto-generated by felhom-controller)\n" + - strings.Join(storageMounts, "\n") - } - - return fmt.Sprintf(`# FileBrowser Quantum — Infrastructure file manager -# Domain: files.%s -# Deployed by docker-setup.sh — managed by felhom-controller -# WARNING: Volume mounts are auto-generated. Manual edits will be overwritten. - -services: - filebrowser: - image: gtstef/filebrowser:latest - container_name: filebrowser - restart: unless-stopped - environment: - - TZ=Europe/Budapest - - FILEBROWSER_CONFIG=/home/filebrowser/config.yaml - volumes: - - filebrowser_data:/home/filebrowser/data - - ./config.yaml:/home/filebrowser/config.yaml:ro%s - networks: - - traefik-public - deploy: - resources: - limits: - memory: 256M - healthcheck: - test: ["CMD", "wget", "--spider", "-q", "http://localhost:80/"] - interval: 30s - timeout: 5s - retries: 3 - start_period: 15s - labels: - - "traefik.enable=true" - - "traefik.http.routers.filebrowser.rule=Host(`+"`"+`files.%s`+"`"+`)" - - "traefik.http.routers.filebrowser.entrypoints=websecure" - - "traefik.http.routers.filebrowser.tls=true" - - "traefik.http.services.filebrowser.loadbalancer.server.port=80" - - "traefik.docker.network=traefik-public" - -volumes: - filebrowser_data: - -networks: - traefik-public: - external: true -`, domain, storageSection, domain) + return infra.RenderFileBrowserCompose(domain, storageMounts) } -// generateFileBrowserConfig returns a FileBrowser Quantum config.yaml with -// a separate source per registered storage path.
Each source appears as a -// named sidebar entry in the FileBrowser UI. +// generateFileBrowserConfig returns a FileBrowser Quantum config.yaml with a separate source per +// registered storage path. Thin wrapper: paths is passed unchanged to infra.RenderFileBrowserConfig. func generateFileBrowserConfig(paths []settings.StoragePath) string { - var sources string - if len(paths) == 0 { - sources = ` - path: "/srv" -` - } else { - for _, sp := range paths { - mountName := filepath.Base(sp.Path) - label := sp.Label - if label == "" { - label = mountName - } - sources += fmt.Sprintf(" - path: \"/srv/%s\"\n name: %q\n config:\n defaultEnabled: true\n", mountName, label) - } - } - - return fmt.Sprintf(`# FileBrowser Quantum — managed by felhom-controller -# WARNING: This file is auto-generated. Manual edits will be overwritten. - -server: - port: 80 - baseURL: "/" - database: "/home/filebrowser/data/database.db" - logging: - - levels: "info|warning|error" - sources: -%suserDefaults: - stickySidebar: true - darkMode: true - viewMode: "normal" - showHidden: false - dateFormat: false - gallerySize: 3 - themeColor: "var(--blue)" - preview: - disableHideSidebar: false - highQuality: true - image: true - video: true - motionVideoPreview: true - office: true - popup: true - autoplayMedia: true - folder: true - permissions: - api: false - admin: false - modify: false - share: false - realtime: false - delete: false - create: false - download: true -`, sources) + return infra.RenderFileBrowserConfig(paths) }