// Package bootstrap implements first-run bootstrap.json ingestion (slice 8A → v0.40.0 onboarding, // doc 03 §6, config-contract decision (c)/(d)). The host agent's provisioning back-half writes a // stable bootstrap.json into a read-only config mount carrying ONLY the customer id, the hub URL, a // per-customer RETRIEVAL PASSPHRASE, and the per-guest local-API handle. On first run the controller // uses the passphrase to PULL its full controller.yaml from the hub (which mints the customer-scoped // hub api_key + identity + assets + backup + CF config), MERGES in the per-guest local_api block (the // only thing the hub yaml lacks, because the hub must not know per-guest Proxmox internals), writes // it, and comes up CONFIGURED — skipping the setup wizard. // // This replaces the old "seed a configured yaml from the agent's HOST key" path, which made the // controller's hub reports 401 (the hub's /report needs the customer-scoped key, not the host key). package bootstrap import ( "encoding/json" "errors" "fmt" "log" "os" "path/filepath" "strings" "time" "gitea.dooplex.hu/admin/felhom-controller/internal/config" "gopkg.in/yaml.v3" ) // DefaultMountPath is where the agent attaches the read-only config mount (spike S2). Override // with FELHOM_BOOTSTRAP_PATH for tests / non-standard layouts. const DefaultMountPath = "/etc/felhom-bootstrap/bootstrap.json" // SchemaV2 is the stable contract version the agent emits and the controller ingests. v2 changed the // contract's MEANING (the controller pulls its config from the hub rather than seeding it from the // agent's host key) and its field set, so it is a clean version bump with no v1 back-compat // (pre-launch; zero v1 guests deployed). A non-v2 schema is rejected → setup mode. const SchemaV2 = "felhom.bootstrap/v2" // ErrPullTransient marks a pull failure as retryable (a boot-time network race reaching the hub). 
// The wiring (main.go) wraps report.ErrHubUnreachable with this; permanent failures (auth /
// not-found) are NOT wrapped, so MaybeIngest fails fast on them. Keeping this sentinel here (rather
// than importing the heavy internal/report package) keeps bootstrap decoupled — decision (b).
var ErrPullTransient = errors.New("bootstrap: transient pull failure")

// pullRetryDelays is the backoff between transient pull retries (one initial attempt + one retry per
// entry → 4 attempts total on persistent transient failure). Overridable in tests for speed.
var pullRetryDelays = []time.Duration{2 * time.Second, 4 * time.Second, 8 * time.Second}

// PullFunc fetches a generated controller.yaml from the hub for a customer, authenticated by the
// retrieval passphrase. Injected (decision (b)) so MaybeIngest never imports internal/report; the
// production wiring passes report.PullConfig. A transient (retryable) failure must be
// errors.Is(err, ErrPullTransient); any other error is treated as permanent (no retry).
type PullFunc func(hubURL, customerID, retrievalPassword string) (string, error)

// Bootstrap is the stable agent→controller config contract (JSON, schema v2). It carries ONLY what
// the controller needs to PULL its config (customer id + hub url + retrieval passphrase) and reach
// the agent's local API (endpoint/fingerprint/token). It is deliberately a SEPARATE shape from
// controller.yaml: the agent never needs to know the controller's full config schema, and never
// holds the customer-scoped hub key or CF tokens (those come from the hub pull).
type Bootstrap struct { Schema string `json:"schema"` Customer BootstrapCustomer `json:"customer"` // only id (the pull target); the hub provides name/domain/email Hub BootstrapHub `json:"hub"` LocalAPI BootstrapLocalAPI `json:"local_api"` } type BootstrapCustomer struct { ID string `json:"id"` } type BootstrapHub struct { URL string `json:"url"` RetrievalPassword string `json:"retrieval_password"` // SECRET — pulls the full config (incl. the customer key) } type BootstrapLocalAPI struct { Endpoint string `json:"endpoint"` // host bridge IP:port Fingerprint string `json:"fingerprint"` // agent leaf-cert SHA-256 (hex) to pin Token string `json:"token"` // per-guest bearer; SECRET } // Path returns the bootstrap mount path (env override → default). func Path() string { if p := strings.TrimSpace(os.Getenv("FELHOM_BOOTSTRAP_PATH")); p != "" { return p } return DefaultMountPath } // MaybeIngest, on an unconfigured controller, pulls the full controller.yaml from the hub (using the // bootstrap's retrieval passphrase), merges in the per-guest local_api block, writes controller.yaml, // and returns the reloaded config. Returns the config the caller should use. // // Contract: // - Idempotent: if cfg is already configured (customer.id set), the existing controller.yaml is // NEVER clobbered and the hub is NEVER pulled — returns cfg unchanged. // - Fail-safe: an absent/malformed bootstrap, a non-v2 schema, a missing required field, or a hub // pull that ultimately fails leaves cfg unchanged (the caller proceeds to setup mode). It logs // and NEVER crashes — a hub outage at first boot must not brick the guest. // - On success: writes controller.yaml (0600, atomic), reloads it, and returns the reloaded cfg. 
func MaybeIngest(configPath string, cfg *config.Config, logger *log.Logger, pull PullFunc) *config.Config { if cfg != nil && cfg.Customer.ID != "" { return cfg // already configured — do not clobber, do not pull (idempotent) } bpath := Path() data, err := os.ReadFile(bpath) if err != nil { if !os.IsNotExist(err) { logger.Printf("[WARN] bootstrap: cannot read %s: %v — staying in setup", bpath, err) } return cfg // no bootstrap → normal setup } var b Bootstrap if err := json.Unmarshal(data, &b); err != nil { logger.Printf("[WARN] bootstrap: %s is not valid JSON: %v — staying in setup", bpath, err) return cfg } if b.Schema != SchemaV2 { logger.Printf("[WARN] bootstrap: unsupported schema %q (want %q) — staying in setup", b.Schema, SchemaV2) return cfg } if b.Customer.ID == "" || b.Hub.URL == "" || b.Hub.RetrievalPassword == "" { logger.Printf("[WARN] bootstrap: %s missing customer.id / hub.url / hub.retrieval_password — staying in setup", bpath) return cfg } if b.LocalAPI.Endpoint == "" || b.LocalAPI.Fingerprint == "" || b.LocalAPI.Token == "" { logger.Printf("[WARN] bootstrap: %s missing local_api.{endpoint,fingerprint,token} — staying in setup", bpath) return cfg } if pull == nil { logger.Printf("[WARN] bootstrap: no pull function wired — staying in setup") return cfg } // --- Pull the full controller.yaml from the hub, with bounded retry on transient errors only. --- pulled, err := pullWithRetry(pull, b.Hub.URL, b.Customer.ID, b.Hub.RetrievalPassword, logger) if err != nil { logger.Printf("[WARN] bootstrap: hub config pull failed for customer %s from %s: %v — staying in setup (manual setup wizard remains the fallback)", b.Customer.ID, b.Hub.URL, err) return cfg } // --- Merge the per-guest local_api block into the hub yaml at the MAP level (decision (c)) so // every field the hub emits is preserved (forward-compat with hub template changes). 
--- merged, err := mergeLocalAPI(pulled, b.LocalAPI) if err != nil { logger.Printf("[WARN] bootstrap: merging local_api into pulled config failed: %v — staying in setup", err) return cfg } if err := writeFileAtomic(configPath, merged); err != nil { logger.Printf("[WARN] bootstrap: could not write %s: %v — staying in setup", configPath, err) return cfg } reloaded, err := config.LoadPermissive(configPath) if err != nil { logger.Printf("[WARN] bootstrap: wrote %s but reload failed: %v — staying in setup", configPath, err) return cfg } logger.Printf("[INFO] bootstrap: pulled config from %s for %s, merged local_api (%s) — coming up configured", b.Hub.URL, b.Customer.ID, b.LocalAPI.Endpoint) return reloaded } // pullWithRetry calls pull once, then retries on transient (ErrPullTransient) failures only, with // the pullRetryDelays backoff. Permanent failures (anything not ErrPullTransient) fail fast. func pullWithRetry(pull PullFunc, hubURL, customerID, password string, logger *log.Logger) (string, error) { var lastErr error for attempt := 0; ; attempt++ { yaml, err := pull(hubURL, customerID, password) if err == nil { return yaml, nil } lastErr = err if !errors.Is(err, ErrPullTransient) { return "", err // permanent (auth/not-found/other) — no retry } if attempt >= len(pullRetryDelays) { break // exhausted retries } delay := pullRetryDelays[attempt] logger.Printf("[INFO] bootstrap: hub unreachable (attempt %d), retrying in %s …", attempt+1, delay) time.Sleep(delay) } return "", lastErr } // mergeLocalAPI parses the pulled controller.yaml as a generic map, sets the local_api block from the // bootstrap (overwriting any hub-emitted placeholder), and re-marshals. local_api.enabled is NOT set // — it defaults on once endpoint is present (config.LocalAPIConfig). 
func mergeLocalAPI(pulledYAML string, la BootstrapLocalAPI) ([]byte, error) { m := map[string]any{} if err := yaml.Unmarshal([]byte(pulledYAML), &m); err != nil { return nil, fmt.Errorf("parse pulled yaml: %w", err) } m["local_api"] = map[string]any{ "endpoint": la.Endpoint, "fingerprint": la.Fingerprint, "token": la.Token, } out, err := yaml.Marshal(m) if err != nil { return nil, fmt.Errorf("marshal merged yaml: %w", err) } return out, nil } // writeFileAtomic writes b to path atomically (tmp + rename), 0600 (it carries the local-api token + // the customer hub key). func writeFileAtomic(path string, b []byte) error { if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { return fmt.Errorf("config dir: %w", err) } tmp := path + ".tmp" if err := os.WriteFile(tmp, b, 0o600); err != nil { return err } return os.Rename(tmp, path) }