v0.40.0: bootstrap pull+merge onboarding (controller pulls config from hub)
Fix the onboarding 401: instead of seeding controller.yaml from the agent's HOST hub key (which the hub's customer-scoped /api/v1/report rejects), the controller now PULLS its full controller.yaml from the hub on first boot using the bootstrap's retrieval passphrase (yielding the customer-scoped key) and MERGES in the per-guest local_api block.

- internal/bootstrap: contract v1->v2 (customer.id + hub.url + hub.retrieval_password + local_api; drop host key/identity). MaybeIngest gains an injected PullFunc (keeps bootstrap free of the heavy report package), pulls with bounded transient-only retry, merges local_api at YAML-map level (preserves all hub-emitted fields), idempotent + fail-safe + never-crash.
- main.go: wire report.PullConfig as the pull adapter (maps ErrHubUnreachable -> ErrPullTransient; auth/not-found permanent).
- Lockstep with felhom-agent v0.19.0.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,17 +1,25 @@
|
||||
// Package bootstrap implements first-run bootstrap.json ingestion (slice 8A, doc 03 §6,
|
||||
// config-contract decision (c)). The host agent's provisioning back-half writes a stable
|
||||
// bootstrap.json into a read-only config mount; on first run the controller seeds its own
|
||||
// controller.yaml from it and comes up CONFIGURED, skipping the setup wizard. The agent emits
|
||||
// the stable contract; the controller owns the translation — the two stay decoupled.
|
||||
// Package bootstrap implements first-run bootstrap.json ingestion (slice 8A → v0.40.0 onboarding,
|
||||
// doc 03 §6, config-contract decision (c)/(d)). The host agent's provisioning back-half writes a
|
||||
// stable bootstrap.json into a read-only config mount carrying ONLY the customer id, the hub URL, a
|
||||
// per-customer RETRIEVAL PASSPHRASE, and the per-guest local-API handle. On first run the controller
|
||||
// uses the passphrase to PULL its full controller.yaml from the hub (which mints the customer-scoped
|
||||
// hub api_key + identity + assets + backup + CF config), MERGES in the per-guest local_api block (the
|
||||
// only thing the hub yaml lacks, because the hub must not know per-guest Proxmox internals), writes
|
||||
// it, and comes up CONFIGURED — skipping the setup wizard.
|
||||
//
|
||||
// This replaces the old "seed a configured yaml from the agent's HOST key" path, which made the
|
||||
// controller's hub reports 401 (the hub's /report needs the customer-scoped key, not the host key).
|
||||
package bootstrap
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gopkg.in/yaml.v3"
|
||||
@@ -21,31 +29,47 @@ import (
|
||||
// with FELHOM_BOOTSTRAP_PATH for tests / non-standard layouts.
|
||||
const DefaultMountPath = "/etc/felhom-bootstrap/bootstrap.json"
|
||||
|
||||
// SchemaV1 is the stable contract version the agent emits and the controller ingests.
|
||||
const SchemaV1 = "felhom.bootstrap/v1"
|
||||
// SchemaV2 is the stable contract version the agent emits and the controller ingests. v2 changed the
|
||||
// contract's MEANING (the controller pulls its config from the hub rather than seeding it from the
|
||||
// agent's host key) and its field set, so it is a clean version bump with no v1 back-compat
|
||||
// (pre-launch; zero v1 guests deployed). A non-v2 schema is rejected → setup mode.
|
||||
const SchemaV2 = "felhom.bootstrap/v2"
|
||||
|
||||
// Bootstrap is the stable agent→controller config contract (JSON). It carries exactly what the
|
||||
// controller needs to come up configured + reach the agent's local API. It is deliberately a
|
||||
// SEPARATE shape from controller.yaml (decision (c)): the agent never needs to know the
|
||||
// controller's full config schema.
|
||||
// ErrPullTransient marks a pull failure as retryable (a boot-time network race reaching the hub).
|
||||
// The wiring (main.go) wraps report.ErrHubUnreachable with this; permanent failures (auth /
|
||||
// not-found) are NOT wrapped, so MaybeIngest fails fast on them. Keeping this sentinel here (rather
|
||||
// than importing the heavy internal/report package) keeps bootstrap decoupled — decision (b).
|
||||
var ErrPullTransient = errors.New("bootstrap: transient pull failure")
|
||||
|
||||
// pullRetryDelays is the backoff between transient pull retries (one initial attempt + one retry per
|
||||
// entry → 4 attempts total on persistent transient failure). Overridable in tests for speed.
|
||||
var pullRetryDelays = []time.Duration{2 * time.Second, 4 * time.Second, 8 * time.Second}
|
||||
|
||||
// PullFunc fetches a generated controller.yaml from the hub for a customer, authenticated by the
|
||||
// retrieval passphrase. Injected (decision (b)) so MaybeIngest never imports internal/report; the
|
||||
// production wiring passes report.PullConfig. A transient (retryable) failure must be
|
||||
// errors.Is(err, ErrPullTransient); any other error is treated as permanent (no retry).
|
||||
type PullFunc func(hubURL, customerID, retrievalPassword string) (string, error)
|
||||
|
||||
// Bootstrap is the stable agent→controller config contract (JSON, schema v2). It carries ONLY what
|
||||
// the controller needs to PULL its config (customer id + hub url + retrieval passphrase) and reach
|
||||
// the agent's local API (endpoint/fingerprint/token). It is deliberately a SEPARATE shape from
|
||||
// controller.yaml: the agent never needs to know the controller's full config schema, and never
|
||||
// holds the customer-scoped hub key or CF tokens (those come from the hub pull).
|
||||
type Bootstrap struct {
|
||||
Schema string `json:"schema"`
|
||||
Customer BootstrapCustomer `json:"customer"`
|
||||
Customer BootstrapCustomer `json:"customer"` // only id (the pull target); the hub provides name/domain/email
|
||||
Hub BootstrapHub `json:"hub"`
|
||||
LocalAPI BootstrapLocalAPI `json:"local_api"`
|
||||
}
|
||||
|
||||
type BootstrapCustomer struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Domain string `json:"domain"`
|
||||
Email string `json:"email"`
|
||||
ID string `json:"id"`
|
||||
}
|
||||
|
||||
type BootstrapHub struct {
|
||||
URL string `json:"url"`
|
||||
APIKey string `json:"api_key"`
|
||||
HostID string `json:"host_id"` // the agent's host id (reference; not load-bearing for the controller)
|
||||
URL string `json:"url"`
|
||||
RetrievalPassword string `json:"retrieval_password"` // SECRET — pulls the full config (incl. the customer key)
|
||||
}
|
||||
|
||||
type BootstrapLocalAPI struct {
|
||||
@@ -62,18 +86,20 @@ func Path() string {
|
||||
return DefaultMountPath
|
||||
}
|
||||
|
||||
// MaybeIngest seeds controller.yaml from a bootstrap.json mount when the controller is NOT yet
|
||||
// configured, and returns the config the caller should use.
|
||||
// MaybeIngest, on an unconfigured controller, pulls the full controller.yaml from the hub (using the
|
||||
// bootstrap's retrieval passphrase), merges in the per-guest local_api block, writes controller.yaml,
|
||||
// and returns the reloaded config. On every other path it returns cfg unchanged, so the result is always the config the caller should use.
|
||||
//
|
||||
// Contract:
|
||||
// - Idempotent: if cfg is already configured (customer.id set), the existing controller.yaml is
|
||||
// NEVER clobbered — returns cfg unchanged.
|
||||
// - Fail-safe: an absent or malformed bootstrap, or one missing the minimum identity, leaves cfg
|
||||
// unchanged (the caller proceeds to normal setup mode) — it logs and never crashes.
|
||||
// NEVER clobbered and the hub is NEVER pulled — returns cfg unchanged.
|
||||
// - Fail-safe: an absent/malformed bootstrap, a non-v2 schema, a missing required field, or a hub
|
||||
// pull that ultimately fails leaves cfg unchanged (the caller proceeds to setup mode). It logs
|
||||
// and NEVER crashes — a hub outage at first boot must not brick the guest.
|
||||
// - On success: writes controller.yaml (0600, atomic), reloads it, and returns the reloaded cfg.
|
||||
func MaybeIngest(configPath string, cfg *config.Config, logger *log.Logger) *config.Config {
|
||||
func MaybeIngest(configPath string, cfg *config.Config, logger *log.Logger, pull PullFunc) *config.Config {
|
||||
if cfg != nil && cfg.Customer.ID != "" {
|
||||
return cfg // already configured — do not clobber (idempotent)
|
||||
return cfg // already configured — do not clobber, do not pull (idempotent)
|
||||
}
|
||||
bpath := Path()
|
||||
data, err := os.ReadFile(bpath)
|
||||
@@ -89,17 +115,39 @@ func MaybeIngest(configPath string, cfg *config.Config, logger *log.Logger) *con
|
||||
logger.Printf("[WARN] bootstrap: %s is not valid JSON: %v — staying in setup", bpath, err)
|
||||
return cfg
|
||||
}
|
||||
if b.Schema != "" && b.Schema != SchemaV1 {
|
||||
logger.Printf("[WARN] bootstrap: unsupported schema %q (want %q) — staying in setup", b.Schema, SchemaV1)
|
||||
if b.Schema != SchemaV2 {
|
||||
logger.Printf("[WARN] bootstrap: unsupported schema %q (want %q) — staying in setup", b.Schema, SchemaV2)
|
||||
return cfg
|
||||
}
|
||||
if b.Customer.ID == "" || b.Customer.Domain == "" {
|
||||
logger.Printf("[WARN] bootstrap: %s missing customer.id/domain — staying in setup", bpath)
|
||||
if b.Customer.ID == "" || b.Hub.URL == "" || b.Hub.RetrievalPassword == "" {
|
||||
logger.Printf("[WARN] bootstrap: %s missing customer.id / hub.url / hub.retrieval_password — staying in setup", bpath)
|
||||
return cfg
|
||||
}
|
||||
if b.LocalAPI.Endpoint == "" || b.LocalAPI.Fingerprint == "" || b.LocalAPI.Token == "" {
|
||||
logger.Printf("[WARN] bootstrap: %s missing local_api.{endpoint,fingerprint,token} — staying in setup", bpath)
|
||||
return cfg
|
||||
}
|
||||
if pull == nil {
|
||||
logger.Printf("[WARN] bootstrap: no pull function wired — staying in setup")
|
||||
return cfg
|
||||
}
|
||||
|
||||
seeded := configFromBootstrap(b)
|
||||
if err := writeYAML(configPath, seeded); err != nil {
|
||||
// --- Pull the full controller.yaml from the hub, with bounded retry on transient errors only. ---
|
||||
pulled, err := pullWithRetry(pull, b.Hub.URL, b.Customer.ID, b.Hub.RetrievalPassword, logger)
|
||||
if err != nil {
|
||||
logger.Printf("[WARN] bootstrap: hub config pull failed for customer %s from %s: %v — staying in setup (manual setup wizard remains the fallback)",
|
||||
b.Customer.ID, b.Hub.URL, err)
|
||||
return cfg
|
||||
}
|
||||
|
||||
// --- Merge the per-guest local_api block into the hub yaml at the MAP level (decision (c)) so
|
||||
// every field the hub emits is preserved (forward-compat with hub template changes). ---
|
||||
merged, err := mergeLocalAPI(pulled, b.LocalAPI)
|
||||
if err != nil {
|
||||
logger.Printf("[WARN] bootstrap: merging local_api into pulled config failed: %v — staying in setup", err)
|
||||
return cfg
|
||||
}
|
||||
if err := writeFileAtomic(configPath, merged); err != nil {
|
||||
logger.Printf("[WARN] bootstrap: could not write %s: %v — staying in setup", configPath, err)
|
||||
return cfg
|
||||
}
|
||||
@@ -108,43 +156,62 @@ func MaybeIngest(configPath string, cfg *config.Config, logger *log.Logger) *con
|
||||
logger.Printf("[WARN] bootstrap: wrote %s but reload failed: %v — staying in setup", configPath, err)
|
||||
return cfg
|
||||
}
|
||||
logger.Printf("[INFO] bootstrap: seeded %s from %s (customer=%s, local_api=%s) — coming up configured",
|
||||
configPath, bpath, b.Customer.ID, b.LocalAPI.Endpoint)
|
||||
logger.Printf("[INFO] bootstrap: pulled config from %s for %s, merged local_api (%s) — coming up configured",
|
||||
b.Hub.URL, b.Customer.ID, b.LocalAPI.Endpoint)
|
||||
return reloaded
|
||||
}
|
||||
|
||||
// configFromBootstrap maps the stable contract onto a controller.yaml Config. Only the
|
||||
// identity/hub/local-api fields are seeded; all other config keeps controller defaults (the
|
||||
// customer configures the rest via the dashboard / hub manifest).
|
||||
func configFromBootstrap(b Bootstrap) *config.Config {
|
||||
cfg := &config.Config{}
|
||||
cfg.Customer.ID = b.Customer.ID
|
||||
cfg.Customer.Name = b.Customer.Name
|
||||
cfg.Customer.Domain = b.Customer.Domain
|
||||
cfg.Customer.Email = b.Customer.Email
|
||||
if b.Hub.URL != "" {
|
||||
cfg.Hub.Enabled = b.Hub.APIKey != ""
|
||||
cfg.Hub.URL = b.Hub.URL
|
||||
cfg.Hub.APIKey = b.Hub.APIKey
|
||||
// pullWithRetry calls pull once, then retries on transient (ErrPullTransient) failures only, with
|
||||
// the pullRetryDelays backoff. Permanent failures (anything not ErrPullTransient) fail fast.
|
||||
func pullWithRetry(pull PullFunc, hubURL, customerID, password string, logger *log.Logger) (string, error) {
|
||||
var lastErr error
|
||||
for attempt := 0; ; attempt++ {
|
||||
yaml, err := pull(hubURL, customerID, password)
|
||||
if err == nil {
|
||||
return yaml, nil
|
||||
}
|
||||
lastErr = err
|
||||
if !errors.Is(err, ErrPullTransient) {
|
||||
return "", err // permanent (auth/not-found/other) — no retry
|
||||
}
|
||||
if attempt >= len(pullRetryDelays) {
|
||||
break // exhausted retries
|
||||
}
|
||||
delay := pullRetryDelays[attempt]
|
||||
logger.Printf("[INFO] bootstrap: hub unreachable (attempt %d), retrying in %s …", attempt+1, delay)
|
||||
time.Sleep(delay)
|
||||
}
|
||||
cfg.LocalAPI.Endpoint = b.LocalAPI.Endpoint
|
||||
cfg.LocalAPI.Fingerprint = b.LocalAPI.Fingerprint
|
||||
cfg.LocalAPI.Token = b.LocalAPI.Token
|
||||
return cfg
|
||||
return "", lastErr
|
||||
}
|
||||
|
||||
// writeYAML marshals cfg to YAML and writes it atomically (tmp + rename), 0600 (it carries the
|
||||
// local-api token + any hub key).
|
||||
func writeYAML(path string, cfg *config.Config) error {
|
||||
out, err := yaml.Marshal(cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal: %w", err)
|
||||
// mergeLocalAPI parses the pulled controller.yaml as a generic map, sets the local_api block from the
|
||||
// bootstrap (overwriting any hub-emitted placeholder), and re-marshals. local_api.enabled is NOT set
|
||||
// — it defaults on once endpoint is present (config.LocalAPIConfig).
|
||||
func mergeLocalAPI(pulledYAML string, la BootstrapLocalAPI) ([]byte, error) {
|
||||
m := map[string]any{}
|
||||
if err := yaml.Unmarshal([]byte(pulledYAML), &m); err != nil {
|
||||
return nil, fmt.Errorf("parse pulled yaml: %w", err)
|
||||
}
|
||||
m["local_api"] = map[string]any{
|
||||
"endpoint": la.Endpoint,
|
||||
"fingerprint": la.Fingerprint,
|
||||
"token": la.Token,
|
||||
}
|
||||
out, err := yaml.Marshal(m)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshal merged yaml: %w", err)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// writeFileAtomic writes b to path atomically (tmp + rename), 0600 (it carries the local-api token +
|
||||
// the customer hub key).
|
||||
func writeFileAtomic(path string, b []byte) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||||
return fmt.Errorf("config dir: %w", err)
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, out, 0o600); err != nil {
|
||||
if err := os.WriteFile(tmp, b, 0o600); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
|
||||
Reference in New Issue
Block a user