ab77fa3544
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to the hub (the heartbeat; no separate ping). - HostReport wire contract (shared field-for-field with the hub ingest): host metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/ pbs/audit collections DEFINED but emitted empty (slices 5/6 fill). - Collector over a read-only proxmoxReader (adapted to the real proxmox surface; no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard (skip POST); per-guest GuestConfig fail = status "unknown", still report. - Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed TransportError/HTTPError, token never in errors. - Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient to collect/report errors, clean ctx-cancel shutdown. - ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/ has_signed_ops parsed-but-ignored (slice 4). - config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate + WithDefaults + hub-key redaction; example config updated. - main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0. Tests: report serialization, client (incl. token-redaction), collector partial- failure, loop continuation+interval adoption, config. internal/proxmox + internal/ authz untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
272 lines
9.3 KiB
Go
272 lines
9.3 KiB
Go
// Package config loads the felhom-agent configuration the proxmox layer needs.
|
|
//
|
|
// Format: a JSON file (stdlib-only — no YAML dep, consistent with the agent's
|
|
// "pure stdlib" constraint), with per-field environment overrides. Secrets (the
|
|
// API token) are never logged; see Config.Redacted.
|
|
//
|
|
// OPEN item (noted in the slice reply): the controller/hub use YAML; if matching
|
|
// that house style is preferred over the zero-dependency constraint, the loader
|
|
// can swap to yaml.v3 without touching call sites.
|
|
package config
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"net"
|
|
"net/url"
|
|
"os"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// Config is the agent configuration.
|
|
type Config struct {
|
|
Proxmox ProxmoxConfig `json:"proxmox"`
|
|
Privileged PrivilegedConfig `json:"privileged"`
|
|
Authz AuthzConfig `json:"authz"`
|
|
Hub HubConfig `json:"hub"`
|
|
LogLevel string `json:"log_level"` // debug|info|warn|error (default info)
|
|
}
|
|
|
|
// HubConfig holds the settings for the outbound hub client and the daemon poll
// loop (internal/hub).
//
// The hub presents a real certificate (hub.felhom.eu, cert-manager), so this is
// standard TLS against the system roots — NOT the fingerprint-pinning path used
// for Proxmox.
type HubConfig struct {
	// URL is the hub base URL, e.g. "https://hub.felhom.eu".
	URL string `json:"url"`
	// HostID is the hub's primary key for this host.
	HostID string `json:"host_id"`
	// APIKey is the per-host hub key. SECRET — redacted before logging.
	APIKey string `json:"api_key"`
	// PollSeconds is the report interval; default 900. The hub may override it
	// per cycle.
	PollSeconds int `json:"poll_seconds"`
	// TimeoutSeconds is the per-request HTTP timeout; default 30.
	TimeoutSeconds int `json:"timeout_seconds"`
	// CAFile is an optional CA bundle path; "" means the system roots.
	CAFile string `json:"ca_file"`
}
|
|
|
|
// AuthzConfig configures operator-signed-op verification (internal/authz). The
|
|
// pinned operator public keys are kept here as raw authorized_keys-style lines
|
|
// (this package stays dependency-free); the authz package parses them into its
|
|
// AllowedSigner set. Role-scoping (recovery keys authorize only key-rotation) is
|
|
// enforced by the consuming layer, not loaded here.
|
|
type AuthzConfig struct {
|
|
// NonceStorePath is the durable, crash-safe nonce log (anti-replay). Must be on
|
|
// persistent host storage so replay protection survives agent restarts.
|
|
NonceStorePath string `json:"nonce_store_path"`
|
|
// Signers are the pinned operator public keys (doc 04 §3 two-key model).
|
|
Signers []SignerKey `json:"signers"`
|
|
}
|
|
|
|
// SignerKey describes a single pinned operator public key.
type SignerKey struct {
	// KeyID is the identifier of this key.
	KeyID string `json:"key_id"`
	// Role is either "operational" (the key that signs destructive ops) or
	// "recovery" (the cold key, which authorizes only key-rotation/break-glass).
	Role string `json:"role"`
	// PublicKey is a standard authorized_keys line, for example
	// "ssh-ed25519 AAAA… felhom-op-1" or "sk-ssh-ed25519@openssh.com AAAA… …".
	PublicKey string `json:"public_key"`
}
|
|
|
|
// ProxmoxConfig configures the API client.
|
|
type ProxmoxConfig struct {
|
|
// Endpoint defaults to https://127.0.0.1:8006 (agent runs on the host).
|
|
Endpoint string `json:"endpoint"`
|
|
// Node is the Proxmox node name; confirm on the box (GET /nodes).
|
|
Node string `json:"node"`
|
|
// Token is the full API token "USER@REALM!TOKENID=SECRET".
|
|
//
|
|
// Provisioning note: this is a privilege-SEPARATED token. Its role
|
|
// (FelhomAgent, 16 privileges) must be granted on BOTH the user AND the token
|
|
// for the same path, or the intersection is empty and every call 403s
|
|
// (phase1-2 §1.2). Role setup is out-of-band; the agent only consumes the token.
|
|
Token string `json:"token"`
|
|
// TLS trust to the host's (self-signed) cert.
|
|
TLS TLSTrust `json:"tls"`
|
|
}
|
|
|
|
// TLSTrust is this package's dependency-free mirror of proxmox.TLSConfig.
type TLSTrust struct {
	// CAFile is the path of a CA bundle file.
	CAFile string `json:"ca_file"`
	// Fingerprint is the SHA-256 of the host leaf certificate.
	Fingerprint string `json:"fingerprint"`
	// InsecureSkipVerify is off by default and meant for selftest only.
	InsecureSkipVerify bool `json:"insecure_skip_verify"`
}
|
|
|
|
// PrivilegedConfig holds the settings of the fenced root-CLI runner.
type PrivilegedConfig struct {
	// Mode selects how privileged commands run: "sudo" (the default — a
	// non-root agent plus narrow sudoers) or "direct".
	Mode string `json:"mode"`
	// SudoPath replaces the sudo binary to invoke (default "sudo").
	SudoPath string `json:"sudo_path"`
}
|
|
|
|
// Default returns a Config pre-populated with sane defaults.
|
|
func Default() Config {
|
|
return Config{
|
|
Proxmox: ProxmoxConfig{Endpoint: "https://127.0.0.1:8006"},
|
|
Privileged: PrivilegedConfig{Mode: "sudo"},
|
|
Authz: AuthzConfig{NonceStorePath: "/var/lib/felhom-agent/nonces.log"},
|
|
Hub: HubConfig{PollSeconds: 900, TimeoutSeconds: 30},
|
|
LogLevel: "info",
|
|
}
|
|
}
|
|
|
|
// Load reads the config file at path (if non-empty) over the defaults, then
|
|
// applies environment overrides. A missing path with all-env config is allowed.
|
|
func Load(path string) (Config, error) {
|
|
cfg := Default()
|
|
if path != "" {
|
|
b, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return cfg, fmt.Errorf("config: reading %s: %w", path, err)
|
|
}
|
|
if err := json.Unmarshal(b, &cfg); err != nil {
|
|
return cfg, fmt.Errorf("config: parsing %s: %w", path, err)
|
|
}
|
|
}
|
|
applyEnv(&cfg)
|
|
return cfg, nil
|
|
}
|
|
|
|
// applyEnv overlays FELHOM_AGENT_* environment variables. Useful for the token in
|
|
// particular (keep the secret out of the file on disk if desired).
|
|
func applyEnv(cfg *Config) {
|
|
if v := os.Getenv("FELHOM_AGENT_PROXMOX_ENDPOINT"); v != "" {
|
|
cfg.Proxmox.Endpoint = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_PROXMOX_NODE"); v != "" {
|
|
cfg.Proxmox.Node = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_PROXMOX_TOKEN"); v != "" {
|
|
cfg.Proxmox.Token = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_PROXMOX_TLS_CA_FILE"); v != "" {
|
|
cfg.Proxmox.TLS.CAFile = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_PROXMOX_TLS_FINGERPRINT"); v != "" {
|
|
cfg.Proxmox.TLS.Fingerprint = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_PROXMOX_TLS_INSECURE"); v != "" {
|
|
if b, err := strconv.ParseBool(v); err == nil {
|
|
cfg.Proxmox.TLS.InsecureSkipVerify = b
|
|
}
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_LOG_LEVEL"); v != "" {
|
|
cfg.LogLevel = v
|
|
}
|
|
// hub
|
|
if v := os.Getenv("FELHOM_AGENT_HUB_URL"); v != "" {
|
|
cfg.Hub.URL = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_HUB_HOST_ID"); v != "" {
|
|
cfg.Hub.HostID = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_HUB_API_KEY"); v != "" {
|
|
cfg.Hub.APIKey = v
|
|
}
|
|
if v := os.Getenv("FELHOM_AGENT_HUB_CA_FILE"); v != "" {
|
|
cfg.Hub.CAFile = v
|
|
}
|
|
cfg.Hub.PollSeconds = envInt("FELHOM_AGENT_HUB_POLL_SECONDS", cfg.Hub.PollSeconds)
|
|
cfg.Hub.TimeoutSeconds = envInt("FELHOM_AGENT_HUB_TIMEOUT_SECONDS", cfg.Hub.TimeoutSeconds)
|
|
}
|
|
|
|
// envInt returns the integer held by environment variable key, or cur when the
// variable is unset or empty. If the value is not a valid integer, a warning is
// written to stderr and cur is returned, so a bad override never crashes the
// agent. (stderr is used because Load runs before the slog logger exists.)
func envInt(key string, cur int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return cur
	}
	if parsed, err := strconv.Atoi(raw); err == nil {
		return parsed
	}
	fmt.Fprintf(os.Stderr, "config: %s=%q is not an integer, keeping %d\n", key, raw, cur)
	return cur
}
|
|
|
|
// Validate checks the config is usable for talking to the API.
|
|
func (c Config) Validate() error {
|
|
if c.Proxmox.Endpoint == "" {
|
|
return fmt.Errorf("config: proxmox.endpoint is required")
|
|
}
|
|
if c.Proxmox.Node == "" {
|
|
return fmt.Errorf("config: proxmox.node is required (confirm with `pvesh get /nodes`)")
|
|
}
|
|
if c.Proxmox.Token == "" {
|
|
return fmt.Errorf("config: proxmox.token is required (set proxmox.token or FELHOM_AGENT_PROXMOX_TOKEN)")
|
|
}
|
|
if !strings.Contains(c.Proxmox.Token, "!") || !strings.Contains(c.Proxmox.Token, "=") {
|
|
return fmt.Errorf("config: proxmox.token must be USER@REALM!TOKENID=SECRET")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Redacted returns a copy safe to log: the proxmox token and hub key are masked.
|
|
func (c Config) Redacted() Config {
|
|
if c.Proxmox.Token != "" {
|
|
c.Proxmox.Token = redactToken(c.Proxmox.Token)
|
|
}
|
|
if c.Hub.APIKey != "" {
|
|
c.Hub.APIKey = "********"
|
|
}
|
|
return c
|
|
}
|
|
|
|
// WithDefaults fills zero-valued hub timing fields. Applied at client/loop
|
|
// construction so programmatic configs (not from Default()) still get sane values.
|
|
func (h HubConfig) WithDefaults() HubConfig {
|
|
if h.PollSeconds == 0 {
|
|
h.PollSeconds = 900
|
|
}
|
|
if h.TimeoutSeconds == 0 {
|
|
h.TimeoutSeconds = 30
|
|
}
|
|
return h
|
|
}
|
|
|
|
// Validate checks the hub config is usable for the daemon / --selftest=hub. It is
|
|
// separate from Config.Validate (proxmox-only) so --selftest=read|task still runs
|
|
// without hub config.
|
|
func (h HubConfig) Validate() error {
|
|
if h.URL == "" {
|
|
return fmt.Errorf("config: hub.url is required (set hub.url or FELHOM_AGENT_HUB_URL)")
|
|
}
|
|
if h.HostID == "" {
|
|
return fmt.Errorf("config: hub.host_id is required")
|
|
}
|
|
if h.APIKey == "" {
|
|
return fmt.Errorf("config: hub.api_key is required (set hub.api_key or FELHOM_AGENT_HUB_API_KEY)")
|
|
}
|
|
u, err := url.Parse(h.URL)
|
|
if err != nil {
|
|
return fmt.Errorf("config: hub.url is not a valid URL: %w", err)
|
|
}
|
|
switch u.Scheme {
|
|
case "https":
|
|
// always fine
|
|
case "http":
|
|
if !isLoopbackHost(u.Hostname()) {
|
|
return fmt.Errorf("config: hub.url must be https:// (http:// only allowed for loopback in tests)")
|
|
}
|
|
default:
|
|
return fmt.Errorf("config: hub.url must be https:// (got scheme %q)", u.Scheme)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// isLoopbackHost reports whether host (a bare host name or IP literal, without
// port) refers to the local machine: either the name "localhost" or an IP
// address in a loopback range. Any other name — including names that merely
// contain "localhost" — is not considered loopback.
func isLoopbackHost(host string) bool {
	// Host names are case-insensitive and are not normalized by url.Hostname,
	// so compare without regard to case.
	if strings.EqualFold(host, "localhost") {
		return true
	}
	if ip := net.ParseIP(host); ip != nil {
		return ip.IsLoopback()
	}
	return false
}
|
|
|
|
// redactToken returns tok with its secret part masked. For a token of the form
// "USER@REALM!TOKENID=SECRET" the public "USER@REALM!TOKENID=" prefix is kept
// and everything after it is replaced by a fixed mask. A string without any
// "=" is masked entirely.
//
// The split is made at the FIRST "=" so that a secret which itself contains
// "=" is never partially revealed; redaction errs on the side of hiding more.
func redactToken(tok string) string {
	if i := strings.Index(tok, "="); i >= 0 {
		return tok[:i+1] + "********"
	}
	return "********"
}
|