feat(hub): host-report client + collector + first daemon loop (slice 3, v0.3.0)
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to the hub (the heartbeat; no separate ping).
- HostReport wire contract (shared field-for-field with the hub ingest): host metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/pbs/audit collections DEFINED but emitted empty (slices 5/6 fill).
- Collector over a read-only proxmoxReader (adapted to the real proxmox surface; no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard (skip POST); per-guest GuestConfig fail = status "unknown", still report.
- Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed TransportError/HTTPError, token never in errors.
- Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient to collect/report errors, clean ctx-cancel shutdown.
- ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/has_signed_ops parsed-but-ignored (slice 4).
- config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate + WithDefaults + hub-key redaction; example config updated.
- main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0.
Tests: report serialization, client (incl. token-redaction), collector partial-failure, loop continuation+interval adoption, config. internal/proxmox + internal/authz untouched.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+102
-1
@@ -12,6 +12,8 @@ package config
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/url"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -22,9 +24,22 @@ type Config struct {
|
||||
Proxmox ProxmoxConfig `json:"proxmox"`
|
||||
Privileged PrivilegedConfig `json:"privileged"`
|
||||
Authz AuthzConfig `json:"authz"`
|
||||
Hub HubConfig `json:"hub"`
|
||||
LogLevel string `json:"log_level"` // debug|info|warn|error (default info)
|
||||
}
|
||||
|
||||
// HubConfig configures the outbound hub client + daemon poll loop (internal/hub).
// The hub serves a real cert (hub.felhom.eu, cert-manager) — this is standard TLS
// (system roots), NOT the Proxmox fingerprint-pinning path.
type HubConfig struct {
	// URL is the hub base URL, e.g. "https://hub.felhom.eu".
	URL string `json:"url"`

	// HostID is the hub's primary key for this host.
	HostID string `json:"host_id"`

	// APIKey is the per-host hub key. SECRET — masked by Config.Redacted.
	APIKey string `json:"api_key"`

	// PollSeconds is the report interval; default 900. The hub may override
	// it per-cycle.
	PollSeconds int `json:"poll_seconds"`

	// TimeoutSeconds is the per-request HTTP timeout; default 30.
	TimeoutSeconds int `json:"timeout_seconds"`

	// CAFile is an optional CA bundle path; "" means system roots.
	CAFile string `json:"ca_file"`
}
|
||||
|
||||
// AuthzConfig configures operator-signed-op verification (internal/authz). The
|
||||
// pinned operator public keys are kept here as raw authorized_keys-style lines
|
||||
// (this package stays dependency-free); the authz package parses them into its
|
||||
@@ -87,6 +102,7 @@ func Default() Config {
|
||||
Proxmox: ProxmoxConfig{Endpoint: "https://127.0.0.1:8006"},
|
||||
Privileged: PrivilegedConfig{Mode: "sudo"},
|
||||
Authz: AuthzConfig{NonceStorePath: "/var/lib/felhom-agent/nonces.log"},
|
||||
Hub: HubConfig{PollSeconds: 900, TimeoutSeconds: 30},
|
||||
LogLevel: "info",
|
||||
}
|
||||
}
|
||||
@@ -134,6 +150,36 @@ func applyEnv(cfg *Config) {
|
||||
if v := os.Getenv("FELHOM_AGENT_LOG_LEVEL"); v != "" {
|
||||
cfg.LogLevel = v
|
||||
}
|
||||
// hub
|
||||
if v := os.Getenv("FELHOM_AGENT_HUB_URL"); v != "" {
|
||||
cfg.Hub.URL = v
|
||||
}
|
||||
if v := os.Getenv("FELHOM_AGENT_HUB_HOST_ID"); v != "" {
|
||||
cfg.Hub.HostID = v
|
||||
}
|
||||
if v := os.Getenv("FELHOM_AGENT_HUB_API_KEY"); v != "" {
|
||||
cfg.Hub.APIKey = v
|
||||
}
|
||||
if v := os.Getenv("FELHOM_AGENT_HUB_CA_FILE"); v != "" {
|
||||
cfg.Hub.CAFile = v
|
||||
}
|
||||
cfg.Hub.PollSeconds = envInt("FELHOM_AGENT_HUB_POLL_SECONDS", cfg.Hub.PollSeconds)
|
||||
cfg.Hub.TimeoutSeconds = envInt("FELHOM_AGENT_HUB_TIMEOUT_SECONDS", cfg.Hub.TimeoutSeconds)
|
||||
}
|
||||
|
||||
// envInt overlays an int env var onto cur.
//
// It returns cur unchanged when key is unset or empty. Otherwise the value is
// parsed as a base-10 integer after trimming surrounding whitespace (env files
// and shell exports often leave a trailing space or newline). On a parse error
// it keeps cur and prints a warning to stderr rather than crashing. (Load runs
// before the slog logger exists, hence the direct stderr write.)
func envInt(key string, cur int) int {
	raw := os.Getenv(key)
	if raw == "" {
		return cur
	}
	n, err := strconv.Atoi(strings.TrimSpace(raw))
	if err != nil {
		// Report the raw value so stray whitespace or typos are visible.
		fmt.Fprintf(os.Stderr, "config: %s=%q is not an integer, keeping %d\n", key, raw, cur)
		return cur
	}
	return n
}
|
||||
|
||||
// Validate checks the config is usable for talking to the API.
|
||||
@@ -153,14 +199,69 @@ func (c Config) Validate() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Redacted returns a copy safe to log: the token secret is masked.
|
||||
// Redacted returns a copy safe to log: the proxmox token and hub key are masked.
|
||||
func (c Config) Redacted() Config {
|
||||
if c.Proxmox.Token != "" {
|
||||
c.Proxmox.Token = redactToken(c.Proxmox.Token)
|
||||
}
|
||||
if c.Hub.APIKey != "" {
|
||||
c.Hub.APIKey = "********"
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// WithDefaults fills zero-valued hub timing fields. Applied at client/loop
|
||||
// construction so programmatic configs (not from Default()) still get sane values.
|
||||
func (h HubConfig) WithDefaults() HubConfig {
|
||||
if h.PollSeconds == 0 {
|
||||
h.PollSeconds = 900
|
||||
}
|
||||
if h.TimeoutSeconds == 0 {
|
||||
h.TimeoutSeconds = 30
|
||||
}
|
||||
return h
|
||||
}
|
||||
|
||||
// Validate checks the hub config is usable for the daemon / --selftest=hub. It is
|
||||
// separate from Config.Validate (proxmox-only) so --selftest=read|task still runs
|
||||
// without hub config.
|
||||
func (h HubConfig) Validate() error {
|
||||
if h.URL == "" {
|
||||
return fmt.Errorf("config: hub.url is required (set hub.url or FELHOM_AGENT_HUB_URL)")
|
||||
}
|
||||
if h.HostID == "" {
|
||||
return fmt.Errorf("config: hub.host_id is required")
|
||||
}
|
||||
if h.APIKey == "" {
|
||||
return fmt.Errorf("config: hub.api_key is required (set hub.api_key or FELHOM_AGENT_HUB_API_KEY)")
|
||||
}
|
||||
u, err := url.Parse(h.URL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("config: hub.url is not a valid URL: %w", err)
|
||||
}
|
||||
switch u.Scheme {
|
||||
case "https":
|
||||
// always fine
|
||||
case "http":
|
||||
if !isLoopbackHost(u.Hostname()) {
|
||||
return fmt.Errorf("config: hub.url must be https:// (http:// only allowed for loopback in tests)")
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("config: hub.url must be https:// (got scheme %q)", u.Scheme)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isLoopbackHost reports whether host (a bare hostname or IP literal, without
// port or IPv6 brackets) refers to the local machine: either the name
// "localhost" or an IP address in a loopback range.
//
// The name comparison is case-insensitive because hostnames are. Any other
// name returns false; no DNS resolution is attempted.
func isLoopbackHost(host string) bool {
	if strings.EqualFold(host, "localhost") {
		return true
	}
	if ip := net.ParseIP(host); ip != nil {
		return ip.IsLoopback()
	}
	return false
}
|
||||
|
||||
// redactToken keeps the public "USER@REALM!TOKENID=" prefix and masks the secret.
|
||||
func redactToken(tok string) string {
|
||||
if i := strings.LastIndex(tok, "="); i >= 0 {
|
||||
|
||||
@@ -36,6 +36,61 @@ func TestValidate(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRedactedMasksHubKey(t *testing.T) {
|
||||
c := Default()
|
||||
c.Hub.APIKey = "hub-secret-abcdef"
|
||||
if got := c.Redacted().Hub.APIKey; got == "hub-secret-abcdef" || got == "" {
|
||||
t.Fatalf("hub key not masked: %q", got)
|
||||
}
|
||||
if !strings.Contains(c.Hub.APIKey, "abcdef") {
|
||||
t.Error("Redacted mutated the original hub key")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHubConfigValidate(t *testing.T) {
|
||||
base := HubConfig{URL: "https://hub.felhom.eu", HostID: "h1", APIKey: "k"}
|
||||
if err := base.Validate(); err != nil {
|
||||
t.Fatalf("valid hub config rejected: %v", err)
|
||||
}
|
||||
bad := []HubConfig{
|
||||
{HostID: "h", APIKey: "k"}, // no URL
|
||||
{URL: "https://x", APIKey: "k"}, // no host
|
||||
{URL: "https://x", HostID: "h"}, // no key
|
||||
{URL: "http://hub.felhom.eu", HostID: "h", APIKey: "k"}, // http non-loopback
|
||||
{URL: "ftp://x", HostID: "h", APIKey: "k"}, // bad scheme
|
||||
}
|
||||
for i, h := range bad {
|
||||
if err := h.Validate(); err == nil {
|
||||
t.Errorf("case %d: expected validation error for %+v", i, h)
|
||||
}
|
||||
}
|
||||
// http is allowed for loopback (tests).
|
||||
if err := (HubConfig{URL: "http://127.0.0.1:8443", HostID: "h", APIKey: "k"}).Validate(); err != nil {
|
||||
t.Errorf("http loopback should be allowed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHubEnvOverlayAndDefaults(t *testing.T) {
|
||||
t.Setenv("FELHOM_AGENT_HUB_URL", "https://hub.example")
|
||||
t.Setenv("FELHOM_AGENT_HUB_HOST_ID", "env-host")
|
||||
t.Setenv("FELHOM_AGENT_HUB_API_KEY", "env-key")
|
||||
t.Setenv("FELHOM_AGENT_HUB_POLL_SECONDS", "120")
|
||||
cfg, err := Load("")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if cfg.Hub.URL != "https://hub.example" || cfg.Hub.HostID != "env-host" || cfg.Hub.APIKey != "env-key" {
|
||||
t.Errorf("hub env overlay failed: %+v", cfg.Hub)
|
||||
}
|
||||
if cfg.Hub.PollSeconds != 120 {
|
||||
t.Errorf("poll seconds = %d, want 120", cfg.Hub.PollSeconds)
|
||||
}
|
||||
// withDefaults fills zero timeout.
|
||||
if (HubConfig{}).WithDefaults().TimeoutSeconds != 30 {
|
||||
t.Error("WithDefaults should set TimeoutSeconds=30")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadFileThenEnvOverride(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "agent.json")
|
||||
|
||||
Reference in New Issue
Block a user