feat(hub): host-report client + collector + first daemon loop (slice 3, v0.3.0)

internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to
the hub (the heartbeat; no separate ping).

- HostReport wire contract (shared field-for-field with the hub ingest): host
  metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/
  pbs/audit collections DEFINED but emitted empty (slices 5/6 fill).
- Collector over a read-only proxmoxReader (adapted to the real proxmox surface;
  no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard
  (skip POST); per-guest GuestConfig fail = status "unknown", still report.
- Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed
  TransportError/HTTPError, token never in errors.
- Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient
  to collect/report errors, clean ctx-cancel shutdown.
- ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/
  has_signed_ops parsed-but-ignored (slice 4).
- config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate +
  WithDefaults + hub-key redaction; example config updated.
- main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0.

Tests: report serialization, client (incl. token-redaction), collector partial-
failure, loop continuation+interval adoption, config. internal/proxmox + internal/
authz untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 16:20:09 +02:00
parent f0fee7e193
commit ab77fa3544
16 changed files with 1352 additions and 91 deletions
+144
View File
@@ -0,0 +1,144 @@
package hub
import (
"context"
"fmt"
"log/slog"
"time"
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
)
// proxmoxReader is the read-only subset of the Proxmox client that the collector
// needs. Signatures match the REAL internal/proxmox.Client surface (slice 1): the
// node is held by the Client (no per-call node arg), reads return values (not
// pointers), and the guest type is proxmox.Guest. (The task's sketch used
// node-arg/pointer/LXC shapes; this was adapted to the actual exports per its
// instruction, and no proxmox changes were needed: ListLXC already carries
// status/maxmem/maxdisk, and GuestConfig carries cores.)
type proxmoxReader interface {
	// Node returns the node name; the collector copies it into HostMetrics.Node.
	Node() string
	// NodeStatus returns the node-level readings (CPU, memory, root filesystem,
	// load average, uptime) that the host section of the report is built from.
	NodeStatus(ctx context.Context) (proxmox.NodeStatus, error)
	// ListLXC returns the guest list; the collector reads VMID, Name, Status,
	// MaxMem and MaxDisk from each entry.
	ListLXC(ctx context.Context) ([]proxmox.Guest, error)
	// GuestConfig returns the configuration of a single guest; the collector
	// reads only Cores from it.
	GuestConfig(ctx context.Context, vmid int) (proxmox.GuestConfig, error)
}
// Collector builds a HostReport from read-only sources. All deps are behind narrow
// interfaces for unit testing.
type Collector struct {
	// px is the read-only Proxmox source: node status, guest list and per-guest
	// configuration.
	px proxmoxReader
	// cf probes cloudflared. It may be nil, in which case the reported
	// cloudflared status is "unknown".
	cf CloudflaredProber
	// hostID is copied verbatim into HostReport.HostID.
	hostID string
	// agentVersion is copied verbatim into HostReport.AgentVersion.
	agentVersion string
	// logger receives warnings about non-fatal, degraded collection steps.
	logger *slog.Logger
	// now supplies the instant used for HostReport.ReportedAt; NewCollector sets
	// it to the current time in UTC.
	now func() time.Time
}
// NewCollector wires a Collector from its dependencies.
//
// hostID echoes config.Hub.HostID and agentVersion is the binary version; both
// are stored as given. A nil logger is replaced by slog.Default(). The
// collector's clock reads the current time in UTC.
func NewCollector(px proxmoxReader, cf CloudflaredProber, hostID, agentVersion string, logger *slog.Logger) *Collector {
	col := new(Collector)
	col.px, col.cf = px, cf
	col.hostID, col.agentVersion = hostID, agentVersion
	col.now = func() time.Time { return time.Now().UTC() }

	// Fall back to the process-wide logger so methods never have to nil-check.
	col.logger = logger
	if col.logger == nil {
		col.logger = slog.Default()
	}
	return col
}
// Collect assembles one HostReport on a best-effort basis.
//
// Failure handling differs per source:
//   - NodeStatus failing is a hard error: nil is returned with a wrapped error,
//     because there is no useful report to send (callers are expected to skip
//     the POST for that cycle).
//   - A per-guest GuestConfig failure only degrades that guest to
//     status="unknown" without a spec; the report is still produced.
//   - A cloudflared probe failure yields status="unknown" and is never fatal.
func (c *Collector) Collect(ctx context.Context) (*HostReport, error) {
	nodeStatus, err := c.px.NodeStatus(ctx)
	if err != nil {
		return nil, fmt.Errorf("hub: NodeStatus failed (no useful report): %w", err)
	}

	// Gather the pieces in a fixed order: timestamp, host, guests, cloudflared.
	reportedAt := c.now().Format(time.RFC3339)
	host := hostMetrics(c.px.Node(), nodeStatus)
	guests := c.collectGuests(ctx)
	tunnel := Cloudflared{Status: c.cloudflaredStatus(ctx)}

	return &HostReport{
		HostID:       c.hostID,
		ReportedAt:   reportedAt,
		AgentVersion: c.agentVersion,
		Host:         host,
		Guests:       guests,
		// Defined but empty in this slice (slices 5/6 fill them). Kept non-nil
		// so they marshal as [] instead of null.
		StorageTargets: []StorageTarget{},
		Backups:        []Backup{},
		RestoreTests:   []RestoreTest{},
		PBSSnapshots:   []PBSSnapshot{},
		AuditTail:      []AuditEntry{},
		Cloudflared:    tunnel,
	}, nil
}
// hostMetrics converts a Proxmox node status into the host section of the
// report. node is stored as the node name. Memory and disk percentages are
// derived from the used/total pairs via percent, and a nil load-average slice
// is replaced by an empty one.
func hostMetrics(node string, ns proxmox.NodeStatus) HostMetrics {
	var out HostMetrics

	out.Node = node
	out.UptimeSeconds = ns.Uptime
	out.CPUPercent = ns.CPU * 100 // PVE cpu is a 0..1 fraction

	out.MemoryTotalBytes = ns.Memory.Total
	out.MemoryUsedBytes = ns.Memory.Used
	out.MemoryPercent = percent(ns.Memory.Used, ns.Memory.Total)

	out.DiskTotalBytes = ns.RootFS.Total
	out.DiskUsedBytes = ns.RootFS.Used
	out.DiskPercent = percent(ns.RootFS.Used, ns.RootFS.Total)

	// Never leave LoadAvg nil: substitute an empty slice.
	if out.LoadAvg = ns.LoadAvg; out.LoadAvg == nil {
		out.LoadAvg = []string{}
	}
	return out
}
// collectGuests returns one report entry per guest listed by ListLXC.
//
// A ListLXC failure is logged and produces an empty, non-nil slice: a report
// without guests still carries host liveness. For each listed guest,
// GuestConfig supplies the core count while the memory and disk sizes come from
// the list entry. If GuestConfig fails, that guest is still reported, but with
// status "unknown" and no spec.
func (c *Collector) collectGuests(ctx context.Context) []Guest {
	listed, listErr := c.px.ListLXC(ctx)
	if listErr != nil {
		// Not fatal: a report with no guest list still carries host liveness.
		c.logger.Warn("hub: ListLXC failed; reporting no guests", "err", listErr)
		return []Guest{}
	}

	out := make([]Guest, 0, len(listed))
	for i := range listed {
		item := listed[i]
		guest := Guest{VMID: item.VMID, Name: item.Name, Status: item.Status, ControllerVersion: ""}

		conf, confErr := c.px.GuestConfig(ctx, item.VMID)
		if confErr != nil {
			c.logger.Warn("hub: GuestConfig failed; guest degraded to unknown",
				"vmid", item.VMID, "err", confErr)
			// Spec stays nil, so the degraded guest is reported without one.
			guest.Status = "unknown"
			out = append(out, guest)
			continue
		}

		// Cores come from the config; memory/disk come from the list entry.
		guest.Spec = &GuestSpec{
			Cores:       conf.Cores,
			MemoryBytes: item.MaxMem,
			DiskBytes:   item.MaxDisk,
		}
		out = append(out, guest)
	}
	return out
}
// cloudflaredStatus returns the status reported by the cloudflared prober, or
// "unknown" when no prober is configured, the probe returns an error, or the
// probe returns an empty status. The latter two cases are logged as a warning.
func (c *Collector) cloudflaredStatus(ctx context.Context) string {
	const fallback = "unknown"

	if c.cf == nil {
		return fallback
	}

	status, probeErr := c.cf.Status(ctx)
	if probeErr == nil && status != "" {
		return status
	}

	c.logger.Warn("hub: cloudflared probe failed", "err", probeErr)
	return fallback
}
// percent returns used expressed as a percentage of total. A zero or negative
// total yields 0 instead of dividing by zero. The result is not clamped: used
// values outside [0, total] produce percentages outside [0, 100].
func percent(used, total int64) float64 {
	if total < 1 {
		return 0
	}
	fraction := float64(used) / float64(total)
	return fraction * 100
}