feat(hub): host-report client + collector + first daemon loop (slice 3, v0.3.0)
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to the hub (the heartbeat; no separate ping). - HostReport wire contract (shared field-for-field with the hub ingest): host metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/pbs/audit collections DEFINED but emitted empty (slices 5/6 fill). - Collector over a read-only proxmoxReader (adapted to the real proxmox surface; no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard (skip POST); per-guest GuestConfig fail = status "unknown", still report. - Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed TransportError/HTTPError, token never in errors. - Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient to collect/report errors, clean ctx-cancel shutdown. - ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/has_signed_ops parsed-but-ignored (slice 4). - config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate + WithDefaults + hub-key redaction; example config updated. - main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0. Tests: report serialization, client (incl. token-redaction), collector partial-failure, loop continuation+interval adoption, config. internal/proxmox + internal/authz untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
package hub
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
|
||||
)
|
||||
|
||||
// proxmoxReader is the read-only subset the collector needs. Signatures match the
|
||||
// REAL internal/proxmox.Client surface (slice 1): the node is held by the Client
|
||||
// (no per-call node arg), reads return values (not pointers), and the guest type is
|
||||
// proxmox.Guest. (The task's sketch used node-arg/pointer/LXC shapes; adapted to
|
||||
// the actual exports per its instruction — no proxmox changes were needed: ListLXC
|
||||
// already carries status/maxmem/maxdisk, GuestConfig carries cores.)
|
||||
type proxmoxReader interface {
|
||||
Node() string
|
||||
NodeStatus(ctx context.Context) (proxmox.NodeStatus, error)
|
||||
ListLXC(ctx context.Context) ([]proxmox.Guest, error)
|
||||
GuestConfig(ctx context.Context, vmid int) (proxmox.GuestConfig, error)
|
||||
}
|
||||
|
||||
// Collector builds a HostReport from read-only sources. All deps are behind narrow
|
||||
// interfaces for unit testing.
|
||||
type Collector struct {
|
||||
px proxmoxReader
|
||||
cf CloudflaredProber
|
||||
hostID string
|
||||
agentVersion string
|
||||
logger *slog.Logger
|
||||
now func() time.Time
|
||||
}
|
||||
|
||||
// NewCollector builds a collector. hostID echoes config.Hub.HostID; agentVersion is
|
||||
// the binary version.
|
||||
func NewCollector(px proxmoxReader, cf CloudflaredProber, hostID, agentVersion string, logger *slog.Logger) *Collector {
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
return &Collector{
|
||||
px: px,
|
||||
cf: cf,
|
||||
hostID: hostID,
|
||||
agentVersion: agentVersion,
|
||||
logger: logger,
|
||||
now: func() time.Time { return time.Now().UTC() },
|
||||
}
|
||||
}
|
||||
|
||||
// Collect builds the report. Best-effort liveness: a failed NodeStatus is a hard
|
||||
// error (no useful report — the cycle skips the POST); a failed per-guest
|
||||
// GuestConfig degrades that guest to status="unknown" without spec but still sends;
|
||||
// a cloudflared probe failure yields status="unknown" and is never fatal.
|
||||
func (c *Collector) Collect(ctx context.Context) (*HostReport, error) {
|
||||
ns, err := c.px.NodeStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("hub: NodeStatus failed (no useful report): %w", err)
|
||||
}
|
||||
|
||||
report := &HostReport{
|
||||
HostID: c.hostID,
|
||||
ReportedAt: c.now().Format(time.RFC3339),
|
||||
AgentVersion: c.agentVersion,
|
||||
Host: hostMetrics(c.px.Node(), ns),
|
||||
Guests: c.collectGuests(ctx),
|
||||
// Defined-but-empty this slice (slices 5/6). Non-nil so they marshal as [].
|
||||
StorageTargets: []StorageTarget{},
|
||||
Backups: []Backup{},
|
||||
RestoreTests: []RestoreTest{},
|
||||
PBSSnapshots: []PBSSnapshot{},
|
||||
AuditTail: []AuditEntry{},
|
||||
Cloudflared: Cloudflared{Status: c.cloudflaredStatus(ctx)},
|
||||
}
|
||||
return report, nil
|
||||
}
|
||||
|
||||
func hostMetrics(node string, ns proxmox.NodeStatus) HostMetrics {
|
||||
h := HostMetrics{
|
||||
Node: node,
|
||||
CPUPercent: ns.CPU * 100, // PVE cpu is a 0..1 fraction
|
||||
MemoryTotalBytes: ns.Memory.Total,
|
||||
MemoryUsedBytes: ns.Memory.Used,
|
||||
DiskTotalBytes: ns.RootFS.Total,
|
||||
DiskUsedBytes: ns.RootFS.Used,
|
||||
LoadAvg: ns.LoadAvg,
|
||||
UptimeSeconds: ns.Uptime,
|
||||
}
|
||||
h.MemoryPercent = percent(ns.Memory.Used, ns.Memory.Total)
|
||||
h.DiskPercent = percent(ns.RootFS.Used, ns.RootFS.Total)
|
||||
if h.LoadAvg == nil {
|
||||
h.LoadAvg = []string{}
|
||||
}
|
||||
return h
|
||||
}
|
||||
|
||||
func (c *Collector) collectGuests(ctx context.Context) []Guest {
|
||||
lxc, err := c.px.ListLXC(ctx)
|
||||
if err != nil {
|
||||
// Not fatal: a report with no guest list still carries host liveness.
|
||||
c.logger.Warn("hub: ListLXC failed; reporting no guests", "err", err)
|
||||
return []Guest{}
|
||||
}
|
||||
guests := make([]Guest, 0, len(lxc))
|
||||
for _, g := range lxc {
|
||||
entry := Guest{VMID: g.VMID, Name: g.Name, Status: g.Status, ControllerVersion: ""}
|
||||
// GuestConfig supplies cores; memory/disk come from the list entry (bytes).
|
||||
cfg, err := c.px.GuestConfig(ctx, g.VMID)
|
||||
if err != nil {
|
||||
c.logger.Warn("hub: GuestConfig failed; guest degraded to unknown",
|
||||
"vmid", g.VMID, "err", err)
|
||||
entry.Status = "unknown"
|
||||
entry.Spec = nil // omitted
|
||||
} else {
|
||||
entry.Spec = &GuestSpec{
|
||||
Cores: cfg.Cores,
|
||||
MemoryBytes: g.MaxMem,
|
||||
DiskBytes: g.MaxDisk,
|
||||
}
|
||||
}
|
||||
guests = append(guests, entry)
|
||||
}
|
||||
return guests
|
||||
}
|
||||
|
||||
func (c *Collector) cloudflaredStatus(ctx context.Context) string {
|
||||
if c.cf == nil {
|
||||
return "unknown"
|
||||
}
|
||||
st, err := c.cf.Status(ctx)
|
||||
if err != nil || st == "" {
|
||||
c.logger.Warn("hub: cloudflared probe failed", "err", err)
|
||||
return "unknown"
|
||||
}
|
||||
return st
|
||||
}
|
||||
|
||||
// percent returns used as a percentage of total. A zero or negative total yields 0
// rather than dividing by zero. The result is not clamped, so used > total gives a
// value above 100.
func percent(used, total int64) float64 {
	if total > 0 {
		ratio := float64(used) / float64(total)
		return ratio * 100
	}
	return 0
}
|
||||
Reference in New Issue
Block a user