Files
felhom-agent/internal/hub/collect.go
T
admin e68a7af4d3 fix(agent): slice-3 follow-ups — keep run-status on config fail, selftest usage, contract golden (v0.3.1)
- collect: a per-guest GuestConfig failure preserves the ListLXC run-status (only
  spec dropped); empty status normalized to "unknown". Test asserts preserved
  "running" + nil spec.
- main: --selftest usage error now reads (want read|task|hub).
- contract: testdata/host-report.golden.json + TestHostReport_ContractMatchesGolden
  (field-name key-set check vs golden; byte-identical with the hub copy).
- version 0.3.0 -> 0.3.1.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 18:29:05 +02:00

150 lines
4.7 KiB
Go

package hub
import (
"context"
"fmt"
"log/slog"
"time"
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
)
// proxmoxReader is the narrow, read-only view of the Proxmox client that the
// collector depends on. Its method set mirrors the real internal/proxmox.Client
// exports, so that client can be passed in unchanged:
//
//   - the node is held by the client, so no method takes a node argument;
//   - reads return values rather than pointers;
//   - guests are described by proxmox.Guest.
//
// No proxmox changes were needed to support the collector: the ListLXC entries
// already carry status/maxmem/maxdisk, and GuestConfig carries cores.
type proxmoxReader interface {
	// Node returns the name of the node the client is bound to.
	Node() string

	// NodeStatus returns the node's current status, by value.
	NodeStatus(ctx context.Context) (proxmox.NodeStatus, error)

	// ListLXC returns the LXC guests on the node.
	ListLXC(ctx context.Context) ([]proxmox.Guest, error)

	// GuestConfig returns the configuration of the guest identified by vmid.
	GuestConfig(ctx context.Context, vmid int) (proxmox.GuestConfig, error)
}
// Collector assembles a HostReport from read-only sources. Every dependency sits
// behind a narrow interface (or a plain func) so it can be replaced in unit tests.
type Collector struct {
	// px is the read-only Proxmox source for node status and guest data.
	px proxmoxReader
	// cf probes cloudflared; a nil prober is reported as status "unknown".
	cf CloudflaredProber
	// hostID is copied verbatim into the report's HostID.
	hostID string
	// agentVersion is copied verbatim into the report's AgentVersion.
	agentVersion string
	// logger receives warnings about degraded (non-fatal) collection steps.
	logger *slog.Logger
	// now supplies the report timestamp; NewCollector sets it to the current UTC time.
	now func() time.Time
}
// NewCollector wires up a Collector.
//
// hostID echoes config.Hub.HostID and agentVersion is the binary version; both are
// copied into every report. A nil logger is replaced with slog.Default(). The
// collector's clock is set to the current time in UTC.
func NewCollector(px proxmoxReader, cf CloudflaredProber, hostID, agentVersion string, logger *slog.Logger) *Collector {
	collector := &Collector{
		px:           px,
		cf:           cf,
		hostID:       hostID,
		agentVersion: agentVersion,
		logger:       logger,
		now:          func() time.Time { return time.Now().UTC() },
	}
	// Fall back to the process-wide logger so methods can log unconditionally.
	if collector.logger == nil {
		collector.logger = slog.Default()
	}
	return collector
}
// Collect builds the report on a best-effort liveness basis:
//
//   - a failed NodeStatus is a hard error (no useful report — the cycle skips the
//     POST);
//   - a failed ListLXC is logged and the report is sent with an empty guest list;
//   - a failed per-guest GuestConfig drops only that guest's spec — the run-status
//     from ListLXC is kept (an empty run-status is reported as "unknown");
//   - a cloudflared probe failure yields status="unknown" and is never fatal.
func (c *Collector) Collect(ctx context.Context) (*HostReport, error) {
	nodeStatus, err := c.px.NodeStatus(ctx)
	if err != nil {
		return nil, fmt.Errorf("hub: NodeStatus failed (no useful report): %w", err)
	}

	// Gather the parts in a fixed order: timestamp, host, guests, cloudflared.
	reportedAt := c.now().Format(time.RFC3339)
	host := hostMetrics(c.px.Node(), nodeStatus)
	guests := c.collectGuests(ctx)
	tunnel := Cloudflared{Status: c.cloudflaredStatus(ctx)}

	return &HostReport{
		HostID:       c.hostID,
		ReportedAt:   reportedAt,
		AgentVersion: c.agentVersion,
		Host:         host,
		Guests:       guests,
		// Defined-but-empty this slice (slices 5/6). Non-nil so they marshal as [].
		StorageTargets: []StorageTarget{},
		Backups:        []Backup{},
		RestoreTests:   []RestoreTest{},
		PBSSnapshots:   []PBSSnapshot{},
		AuditTail:      []AuditEntry{},
		Cloudflared:    tunnel,
	}, nil
}
// hostMetrics converts a Proxmox node status into the report's HostMetrics for
// the given node name.
//
// CPU is scaled from PVE's 0..1 fraction to a percentage. Memory and disk
// percentages are derived from the used/total pairs (disk figures come from the
// node's root filesystem). A nil load-average slice is replaced with an empty one.
func hostMetrics(node string, ns proxmox.NodeStatus) HostMetrics {
	metrics := HostMetrics{
		Node:             node,
		CPUPercent:       ns.CPU * 100, // PVE cpu is a 0..1 fraction
		MemoryTotalBytes: ns.Memory.Total,
		MemoryUsedBytes:  ns.Memory.Used,
		MemoryPercent:    percent(ns.Memory.Used, ns.Memory.Total),
		DiskTotalBytes:   ns.RootFS.Total,
		DiskUsedBytes:    ns.RootFS.Used,
		DiskPercent:      percent(ns.RootFS.Used, ns.RootFS.Total),
		LoadAvg:          ns.LoadAvg,
		UptimeSeconds:    ns.Uptime,
	}
	// Never hand back a nil load-average slice.
	if metrics.LoadAvg == nil {
		metrics.LoadAvg = []string{}
	}
	return metrics
}
// collectGuests lists the node's LXC guests and converts each into a report Guest.
//
// Nothing here is fatal. If ListLXC fails, a warning is logged and an empty,
// non-nil slice is returned. If GuestConfig fails for one guest, a warning is
// logged and that guest is still reported with its run-status but without a spec.
func (c *Collector) collectGuests(ctx context.Context) []Guest {
	containers, listErr := c.px.ListLXC(ctx)
	if listErr != nil {
		// Not fatal: a report with no guest list still carries host liveness.
		c.logger.Warn("hub: ListLXC failed; reporting no guests", "err", listErr)
		return []Guest{}
	}

	out := make([]Guest, 0, len(containers))
	for _, ct := range containers {
		item := Guest{VMID: ct.VMID, Name: ct.Name, Status: ct.Status, ControllerVersion: ""}
		// An empty run-status goes out as "unknown" so the wire value is always one
		// of running|stopped|unknown (matches the hub handler's empty→unknown default).
		if item.Status == "" {
			item.Status = "unknown"
		}

		// Cores come from GuestConfig; memory/disk come from the list entry (bytes).
		// When the config read fails, Spec stays nil and the run-status is untouched.
		if conf, confErr := c.px.GuestConfig(ctx, ct.VMID); confErr == nil {
			item.Spec = &GuestSpec{
				Cores:       conf.Cores,
				MemoryBytes: ct.MaxMem,
				DiskBytes:   ct.MaxDisk,
			}
		} else {
			c.logger.Warn("hub: GuestConfig failed; spec omitted (run-status kept)",
				"vmid", ct.VMID, "err", confErr)
		}

		out = append(out, item)
	}
	return out
}
// cloudflaredStatus returns the cloudflared status string for the report.
//
// It never fails. The result is "unknown" when no prober is configured, when the
// probe returns an error, or when the probe returns an empty status; otherwise the
// probe's status is returned unchanged. The error and empty-status cases are
// logged separately so an empty answer is not reported as a failure with a nil
// error.
func (c *Collector) cloudflaredStatus(ctx context.Context) string {
	if c.cf == nil {
		return "unknown"
	}
	st, err := c.cf.Status(ctx)
	switch {
	case err != nil:
		c.logger.Warn("hub: cloudflared probe failed", "err", err)
		return "unknown"
	case st == "":
		// The probe itself succeeded; it just had nothing to say.
		c.logger.Warn("hub: cloudflared probe returned empty status")
		return "unknown"
	}
	return st
}
// percent returns used as a percentage of total. A zero or negative total yields
// 0 rather than dividing by it. The result is not clamped, so used > total gives a
// value above 100.
func percent(used, total int64) float64 {
	if total > 0 {
		return float64(used) / float64(total) * 100
	}
	return 0
}