ab77fa3544
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to the hub (the heartbeat; no separate ping). - HostReport wire contract (shared field-for-field with the hub ingest): host metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/pbs/audit collections DEFINED but emitted empty (slices 5/6 fill). - Collector over a read-only proxmoxReader (adapted to the real proxmox surface; no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard (skip POST); per-guest GuestConfig fail = status "unknown", still report. - Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed TransportError/HTTPError, token never in errors. - Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient to collect/report errors, clean ctx-cancel shutdown. - ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/has_signed_ops parsed-but-ignored (slice 4). - config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate + WithDefaults + hub-key redaction; example config updated. - main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0. Tests: report serialization, client (incl. token-redaction), collector partial-failure, loop continuation+interval adoption, config. internal/proxmox + internal/authz untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
145 lines
4.5 KiB
Go
145 lines
4.5 KiB
Go
package hub
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"gitea.dooplex.hu/admin/felhom-agent/internal/proxmox"
|
|
)
|
|
|
|
// proxmoxReader is the read-only subset the collector needs. Signatures match the
|
|
// REAL internal/proxmox.Client surface (slice 1): the node is held by the Client
|
|
// (no per-call node arg), reads return values (not pointers), and the guest type is
|
|
// proxmox.Guest. (The task's sketch used node-arg/pointer/LXC shapes; adapted to
|
|
// the actual exports per its instruction — no proxmox changes were needed: ListLXC
|
|
// already carries status/maxmem/maxdisk, GuestConfig carries cores.)
|
|
type proxmoxReader interface {
|
|
Node() string
|
|
NodeStatus(ctx context.Context) (proxmox.NodeStatus, error)
|
|
ListLXC(ctx context.Context) ([]proxmox.Guest, error)
|
|
GuestConfig(ctx context.Context, vmid int) (proxmox.GuestConfig, error)
|
|
}
|
|
|
|
// Collector builds a HostReport from read-only sources. All deps are behind narrow
|
|
// interfaces for unit testing.
|
|
type Collector struct {
|
|
px proxmoxReader
|
|
cf CloudflaredProber
|
|
hostID string
|
|
agentVersion string
|
|
logger *slog.Logger
|
|
now func() time.Time
|
|
}
|
|
|
|
// NewCollector builds a collector. hostID echoes config.Hub.HostID; agentVersion is
|
|
// the binary version.
|
|
func NewCollector(px proxmoxReader, cf CloudflaredProber, hostID, agentVersion string, logger *slog.Logger) *Collector {
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
return &Collector{
|
|
px: px,
|
|
cf: cf,
|
|
hostID: hostID,
|
|
agentVersion: agentVersion,
|
|
logger: logger,
|
|
now: func() time.Time { return time.Now().UTC() },
|
|
}
|
|
}
|
|
|
|
// Collect builds the report. Best-effort liveness: a failed NodeStatus is a hard
|
|
// error (no useful report — the cycle skips the POST); a failed per-guest
|
|
// GuestConfig degrades that guest to status="unknown" without spec but still sends;
|
|
// a cloudflared probe failure yields status="unknown" and is never fatal.
|
|
func (c *Collector) Collect(ctx context.Context) (*HostReport, error) {
|
|
ns, err := c.px.NodeStatus(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("hub: NodeStatus failed (no useful report): %w", err)
|
|
}
|
|
|
|
report := &HostReport{
|
|
HostID: c.hostID,
|
|
ReportedAt: c.now().Format(time.RFC3339),
|
|
AgentVersion: c.agentVersion,
|
|
Host: hostMetrics(c.px.Node(), ns),
|
|
Guests: c.collectGuests(ctx),
|
|
// Defined-but-empty this slice (slices 5/6). Non-nil so they marshal as [].
|
|
StorageTargets: []StorageTarget{},
|
|
Backups: []Backup{},
|
|
RestoreTests: []RestoreTest{},
|
|
PBSSnapshots: []PBSSnapshot{},
|
|
AuditTail: []AuditEntry{},
|
|
Cloudflared: Cloudflared{Status: c.cloudflaredStatus(ctx)},
|
|
}
|
|
return report, nil
|
|
}
|
|
|
|
func hostMetrics(node string, ns proxmox.NodeStatus) HostMetrics {
|
|
h := HostMetrics{
|
|
Node: node,
|
|
CPUPercent: ns.CPU * 100, // PVE cpu is a 0..1 fraction
|
|
MemoryTotalBytes: ns.Memory.Total,
|
|
MemoryUsedBytes: ns.Memory.Used,
|
|
DiskTotalBytes: ns.RootFS.Total,
|
|
DiskUsedBytes: ns.RootFS.Used,
|
|
LoadAvg: ns.LoadAvg,
|
|
UptimeSeconds: ns.Uptime,
|
|
}
|
|
h.MemoryPercent = percent(ns.Memory.Used, ns.Memory.Total)
|
|
h.DiskPercent = percent(ns.RootFS.Used, ns.RootFS.Total)
|
|
if h.LoadAvg == nil {
|
|
h.LoadAvg = []string{}
|
|
}
|
|
return h
|
|
}
|
|
|
|
func (c *Collector) collectGuests(ctx context.Context) []Guest {
|
|
lxc, err := c.px.ListLXC(ctx)
|
|
if err != nil {
|
|
// Not fatal: a report with no guest list still carries host liveness.
|
|
c.logger.Warn("hub: ListLXC failed; reporting no guests", "err", err)
|
|
return []Guest{}
|
|
}
|
|
guests := make([]Guest, 0, len(lxc))
|
|
for _, g := range lxc {
|
|
entry := Guest{VMID: g.VMID, Name: g.Name, Status: g.Status, ControllerVersion: ""}
|
|
// GuestConfig supplies cores; memory/disk come from the list entry (bytes).
|
|
cfg, err := c.px.GuestConfig(ctx, g.VMID)
|
|
if err != nil {
|
|
c.logger.Warn("hub: GuestConfig failed; guest degraded to unknown",
|
|
"vmid", g.VMID, "err", err)
|
|
entry.Status = "unknown"
|
|
entry.Spec = nil // omitted
|
|
} else {
|
|
entry.Spec = &GuestSpec{
|
|
Cores: cfg.Cores,
|
|
MemoryBytes: g.MaxMem,
|
|
DiskBytes: g.MaxDisk,
|
|
}
|
|
}
|
|
guests = append(guests, entry)
|
|
}
|
|
return guests
|
|
}
|
|
|
|
func (c *Collector) cloudflaredStatus(ctx context.Context) string {
|
|
if c.cf == nil {
|
|
return "unknown"
|
|
}
|
|
st, err := c.cf.Status(ctx)
|
|
if err != nil || st == "" {
|
|
c.logger.Warn("hub: cloudflared probe failed", "err", err)
|
|
return "unknown"
|
|
}
|
|
return st
|
|
}
|
|
|
|
// percent expresses used as a percentage of total. A total that is zero or
// negative yields 0, which avoids dividing by zero. The result is not clamped,
// so used > total gives a value above 100.
func percent(used, total int64) float64 {
	if total > 0 {
		return float64(used) / float64(total) * 100
	}
	return 0
}
|