Files
felhom-agent/internal/hub/loop.go
T
admin ab77fa3544 feat(hub): host-report client + collector + first daemon loop (slice 3, v0.3.0)
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to
the hub (the heartbeat; no separate ping).

- HostReport wire contract (shared field-for-field with the hub ingest): host
  metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/
  pbs/audit collections DEFINED but emitted empty (slices 5/6 fill).
- Collector over a read-only proxmoxReader (adapted to the real proxmox surface;
  no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard
  (skip POST); per-guest GuestConfig fail = status "unknown", still report.
- Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed
  TransportError/HTTPError, token never in errors.
- Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient
  to collect/report errors, clean ctx-cancel shutdown.
- ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/
  has_signed_ops parsed-but-ignored (slice 4).
- config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate +
  WithDefaults + hub-key redaction; example config updated.
- main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0.

Tests: report serialization, client (incl. token-redaction), collector partial-
failure, loop continuation+interval adoption, config. internal/proxmox + internal/
authz untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 16:20:09 +02:00

108 lines
3.3 KiB
Go

package hub
import (
"context"
"log/slog"
"time"
)
// Interval clamp bounds (locked decision 3). clampInterval forces every
// hub-requested poll interval into [MinPollSeconds, MaxPollSeconds].
const (
// MinPollSeconds is the shortest poll interval, in seconds, that clampInterval lets through.
MinPollSeconds = 60
// MaxPollSeconds is the longest poll interval, in seconds, that clampInterval lets through.
MaxPollSeconds = 3600
)
// reporter and collectorIface are the loop's deps as interfaces (tests inject fakes).
//
// reporter is the sending side: Report hands one host report to the hub and
// returns the control envelope the hub answered with. The loop treats a non-nil
// error as "this cycle's report failed" and keeps its current interval.
type reporter interface {
Report(ctx context.Context, r *HostReport) (*ControlEnvelope, error)
}
// collectorIface is the gathering side of the loop's dependencies: Collect builds
// the host report for one cycle. The loop treats a non-nil error as "skip this
// cycle's report" and carries on with the next tick.
type collectorIface interface {
Collect(ctx context.Context) (*HostReport, error)
}
// Loop is the agent's first daemon run loop: collect a host-report, POST it, adopt
// the hub's cadence, repeat. It is resilient — a collect or report error is logged
// and the loop continues (the data plane is independent of the agent; a hub outage
// must not kill it). There are NO Proxmox mutations here (read-only report), so no
// per-guest work queue yet (that lands with reconcile, slice 4).
type Loop struct {
// collector produces the host report at the start of every cycle.
collector collectorIface
// client delivers the report and returns the hub's control envelope.
client reporter
// interval is the starting cadence; Run reads it once and afterwards tracks
// the effective interval in a local variable, so this field is never updated.
interval time.Duration
// logger receives all loop diagnostics; NewLoop substitutes slog.Default() for nil.
logger *slog.Logger
}
// NewLoop wires a collector and a reporter into a Loop.
//
// interval is only the initial cadence: each successful cycle may replace it with
// the value the hub sends back in the control envelope. A nil logger is replaced
// by slog.Default() so the loop can always log.
func NewLoop(collector collectorIface, client reporter, interval time.Duration, logger *slog.Logger) *Loop {
	loop := &Loop{
		collector: collector,
		client:    client,
		interval:  interval,
		logger:    logger,
	}
	if loop.logger == nil {
		loop.logger = slog.Default()
	}
	return loop
}
// Run reports immediately, then on each tick, until ctx is cancelled.
//
// It always returns nil: cancellation is the only exit and is treated as a clean
// shutdown, while collect/report failures are absorbed by cycle. Whenever a cycle
// hands back a different interval, the ticker is reset to the new cadence.
//
// A non-positive starting interval is replaced by MinPollSeconds, because
// time.NewTicker panics on such a duration and the starting value reaches the
// ticker unchanged whenever the first cycle fails or the hub sends no override.
func (l *Loop) Run(ctx context.Context) error {
	interval := l.interval
	if interval <= 0 {
		fallback := MinPollSeconds * time.Second
		l.logger.Warn("hub: non-positive poll interval; using minimum",
			"configured", interval, "applied", fallback)
		interval = fallback
	}

	interval = l.cycle(ctx, interval) // immediate first report

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			l.logger.Info("hub: loop shutting down", "reason", ctx.Err())
			return nil
		case <-ticker.C:
			// select picks at random when a tick and cancellation are both ready;
			// don't start a cycle that is doomed to fail — loop back and let the
			// ctx.Done() branch handle the shutdown.
			if ctx.Err() != nil {
				continue
			}
			next := l.cycle(ctx, interval)
			if next != interval {
				l.logger.Info("hub: poll interval changed", "from", interval, "to", next)
				interval = next
				ticker.Reset(interval)
			}
		}
	}
}
// cycle runs one collect→report→adopt pass and returns the interval to use next.
//
// It never returns an error: a failed collect or report is logged and current is
// returned unchanged, so the loop keeps running. current is also returned when the
// envelope carries no poll_interval_seconds. Otherwise the requested value is
// clamped to [MinPollSeconds, MaxPollSeconds] (with a warning if it was out of
// range) and returned.
//
// A nil report or nil envelope that arrives without an error is treated like a
// failure instead of being dereferenced, so a misbehaving collector or reporter
// cannot panic the loop.
func (l *Loop) cycle(ctx context.Context, current time.Duration) time.Duration {
	report, err := l.collector.Collect(ctx)
	if err != nil {
		l.logger.Warn("hub: collect failed; skipping this cycle's report", "err", err)
		return current
	}
	if report == nil {
		l.logger.Warn("hub: collect returned no report; skipping this cycle's report")
		return current
	}

	env, err := l.client.Report(ctx, report)
	if err != nil {
		l.logger.Warn("hub: report failed; keeping current interval", "err", err)
		return current
	}
	if env == nil {
		l.logger.Warn("hub: report returned no control envelope; keeping current interval")
		return current
	}

	l.logger.Debug("hub: report sent",
		"guests", len(report.Guests),
		// reserved/forward-compat envelope fields — logged only, never acted on (slice 4).
		"blocked", env.Blocked, "desired_generation", env.DesiredGeneration, "has_signed_ops", env.HasSignedOps)

	if env.PollIntervalSeconds == nil {
		return current
	}
	d, clamped := clampInterval(*env.PollIntervalSeconds)
	if clamped {
		l.logger.Warn("hub: poll_interval_seconds out of range; clamped",
			"requested", *env.PollIntervalSeconds, "applied", int(d.Seconds()))
	}
	return d
}
// clampInterval converts a requested interval in seconds to a time.Duration,
// forcing it into [MinPollSeconds, MaxPollSeconds]. The boolean result is true
// when the request was outside that range and therefore had to be adjusted.
func clampInterval(sec int) (time.Duration, bool) {
	bounded := sec
	switch {
	case sec < MinPollSeconds:
		bounded = MinPollSeconds
	case sec > MaxPollSeconds:
		bounded = MaxPollSeconds
	}
	return time.Duration(bounded) * time.Second, bounded != sec
}