feat(hub): host-report client + collector + first daemon loop (slice 3, v0.3.0)
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to the hub (the heartbeat; no separate ping). - HostReport wire contract (shared field-for-field with the hub ingest): host metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/pbs/audit collections DEFINED but emitted empty (slices 5/6 fill). - Collector over a read-only proxmoxReader (adapted to the real proxmox surface; no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard (skip POST); per-guest GuestConfig fail = status "unknown", still report. - Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed TransportError/HTTPError, token never in errors. - Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient to collect/report errors, clean ctx-cancel shutdown. - ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/has_signed_ops parsed-but-ignored (slice 4). - config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate + WithDefaults + hub-key redaction; example config updated. - main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0. Tests: report serialization, client (incl. token-redaction), collector partial-failure, loop continuation+interval adoption, config. internal/proxmox + internal/authz untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,107 @@
|
||||
package hub
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log/slog"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Bounds, in seconds, for the poll interval clamp (locked decision 3).
const (
	// MinPollSeconds is the lower bound of the clamp range.
	MinPollSeconds = 60
	// MaxPollSeconds is the upper bound of the clamp range.
	MaxPollSeconds = 60 * 60
)
|
||||
|
||||
// reporter and collectorIface are the loop's deps as interfaces (tests inject fakes).
|
||||
type reporter interface {
|
||||
Report(ctx context.Context, r *HostReport) (*ControlEnvelope, error)
|
||||
}
|
||||
type collectorIface interface {
|
||||
Collect(ctx context.Context) (*HostReport, error)
|
||||
}
|
||||
|
||||
// Loop is the agent's first daemon run loop: collect a host-report, POST it, adopt
|
||||
// the hub's cadence, repeat. It is resilient — a collect or report error is logged
|
||||
// and the loop continues (the data plane is independent of the agent; a hub outage
|
||||
// must not kill it). There are NO Proxmox mutations here (read-only report), so no
|
||||
// per-guest work queue yet (that lands with reconcile, slice 4).
|
||||
type Loop struct {
|
||||
collector collectorIface
|
||||
client reporter
|
||||
interval time.Duration
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewLoop builds the loop. interval is the starting cadence (the hub may override it
|
||||
// per-cycle via the control envelope).
|
||||
func NewLoop(collector collectorIface, client reporter, interval time.Duration, logger *slog.Logger) *Loop {
|
||||
if logger == nil {
|
||||
logger = slog.Default()
|
||||
}
|
||||
return &Loop{collector: collector, client: client, interval: interval, logger: logger}
|
||||
}
|
||||
|
||||
// Run reports immediately, then on each tick, until ctx is cancelled (then nil).
|
||||
func (l *Loop) Run(ctx context.Context) error {
|
||||
interval := l.interval
|
||||
interval = l.cycle(ctx, interval) // immediate first report
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
l.logger.Info("hub: loop shutting down", "reason", ctx.Err())
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
next := l.cycle(ctx, interval)
|
||||
if next != interval {
|
||||
l.logger.Info("hub: poll interval changed", "from", interval, "to", next)
|
||||
interval = next
|
||||
ticker.Reset(interval)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cycle runs one collect→report→adopt. It never returns an error: failures are
|
||||
// logged and the current interval is kept, so the loop keeps running.
|
||||
func (l *Loop) cycle(ctx context.Context, current time.Duration) time.Duration {
|
||||
report, err := l.collector.Collect(ctx)
|
||||
if err != nil {
|
||||
l.logger.Warn("hub: collect failed; skipping this cycle's report", "err", err)
|
||||
return current
|
||||
}
|
||||
env, err := l.client.Report(ctx, report)
|
||||
if err != nil {
|
||||
l.logger.Warn("hub: report failed; keeping current interval", "err", err)
|
||||
return current
|
||||
}
|
||||
l.logger.Debug("hub: report sent",
|
||||
"guests", len(report.Guests),
|
||||
// reserved/forward-compat envelope fields — logged only, never acted on (slice 4).
|
||||
"blocked", env.Blocked, "desired_generation", env.DesiredGeneration, "has_signed_ops", env.HasSignedOps)
|
||||
|
||||
if env.PollIntervalSeconds == nil {
|
||||
return current
|
||||
}
|
||||
d, clamped := clampInterval(*env.PollIntervalSeconds)
|
||||
if clamped {
|
||||
l.logger.Warn("hub: poll_interval_seconds out of range; clamped",
|
||||
"requested", *env.PollIntervalSeconds, "applied", int(d.Seconds()))
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// clampInterval clamps a requested seconds value to [60,3600]; clamped reports
|
||||
// whether it was out of range.
|
||||
func clampInterval(sec int) (time.Duration, bool) {
|
||||
clamped := false
|
||||
if sec < MinPollSeconds {
|
||||
sec, clamped = MinPollSeconds, true
|
||||
}
|
||||
if sec > MaxPollSeconds {
|
||||
sec, clamped = MaxPollSeconds, true
|
||||
}
|
||||
return time.Duration(sec) * time.Second, clamped
|
||||
}
|
||||
Reference in New Issue
Block a user