ab77fa3544
internal/hub: the agent's first daemon — a periodic read-only host-report POSTed to the hub (the heartbeat; no separate ping). - HostReport wire contract (shared field-for-field with the hub ingest): host metrics, guests (vmid + spec), cloudflared status; storage/backups/restore-tests/ pbs/audit collections DEFINED but emitted empty (slices 5/6 fill). - Collector over a read-only proxmoxReader (adapted to the real proxmox surface; no proxmox changes) + a CloudflaredProber. Partial-failure: NodeStatus fail = hard (skip POST); per-guest GuestConfig fail = status "unknown", still report. - Client: Bearer-auth POST, standard TLS (system roots / optional ca_file), typed TransportError/HTTPError, token never in errors. - Loop: immediate first report, adopt hub poll_interval (clamp [60,3600]), resilient to collect/report errors, clean ctx-cancel shutdown. - ControlEnvelope: only poll_interval_seconds acted on; blocked/desired_generation/ has_signed_ops parsed-but-ignored (slice 4). - config: HubConfig + FELHOM_AGENT_HUB_* overlay + mode-aware HubConfig.Validate + WithDefaults + hub-key redaction; example config updated. - main: no-selftest mode is now the daemon; added --selftest=hub. Version -> 0.3.0. Tests: report serialization, client (incl. token-redaction), collector partial- failure, loop continuation+interval adoption, config. internal/proxmox + internal/ authz untouched. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
108 lines
3.3 KiB
Go
108 lines
3.3 KiB
Go
package hub
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"time"
|
|
)
|
|
|
|
// Poll-interval clamp bounds, expressed in seconds (locked decision 3).
// clampInterval forces any hub-requested cadence into this closed range.
const (
	// MinPollSeconds is the shortest poll interval the loop will adopt (one minute).
	MinPollSeconds = 60
	// MaxPollSeconds is the longest poll interval the loop will adopt (one hour).
	MaxPollSeconds = 60 * 60
)
|
|
|
|
// reporter and collectorIface are the loop's deps as interfaces (tests inject fakes).
|
|
type reporter interface {
|
|
Report(ctx context.Context, r *HostReport) (*ControlEnvelope, error)
|
|
}
|
|
type collectorIface interface {
|
|
Collect(ctx context.Context) (*HostReport, error)
|
|
}
|
|
|
|
// Loop is the agent's first daemon run loop: collect a host-report, POST it, adopt
|
|
// the hub's cadence, repeat. It is resilient — a collect or report error is logged
|
|
// and the loop continues (the data plane is independent of the agent; a hub outage
|
|
// must not kill it). There are NO Proxmox mutations here (read-only report), so no
|
|
// per-guest work queue yet (that lands with reconcile, slice 4).
|
|
type Loop struct {
|
|
collector collectorIface
|
|
client reporter
|
|
interval time.Duration
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// NewLoop builds the loop. interval is the starting cadence (the hub may override it
|
|
// per-cycle via the control envelope).
|
|
func NewLoop(collector collectorIface, client reporter, interval time.Duration, logger *slog.Logger) *Loop {
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
return &Loop{collector: collector, client: client, interval: interval, logger: logger}
|
|
}
|
|
|
|
// Run reports immediately, then on each tick, until ctx is cancelled (then nil).
|
|
func (l *Loop) Run(ctx context.Context) error {
|
|
interval := l.interval
|
|
interval = l.cycle(ctx, interval) // immediate first report
|
|
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
l.logger.Info("hub: loop shutting down", "reason", ctx.Err())
|
|
return nil
|
|
case <-ticker.C:
|
|
next := l.cycle(ctx, interval)
|
|
if next != interval {
|
|
l.logger.Info("hub: poll interval changed", "from", interval, "to", next)
|
|
interval = next
|
|
ticker.Reset(interval)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// cycle runs one collect→report→adopt. It never returns an error: failures are
|
|
// logged and the current interval is kept, so the loop keeps running.
|
|
func (l *Loop) cycle(ctx context.Context, current time.Duration) time.Duration {
|
|
report, err := l.collector.Collect(ctx)
|
|
if err != nil {
|
|
l.logger.Warn("hub: collect failed; skipping this cycle's report", "err", err)
|
|
return current
|
|
}
|
|
env, err := l.client.Report(ctx, report)
|
|
if err != nil {
|
|
l.logger.Warn("hub: report failed; keeping current interval", "err", err)
|
|
return current
|
|
}
|
|
l.logger.Debug("hub: report sent",
|
|
"guests", len(report.Guests),
|
|
// reserved/forward-compat envelope fields — logged only, never acted on (slice 4).
|
|
"blocked", env.Blocked, "desired_generation", env.DesiredGeneration, "has_signed_ops", env.HasSignedOps)
|
|
|
|
if env.PollIntervalSeconds == nil {
|
|
return current
|
|
}
|
|
d, clamped := clampInterval(*env.PollIntervalSeconds)
|
|
if clamped {
|
|
l.logger.Warn("hub: poll_interval_seconds out of range; clamped",
|
|
"requested", *env.PollIntervalSeconds, "applied", int(d.Seconds()))
|
|
}
|
|
return d
|
|
}
|
|
|
|
// clampInterval clamps a requested seconds value to [60,3600]; clamped reports
|
|
// whether it was out of range.
|
|
func clampInterval(sec int) (time.Duration, bool) {
|
|
clamped := false
|
|
if sec < MinPollSeconds {
|
|
sec, clamped = MinPollSeconds, true
|
|
}
|
|
if sec > MaxPollSeconds {
|
|
sec, clamped = MaxPollSeconds, true
|
|
}
|
|
return time.Duration(sec) * time.Second, clamped
|
|
}
|