felhom-agent/internal/reconcile/gate.go

package reconcile

import (
	"bytes"
	"encoding/json"
	"log/slog"
	"reflect"
	"strconv"
	"time"

	"gitea.dooplex.hu/admin/felhom-agent/internal/authz"
)

// SourceKind records where an intent came from — audit/debug ONLY. Classification
// does NOT depend on it: a destructive desired-state delta and a destructive one-shot
// job are gated identically (the agent distrusts hub desired state for destructive
// change, not just jobs — doc 03 §4).
//
// Within this file the value is only copied into the AuditRecord and the gate log
// line; no gate decision branches on it.
type SourceKind string

// Known intent origins. The string values are what appears in audit records and logs.
const (
	SourceDesiredDelta SourceKind = "desired_delta" // derived from a desired-state delta (what intentForAction stamps)
	SourceOneShotJob   SourceKind = "one_shot_job"  // came from a one-shot job
)

// Intent is an intended mutation presented to the gate. For benign reconcile actions
// the engine builds one per planned Action; destructive intents (jobs / deltas) carry
// their op class + canonical params for binding.
//
// How the gate uses the fields: Class and Provenance are the only inputs to Classify;
// HostID, GuestID, Class and ParamsJSON are what a verified op must match for a
// destructive intent to be allowed; Source is copied to the audit trail and never
// influences the decision. VMID is not read by the gate in this file.
type Intent struct {
	Class   OpClass
	HostID  string
	GuestID string // blob-style guest id ("" = host-scoped); matches OpBlob.target.guest_id
	VMID    int    // numeric, for queue routing (0 = host-scoped)
	// ParamsJSON is the canonical params (matching the signed blob's `params`) used for
	// op-to-action binding on destructive ops. Nil for benign actions (not bound).
	ParamsJSON json.RawMessage
	// Provenance is AGENT-INTERNAL only (never hub-sourced) — see classify.go.
	Provenance Provenance
	Source     SourceKind
}

// SignedOp is the opaque operator-signed blob+signature pair the hub queues (doc 04
// §5). The agent never trusts it until authz.Verifier.Verify passes.
//
// The gate does not parse either field itself: Authorize passes Blob and Sig straight
// to OpVerifier.Verify and works only with the VerifiedOp that call returns.
type SignedOp struct {
	Blob []byte // the canonical OpBlob JSON bytes (verified over RAW bytes)
	Sig  []byte // the armored SSHSIG
}

// RefuseReason is a stable, machine-readable reason attached to every gate decision.
// Despite the name it also covers the two allowing outcomes (benign, signed); the
// remaining values are refusals.
type RefuseReason string

// Gate decision reasons. The string values are stable identifiers and must not change.
const (
	ReasonBenign           RefuseReason = "benign"            // allowed, no signature needed
	ReasonSigned           RefuseReason = "signed"            // allowed by a verified op
	ReasonPendingSignature RefuseReason = "pending_signature" // destructive, no usable signature (none supplied, no verifier configured, or a new one is needed)
	ReasonRejected         RefuseReason = "rejected"          // signature failed authz verification
	ReasonRoleDenied       RefuseReason = "role_denied"       // signer role not authorized for this op class
	ReasonBindingMismatch  RefuseReason = "binding_mismatch"  // signature is for a different action
)

// Decision is the gate verdict returned by Authorize. Allowed is true only for the
// benign and signed reasons; every other reason is a refusal.
type Decision struct {
	Allowed     bool
	Disposition Disposition
	Reason      RefuseReason
	// Verified is the authenticated op. It is set when a signature authorized the
	// action, and also on role_denied / binding_mismatch refusals (the signature itself
	// verified but gate policy refused it). It is nil for benign, pending_signature and
	// rejected decisions.
	Verified *authz.VerifiedOp
	// Err is the underlying authz rejection (errors.Is-friendly: ErrUnknownSigner,
	// ErrExpired, ErrReplay, …) when Reason == ReasonRejected; nil otherwise.
	Err error
}

// OpVerifier is the crypto verifier seam — *authz.Verifier in production; a fake in
// gate unit tests. The gate never re-implements any crypto; it only consumes the
// verdict and enforces the policy layer on top (role-scoping + op-to-action binding).
type OpVerifier interface {
	// Verify checks the armored signature sigArmored over the raw blob bytes. The gate
	// treats any non-nil error as a rejection; otherwise it reads the returned op's
	// signer (role, key id), host, guest, op name, params and nonce.
	Verify(blob, sigArmored []byte) (*authz.VerifiedOp, error)
}

// AuditSink records every gate decision to the customer-visible audit log. Audit is a
// SIGNAL, never the guard (doc 03 §4 / doc 04 §5): a compromised hub could suppress a
// notice, which is exactly why the signature — not the audit — is the control.
type AuditSink interface {
	// Record receives one AuditRecord per Authorize call. The gate calls it
	// synchronously, before Authorize returns its decision; the method has no return
	// value, so a sink failure cannot change the decision.
	Record(rec AuditRecord)
}

// AuditRecord is one audited gate decision. Class, HostID, GuestID and Source are
// copied from the Intent; Disposition, Allowed and Reason from the Decision. Time is
// the UTC wall-clock time at which the gate recorded the decision.
type AuditRecord struct {
	Time        time.Time
	Class       OpClass
	HostID      string
	GuestID     string
	Source      SourceKind
	Disposition Disposition
	Allowed     bool
	Reason      RefuseReason
	KeyID       string // matched signer's key id; set whenever the decision carries a verified op
	Nonce       string // the op nonce; set whenever the decision carries a verified op
}

// Gate is the reversibility gate: it sits in front of the per-guest queue's executor
// so EVERY mutation passes it. Benign intents are allowed unsigned; destructive
// intents require a verified, role-authorized, action-bound operator signature, else
// they are refused with pending_signature and never executed.
//
// hostID is the host id this gate was built for; a verified op must name it (and the
// intent's host) for the binding check to pass. audit and logger receive every
// decision; NewGate substitutes no-op implementations for nil values.
type Gate struct {
	verifier OpVerifier // may be nil (no signers pinned) → destructive is always pending_signature
	hostID   string
	audit    AuditSink
	logger   *slog.Logger
}

// NewGate builds a gate for the host identified by hostID.
//
// verifier may be nil when no signers are configured (the common slice-4 state) —
// then there is nothing destructive to authorize and any destructive intent is
// refused pending_signature. A typed-nil pointer stored in the interface (for example
// an unset *authz.Verifier variable passed straight through) is normalized to an
// untyped nil here: Authorize only compares the interface against nil, so a typed nil
// would otherwise pass that check and have Verify invoked on a nil receiver.
//
// audit and logger default to no-ops when nil.
func NewGate(verifier OpVerifier, hostID string, audit AuditSink, logger *slog.Logger) *Gate {
	if verifier != nil {
		// Nil-interface trap: an interface holding a nil pointer is itself non-nil.
		if rv := reflect.ValueOf(verifier); rv.Kind() == reflect.Pointer && rv.IsNil() {
			verifier = nil
		}
	}
	if audit == nil {
		audit = noopAudit{}
	}
	if logger == nil {
		logger = slog.New(slog.NewTextHandler(discard{}, nil))
	}
	return &Gate{verifier: verifier, hostID: hostID, audit: audit, logger: logger}
}

// Authorize is the gate's single entry point. It classifies the intent, and for a
// destructive intent applies the consuming-layer policy on top of the verifier's
// verdict: signature present and verifiable, signer role allowed for the op class,
// and the verified op bound to this exact action. Every verdict is written to the
// audit sink before it is returned. Authorize NEVER executes anything — dispatching
// an Allowed decision onto the queue is the caller's job.
//
// NOTE(review): Verify runs before the role and binding checks, so whatever Verify
// records on success (the nonce, per the authz pipeline) is recorded even when the
// final verdict is role_denied or binding_mismatch — confirm that is intended.
func (g *Gate) Authorize(intent Intent, signed *SignedOp) Decision {
	// conclude audits a verdict and hands it back; every exit path goes through it.
	conclude := func(d Decision) Decision {
		g.record(intent, d)
		return d
	}

	// Benign intents pass without any signature.
	if Classify(intent.Class, intent.Provenance) == Benign {
		return conclude(Decision{Allowed: true, Disposition: Benign, Reason: ReasonBenign})
	}

	// Everything below is destructive. Without a signed op, or without a pinned
	// verifier to check it against, the intent is parked as pending_signature.
	if signed == nil || g.verifier == nil {
		return conclude(Decision{Disposition: Destructive, Reason: ReasonPendingSignature})
	}

	// The LOCKED authz pipeline: crypto + namespace + allow-list + target + time +
	// nonce. The nonce is consumed (recorded) only when verification passes.
	vop, err := g.verifier.Verify(signed.Blob, signed.Sig)

	switch {
	case err != nil:
		// Verification failed; surface the underlying authz error to the caller.
		return conclude(Decision{Disposition: Destructive, Reason: ReasonRejected, Err: err})

	case !roleAuthorizes(vop.Signer.Role, intent.Class):
		// Role-scoping: the signer's pinned role must cover THIS op class.
		return conclude(Decision{Disposition: Destructive, Reason: ReasonRoleDenied, Verified: vop})

	case !g.bindsToAction(vop, intent):
		// Op-to-action binding: a signature for "restore guest X" must not be usable
		// to destroy guest Y, so op + target + params all have to match.
		return conclude(Decision{Disposition: Destructive, Reason: ReasonBindingMismatch, Verified: vop})
	}

	return conclude(Decision{Allowed: true, Disposition: Destructive, Reason: ReasonSigned, Verified: vop})
}

// roleAuthorizes implements the two-key role model of doc 04 §4, reporting whether a
// signer with the given pinned role may authorize an op of the given class:
//
//   - the operational key covers every class, planned key-rotation included;
//   - the cold recovery key covers key-rotation re-pins and nothing else;
//   - any other role covers nothing.
func roleAuthorizes(role authz.KeyRole, class OpClass) bool {
	switch role {
	case authz.RoleOperational:
		return true
	case authz.RoleRecovery:
		return class == ClassKeyRotation
	default:
		return false
	}
}

// bindsToAction reports whether the verified op names exactly the action described by
// intent. Four things must agree: the host (the verifier already checked it; it is
// re-asserted here against both the gate's own host id and the intent's), the guest,
// the op class, and the params. This goes BEYOND the verifier's target check (doc 04
// §2.3 binds the host only) by binding the full action.
func (g *Gate) bindsToAction(vop *authz.VerifiedOp, intent Intent) bool {
	sameHost := vop.HostID == g.hostID && vop.HostID == intent.HostID
	sameGuest := vop.GuestID == intent.GuestID
	sameOp := vop.Op == string(intent.Class)

	if !(sameHost && sameGuest && sameOp) {
		return false
	}
	// Params are compared last, and only once the cheap identity checks have passed.
	return paramsEqual(vop.Params, intent.ParamsJSON)
}

// paramsEqual reports whether two JSON params documents decode to the same value, so
// key order and insignificant whitespace do not matter. Both sides go through
// decodeParams, which maps absent/empty input to an empty object, and the decoded
// values are compared with reflect.DeepEqual. If either side fails to decode the
// result is false.
func paramsEqual(a, b json.RawMessage) bool {
	left, ok := decodeParams(a)
	if !ok {
		return false
	}
	right, ok := decodeParams(b)
	if !ok {
		return false
	}
	return reflect.DeepEqual(left, right)
}

// decodeParams decodes a JSON params document into a generic value suitable for a
// structural comparison. It returns false when p is not exactly one valid JSON value.
//
// Absent input (nil / zero length) and an explicit JSON null both decode to an empty
// object, so "no params" compares equal however it was spelled.
//
// Numbers are kept as json.Number (their literal text) rather than float64. Decoding
// into float64 silently rounds integers above 2^53, which would let two different
// numeric params compare equal — unacceptable for op-to-action binding. The flip side
// is that numbers compare by spelling: 1 and 1.0 are different values here, which is
// the fail-closed direction for canonical params.
func decodeParams(p json.RawMessage) (any, bool) {
	if len(p) == 0 {
		return map[string]any{}, true // absent == empty object
	}
	// Decoder.Decode stops after the first value, so reject trailing garbage (and any
	// other malformed input) up front, as json.Unmarshal would.
	if !json.Valid(p) {
		return nil, false
	}
	dec := json.NewDecoder(bytes.NewReader(p))
	dec.UseNumber()
	var v any
	if err := dec.Decode(&v); err != nil {
		return nil, false
	}
	if v == nil {
		return map[string]any{}, true // explicit null == empty
	}
	return v, true
}

// record writes one decision to the audit sink and then to the gate's logger. The
// audit record is stamped with the current UTC time and carries the signer key id and
// op nonce whenever the decision holds a verified op (otherwise both stay empty).
func (g *Gate) record(intent Intent, d Decision) {
	var keyID, nonce string
	if v := d.Verified; v != nil {
		keyID, nonce = v.Signer.KeyID, v.Nonce
	}

	g.audit.Record(AuditRecord{
		Time:        time.Now().UTC(),
		Class:       intent.Class,
		HostID:      intent.HostID,
		GuestID:     intent.GuestID,
		Source:      intent.Source,
		Disposition: d.Disposition,
		Allowed:     d.Allowed,
		Reason:      d.Reason,
		KeyID:       keyID,
		Nonce:       nonce,
	})

	g.logger.Info("gate decision",
		"class", intent.Class,
		"guest", intent.GuestID,
		"source", intent.Source,
		"disposition", d.Disposition,
		"allowed", d.Allowed,
		"reason", d.Reason,
	)
}

// intentForAction builds the gate Intent for a benign reconcile action. The provenance
// is the zero value (no agent-internal destroy evidence) and the source is the
// desired-state delta — reconcile never fabricates scratch/same-txn provenance.
//
// GuestID is the decimal form of the action's VMID, except that VMID 0 — which Intent
// defines as host-scoped — yields the empty GuestID, Intent's host-scoped marker,
// rather than the literal "0". ParamsJSON is left nil: benign actions are not bound.
func intentForAction(hostID string, act Action) Intent {
	var guestID string // "" = host-scoped
	if act.VMID != 0 {
		guestID = strconv.Itoa(act.VMID)
	}
	return Intent{
		Class:      classOfAction(act.Kind),
		HostID:     hostID,
		GuestID:    guestID,
		VMID:       act.VMID,
		Provenance: Provenance{}, // benign actions need none; never hub-sourced
		Source:     SourceDesiredDelta,
	}
}

// noopAudit is the AuditSink that NewGate installs when the caller supplies none.
type noopAudit struct{}

// Record discards the record.
func (noopAudit) Record(_ AuditRecord) {}

// SlogAudit is a minimal AuditSink backed by a structured logger. The durable,
// customer-visible audit log and its inclusion in the host-report
// (HostReport.AuditTail) belong to a later slice; until then this keeps the signal
// flowing without inventing that wire schema.
type SlogAudit struct{ Logger *slog.Logger }

// Record emits the audit entry at info level. A SlogAudit without a Logger silently
// drops the record. The nonce is logged in the shortened form produced by auditNonce.
func (s SlogAudit) Record(rec AuditRecord) {
	lg := s.Logger
	if lg == nil {
		return
	}
	lg.Info("audit: gate decision",
		"class", rec.Class,
		"host", rec.HostID,
		"guest", rec.GuestID,
		"source", rec.Source,
		"disposition", rec.Disposition,
		"allowed", rec.Allowed,
		"reason", rec.Reason,
		"key_id", rec.KeyID,
		"nonce", auditNonce(rec.Nonce),
	)
}

// auditNonce shortens a nonce for the log (full nonce is high-cardinality; a prefix is
// enough to correlate without bloating logs). A nonce of at most 8 bytes is returned
// unchanged; a longer one is cut to at most 8 bytes and suffixed with "…".
//
// The cut always lands on a rune boundary, so a nonce containing multi-byte UTF-8 is
// never split mid-character (which would put invalid UTF-8 into the log). For plain
// ASCII nonces the result is exactly the first 8 bytes plus the ellipsis.
func auditNonce(n string) string {
	const keep = 8
	if len(n) <= keep {
		return n
	}
	// Ranging over a string yields the byte offset at which each rune starts; keep
	// the last such offset that does not exceed the budget.
	cut := 0
	for i := range n {
		if i > keep {
			break
		}
		cut = i
	}
	return n[:cut] + "…"
}