// Package agentapi is the in-guest controller's client for the host agent's per-guest local
// API (doc 03 §6). It reaches the agent over the bridge, pinning the agent's self-signed leaf
// by SHA-256 (the same pin convention the agent uses for the Proxmox/PBS host certs), and
// authenticates with the per-guest bearer token.
//
// Slice 8A introduced GET /storage (connectivity + the controller learning its mounts). This
// file also carries the later slices: 8B app-consistent backup (/backup/due, /backup,
// /backup/status), 8C disk management (/disks, /disks/assign, /disks/eject, /disks/format) and
// slice 9 host metrics (/host/metrics).
package agentapi

import (
	"bytes"
	"context"
	"crypto/sha256"
	"crypto/tls"
	"crypto/x509"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// Client talks to one agent local-API endpoint with a pinned leaf + bearer token.
// Build it with New; the zero value has no HTTP client and is not usable.
type Client struct {
	baseURL string       // "https://" + the endpoint given to New; request paths are appended to it
	token   string       // per-guest bearer token, sent as "Authorization: Bearer <token>"
	hc      *http.Client // HTTP client built by New (request timeout + pinned TLS config)
}

// MountInfo mirrors the agent's GET /storage mount entry (doc 03 §6).
type MountInfo struct {
	Key        string `json:"key"`
	Storage    string `json:"storage"`
	MountPoint string `json:"mount_point"`
	Class      string `json:"class"` // fast | slow | ""
	Backup     bool   `json:"backup"`
}

// StorageResponse mirrors the agent's GET /storage data payload.
type StorageResponse struct {
	VMID   int         `json:"vmid"`
	Mounts []MountInfo `json:"mounts"`
}

// apiResponse is the agent's {ok,data,error} envelope. Data is kept as raw JSON so each call
// can decode it into its own response type.
type apiResponse struct {
	OK    bool            `json:"ok"`
	Data  json.RawMessage `json:"data"`
	Error string          `json:"error"`
}

// New builds a pinned client for endpoint ("host:port") with the given per-guest token and the
// agent leaf-cert SHA-256 fingerprint (hex, ':'-separators tolerated). The pin is the trust
// anchor — the agent serves a self-signed cert, so chain verification is replaced by an exact
// leaf-DER SHA-256 match (fails closed on any mismatch). It returns an error when endpoint or
// token is empty, or when the fingerprint does not normalize to 64 hex characters.
func New(endpoint, token, fingerprintHex string) (*Client, error) { endpoint = strings.TrimSpace(endpoint) if endpoint == "" { return nil, fmt.Errorf("agentapi: endpoint required") } if token == "" { return nil, fmt.Errorf("agentapi: token required") } want, err := normalizeFingerprint(fingerprintHex) if err != nil { return nil, err } tlsCfg := &tls.Config{ InsecureSkipVerify: true, // self-signed leaf — the pin below is the real check VerifyPeerCertificate: func(rawCerts [][]byte, _ [][]*x509.Certificate) error { if len(rawCerts) == 0 { return fmt.Errorf("agentapi: TLS pin: peer presented no certificate") } got := sha256.Sum256(rawCerts[0]) // leaf DER if hex.EncodeToString(got[:]) != want { return fmt.Errorf("agentapi: TLS pin mismatch: agent leaf SHA-256 does not match the bootstrap fingerprint") } return nil }, MinVersion: tls.VersionTLS12, } return &Client{ baseURL: "https://" + endpoint, token: token, hc: &http.Client{ Timeout: 15 * time.Second, Transport: &http.Transport{TLSClientConfig: tlsCfg}, }, }, nil } // Storage calls GET /storage and returns this guest's mounts (connectivity + placement view). func (c *Client) Storage(ctx context.Context) (StorageResponse, error) { var out StorageResponse body, err := c.get(ctx, "/storage") if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode /storage: %w", err) } return out, nil } // ---- slice 8B: app-consistent backup (quiesce loop) ------------------------------------- // DueResponse mirrors the agent's GET /backup/due payload. type DueResponse struct { VMID int `json:"vmid"` Due bool `json:"due"` Reason string `json:"reason"` } // BackupResponse mirrors the agent's POST /backup payload. type BackupResponse struct { VMID int `json:"vmid"` JobID string `json:"job_id"` Phase string `json:"phase"` } // StatusResponse mirrors the agent's GET /backup/status payload. 
type StatusResponse struct { VMID int `json:"vmid"` Phase string `json:"phase"` // idle | running | done | failed JobID string `json:"job_id"` Error string `json:"error"` } // Backup status phases (mirror the agent's vocabulary). const ( PhaseIdle = "idle" PhaseRunning = "running" PhaseDone = "done" PhaseFailed = "failed" ) // BackupDue reports whether a policy-scheduled backup is due for this guest (the quiesce trigger). func (c *Client) BackupDue(ctx context.Context) (DueResponse, error) { var out DueResponse body, err := c.get(ctx, "/backup/due") if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode /backup/due: %w", err) } return out, nil } // StartBackup enqueues a backup of this guest (the agent vzdump) and returns the job to poll. func (c *Client) StartBackup(ctx context.Context) (BackupResponse, error) { var out BackupResponse body, err := c.post(ctx, "/backup", struct{}{}) if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode POST /backup: %w", err) } return out, nil } // BackupStatus reports the current/last backup job phase for this guest. func (c *Client) BackupStatus(ctx context.Context) (StatusResponse, error) { var out StatusResponse body, err := c.get(ctx, "/backup/status") if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode /backup/status: %w", err) } return out, nil } // ---- slice 8C: disk management (execution is the agent's) -------------------------------- // DiskInfo mirrors the agent's GET /disks entry. type DiskInfo struct { Name string `json:"name"` Type string `json:"type"` State string `json:"state"` BackingDevice string `json:"backing_device"` MountPath string `json:"mount_path"` Class string `json:"class"` // Role is the agent's AUTHORITATIVE protection tier: "system" | "backup" | "user-data". 
The UI // is driven from it — system/backup get a lock badge and NO destructive controls; user-data is // customer-manageable (eject/wipe with type-to-confirm). Role string `json:"role"` DataBearing bool `json:"data_bearing"` DataReason string `json:"data_reason"` TotalBytes int64 `json:"total_bytes"` UsedBytes int64 `json:"used_bytes"` UsedFraction float64 `json:"used_fraction"` // DurableID is the target's stable identity (e.g. "uuid:" for usb/local-dir). The // fs UUID (strip the "uuid:" prefix) is the key the controller passes to AssignDisk — it's the // only way the de-privileged controller learns a mount key it cannot read off the device itself. DurableID string `json:"durable_id"` } // FSUUID returns the raw filesystem UUID from a "uuid:<…>" DurableID, or "" if this disk's identity // is not a filesystem UUID (network/lvm targets — not assignable as a host mount). func (d DiskInfo) FSUUID() string { if rest, ok := strings.CutPrefix(d.DurableID, "uuid:"); ok { return rest } return "" } // DisksResponse mirrors GET /disks. type DisksResponse struct { VMID int `json:"vmid"` Disks []DiskInfo `json:"disks"` } // FormatResult mirrors POST /disks/format (the success/refusal payload). type FormatResult struct { VMID int `json:"vmid"` Device string `json:"device"` Formatted bool `json:"formatted"` DataBearing bool `json:"data_bearing"` Reason string `json:"reason"` // Role is the agent's tier for this device (system | backup | user-data). Role string `json:"role,omitempty"` // NeedsConfirmation is set on a USER-DATA data-bearing refusal: re-submit with confirmed=true + // DurableID after the type-to-confirm UI (NOT an operator signature). NeedsConfirmation bool `json:"needs_confirmation,omitempty"` DurableID string `json:"durable_id,omitempty"` // PendingOp is set on a SYSTEM/BACKUP data-bearing refusal — the operator-signature op. PendingOp *PendingOp `json:"pending_op,omitempty"` } // PendingOp mirrors the agent's bound destructive intent on a data-bearing refusal. 
The controller // surfaces the exact `felhom-opsign` command from it — it CANNOT complete a destructive format itself. type PendingOp struct { Op string `json:"op"` // e.g. "storage_wipe" HostScope string `json:"host_scope"` // the agent's host id (anti-retarget) DurableID string `json:"durable_id"` // byid:…|byuuid:… — the device's stable identity FSType string `json:"fstype"` // the filesystem to mkfs after the wipe } // OpsignCommand returns the literal command the operator must run offline to authorize the wipe. func (p PendingOp) OpsignCommand() string { return fmt.Sprintf("felhom-opsign -op %s -host %s -durable-id %s", p.Op, p.HostScope, p.DurableID) } // ErrFormatRefused is returned by FormatDisk when the agent refuses a data-bearing format on a // SYSTEM/BACKUP device (operator signature required). The UI surfaces the pending opsign command. var ErrFormatRefused = fmt.Errorf("agentapi: format refused — system/backup device (operator authorization required)") // ErrNeedsConfirmation is returned by FormatDisk when the agent refuses a data-bearing format on a // USER-DATA device pending the CUSTOMER's informed-confirmation (bound to FormatResult.DurableID). // The UI surfaces the type-to-confirm flow, then re-submits with confirmed=true + that durable id. var ErrNeedsConfirmation = fmt.Errorf("agentapi: format needs customer confirmation — user-data device") // Disks lists the host drives the agent manages, with a data-bearing flag per drive. func (c *Client) Disks(ctx context.Context) (DisksResponse, error) { var out DisksResponse body, err := c.get(ctx, "/disks") if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode /disks: %w", err) } return out, nil } // AssignDisk attaches a drive (by fs-UUID) as a host mount (benign, self-serve). 
func (c *Client) AssignDisk(ctx context.Context, uuid, where, fstype, options string) error { _, err := c.post(ctx, "/disks/assign", map[string]string{ "uuid": uuid, "where": where, "fstype": fstype, "options": options, }) return err } // EjectResult mirrors POST /disks/eject (the dependent-guest warning). type EjectResult struct { VMID int `json:"vmid"` Ejected string `json:"ejected"` DependentGuests []int `json:"dependent_guests"` } // EjectDisk safe-unmounts a host mount (data preserved) and returns the dependent guests. func (c *Client) EjectDisk(ctx context.Context, where string) (EjectResult, error) { var out EjectResult body, err := c.post(ctx, "/disks/eject", map[string]string{"where": where}) if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode /disks/eject: %w", err) } return out, nil } // FormatDisk asks the agent to format a device. The AGENT inspects the device and tiers it by ROLE // (its own classification, never the controller's claim): // - blank device → formatted. // - user-data, data-bearing, NOT confirmed → ErrNeedsConfirmation (out.DurableID = the id to confirm). // - user-data, data-bearing, confirmed + matching durable id → formatted. // - system/backup, data-bearing → ErrFormatRefused (out.PendingOp = the operator opsign command). // // confirmed + durableID authorize a user-data wipe (the durable id the agent gave on the prior // ErrNeedsConfirmation); they are inert for system/backup. func (c *Client) FormatDisk(ctx context.Context, device, fstype string, confirmed bool, durableID string) (FormatResult, error) { var out FormatResult // Status-aware POST: the agent returns the FULL FormatResponse (incl. pending_op / durable_id) // even on the 403 refusal, so we must read the body on non-2xx rather than discarding it. 
data, status, err := c.postWithStatus(ctx, "/disks/format", map[string]any{ "device": device, "fstype": fstype, "confirmed": confirmed, "durable_id": durableID, }) if err != nil { return out, err } // data is the envelope's {data:…} payload (present on both success and the 403 refusal). if len(data) > 0 { _ = json.Unmarshal(data, &out) // best-effort; fields default on a missing/partial body } if out.Formatted { return out, nil } if out.NeedsConfirmation { out.DataBearing = true return out, ErrNeedsConfirmation // user-data: surface the type-to-confirm flow } if status == http.StatusForbidden || (out.DataBearing && !out.Formatted) { out.DataBearing = true return out, ErrFormatRefused // system/backup: surface the opsign command } return out, nil } // postWithStatus issues an authenticated JSON POST and returns the envelope's data payload + the HTTP // status, even on a non-2xx (so callers like FormatDisk can read a 403 refusal body). A transport or // envelope-parse failure is still an error; an `ok:false` business refusal is NOT (the data carries it). 
func (c *Client) postWithStatus(ctx context.Context, path string, body any) (json.RawMessage, int, error) { buf, err := json.Marshal(body) if err != nil { return nil, 0, err } req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf)) if err != nil { return nil, 0, err } req.Header.Set("Authorization", "Bearer "+c.token) req.Header.Set("Content-Type", "application/json") resp, err := c.hc.Do(req) if err != nil { return nil, 0, fmt.Errorf("agentapi: POST %s: %w", path, err) } defer resp.Body.Close() raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) var env apiResponse if err := json.Unmarshal(raw, &env); err != nil { return nil, resp.StatusCode, fmt.Errorf("agentapi: POST %s: HTTP %d, bad envelope: %w", path, resp.StatusCode, err) } return env.Data, resp.StatusCode, nil } // ---- slice 9: host metrics (the customer host-health view) ------------------------------- // HostMetrics mirrors the agent's GET /host/metrics `host` block (shared HostMetrics wire shape). // CPUTempC is a pointer so a host with no temp sensor is null ("n/a"), distinct from a real 0. type HostMetrics struct { Node string `json:"node"` CPUPercent float64 `json:"cpu_percent"` // 0–100 MemoryTotalBytes int64 `json:"memory_total_bytes"` MemoryUsedBytes int64 `json:"memory_used_bytes"` MemoryPercent float64 `json:"memory_percent"` DiskTotalBytes int64 `json:"disk_total_bytes"` // host root fs DiskUsedBytes int64 `json:"disk_used_bytes"` DiskPercent float64 `json:"disk_percent"` LoadAvg []string `json:"loadavg"` UptimeSeconds int64 `json:"uptime_seconds"` CPUTempC *int `json:"cpu_temp_c"` // °C or null ("n/a") } // ThinPoolFill mirrors the agent's lvmthin pool fill (a full thin-pool corrupts every guest on it). type ThinPoolFill struct { DataUsedFraction float64 `json:"data_used_fraction"` MetadataUsedFraction *float64 `json:"metadata_used_fraction"` } // SmartSummary mirrors the agent's per-disk SMART health (only the fields the UI renders). 
Pointers // are null when the device type does not expose that attribute. type SmartSummary struct { Health string `json:"health"` // PASSED | FAILING | UNKNOWN TemperatureC *int `json:"temperature_c"` PercentageUsed *int `json:"percentage_used"` // NVMe wear (%); null for SATA/USB } // StorageTarget mirrors the agent's GET /host/metrics storage_targets entry (the per-storage // capacity + health the monitoring view renders). It is a SUBSET of the agent's wire shape — only // the fields the UI reads; unknown JSON keys are ignored. type StorageTarget struct { Name string `json:"name"` Type string `json:"type"` State string `json:"state"` Reachable bool `json:"reachable"` TotalBytes int64 `json:"total_bytes"` UsedBytes int64 `json:"used_bytes"` AvailBytes int64 `json:"avail_bytes"` UsedFraction float64 `json:"used_fraction"` Content string `json:"content"` MountPath string `json:"mount_path"` ClassHint string `json:"class_hint"` ThinPool *ThinPoolFill `json:"thin_pool,omitempty"` Smart SmartSummary `json:"smart"` } // HostMetricsResponse mirrors the agent's GET /host/metrics payload (host-wide health + per-storage // capacity). Host-wide and token-authed (one-customer-per-host); a fresh collect, not a snapshot. type HostMetricsResponse struct { VMID int `json:"vmid"` Host HostMetrics `json:"host"` StorageTargets []StorageTarget `json:"storage_targets"` } // HostMetrics calls GET /host/metrics and returns the host's live health + per-storage capacity. func (c *Client) HostMetrics(ctx context.Context) (HostMetricsResponse, error) { var out HostMetricsResponse body, err := c.get(ctx, "/host/metrics") if err != nil { return out, err } if err := json.Unmarshal(body, &out); err != nil { return out, fmt.Errorf("agentapi: decode /host/metrics: %w", err) } return out, nil } // get issues an authenticated GET and unwraps the {ok,data,error} envelope. 
func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil) if err != nil { return nil, err } req.Header.Set("Authorization", "Bearer "+c.token) resp, err := c.hc.Do(req) if err != nil { return nil, fmt.Errorf("agentapi: GET %s: %w", path, err) } defer resp.Body.Close() raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("agentapi: GET %s: HTTP %d", path, resp.StatusCode) } var env apiResponse if err := json.Unmarshal(raw, &env); err != nil { return nil, fmt.Errorf("agentapi: GET %s: bad envelope: %w", path, err) } if !env.OK { return nil, fmt.Errorf("agentapi: GET %s: %s", path, env.Error) } return env.Data, nil } // post issues an authenticated JSON POST and unwraps the {ok,data,error} envelope. The agent // returns 200 or 202 for accepted requests. func (c *Client) post(ctx context.Context, path string, body any) (json.RawMessage, error) { buf, err := json.Marshal(body) if err != nil { return nil, err } req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf)) if err != nil { return nil, err } req.Header.Set("Authorization", "Bearer "+c.token) req.Header.Set("Content-Type", "application/json") resp, err := c.hc.Do(req) if err != nil { return nil, fmt.Errorf("agentapi: POST %s: %w", path, err) } defer resp.Body.Close() raw, _ := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted { return nil, fmt.Errorf("agentapi: POST %s: HTTP %d", path, resp.StatusCode) } var env apiResponse if err := json.Unmarshal(raw, &env); err != nil { return nil, fmt.Errorf("agentapi: POST %s: bad envelope: %w", path, err) } if !env.OK { return nil, fmt.Errorf("agentapi: POST %s: %s", path, env.Error) } return env.Data, nil } // normalizeFingerprint lowercases and strips ':'/' ' separators, requiring a 64-hex 
// SHA-256.
func normalizeFingerprint(fp string) (string, error) {
	trimmed := strings.TrimSpace(fp)

	// Drop the tolerated separator bytes (':', ' ', tab) in a single pass.
	var b strings.Builder
	b.Grow(len(trimmed))
	for i := 0; i < len(trimmed); i++ {
		switch ch := trimmed[i]; ch {
		case ':', ' ', '\t':
			// separator — skipped
		default:
			b.WriteByte(ch)
		}
	}
	normalized := strings.ToLower(b.String())

	// A SHA-256 digest is 32 bytes, i.e. exactly 64 hex characters.
	if n := len(normalized); n != 64 {
		return "", fmt.Errorf("agentapi: fingerprint must be a SHA-256 (64 hex chars), got %d", n)
	}
	_, err := hex.DecodeString(normalized)
	if err != nil {
		return "", fmt.Errorf("agentapi: fingerprint is not valid hex: %w", err)
	}
	return normalized, nil
}