Files
felhom-controller/controller/internal/agentapi/client.go
T
admin 29a9dcdd8c v0.43.0: rebuilt storage management (guided init/attach/eject on agent disk model)
Controller-only UI/orchestration over the agent's disk endpoints + StoragePath
registry. New: storage overview (data_bearing badges), guided init (format ->
resolve fs UUID -> assign -> register; data-bearing REFUSAL surfaces the
felhom-opsign command, no force-format), guided attach, eject (+deregister,
dependent-guest warning). agentapi: DiskInfo.DurableID/FSUUID + FormatResult.
PendingOp (parsed from the 403). Honest buttons (migrate disabled, no 404s).
Phase 3: removed dead CrossDrive blocks in deploy.html/backups.html.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-11 19:47:58 +02:00

470 lines
17 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package agentapi is the in-guest controller's client for the host agent's per-guest local
// API (doc 03 §6, slice 8A). It reaches the agent over the bridge, pinning the agent's
// self-signed leaf by SHA-256 (the same pin convention the agent uses for the Proxmox/PBS host
// certs), and authenticates with the per-guest bearer token. In 8A it exercises GET /storage
// (connectivity + the controller learning its mounts); the full surface (the /backup/due
// quiesce loop) lands in 8B.
package agentapi
import (
"bytes"
"context"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// Client talks to one agent local-API endpoint with a pinned leaf + bearer token.
// Values are built by New; the request helpers in this file send every call through hc and
// attach the token as an "Authorization: Bearer" header.
type Client struct {
baseURL string // "https://" + the endpoint given to New; request paths are appended to it
token string // per-guest bearer token
hc *http.Client // HTTP client whose TLS config carries the leaf-certificate pin check
}
// MountInfo mirrors the agent's GET /storage mount entry (doc 03 §6).
// Each field is decoded from the JSON key named in its struct tag.
type MountInfo struct {
Key string `json:"key"`
Storage string `json:"storage"`
MountPoint string `json:"mount_point"`
Class string `json:"class"` // fast | slow | ""
Backup bool `json:"backup"`
}
// StorageResponse mirrors the agent's GET /storage data payload: the guest's VMID plus one
// MountInfo per mount. It is what Client.Storage decodes the envelope's data into.
type StorageResponse struct {
VMID int `json:"vmid"`
Mounts []MountInfo `json:"mounts"`
}
// apiResponse is the agent's {ok,data,error} envelope.
// Data is kept as raw JSON so each caller can decode it into its own response type; Error is
// the text the request helpers report when OK is false.
type apiResponse struct {
OK bool `json:"ok"`
Data json.RawMessage `json:"data"`
Error string `json:"error"`
}
// New returns a Client for the agent at endpoint ("host:port"), authenticating with the per-guest
// token and trusting exactly one certificate: the leaf whose DER SHA-256 equals fingerprintHex
// (hex; ':' separators are tolerated).
//
// The agent serves a self-signed certificate, so normal chain verification is switched off and
// replaced by the pin comparison. Any mismatch, or a peer that presents no certificate, aborts
// the handshake (fail closed).
//
// An error is returned when endpoint is blank, token is empty, or the fingerprint is not a valid
// SHA-256 hex string.
func New(endpoint, token, fingerprintHex string) (*Client, error) {
	endpoint = strings.TrimSpace(endpoint)
	switch {
	case endpoint == "":
		return nil, fmt.Errorf("agentapi: endpoint required")
	case token == "":
		return nil, fmt.Errorf("agentapi: token required")
	}

	pin, err := normalizeFingerprint(fingerprintHex)
	if err != nil {
		return nil, err
	}

	// verifyLeaf is the real trust decision: hash the first (leaf) certificate's DER bytes and
	// compare against the normalized pin.
	verifyLeaf := func(rawCerts [][]byte, _ [][]*x509.Certificate) error {
		if len(rawCerts) == 0 {
			return fmt.Errorf("agentapi: TLS pin: peer presented no certificate")
		}
		sum := sha256.Sum256(rawCerts[0])
		if hex.EncodeToString(sum[:]) == pin {
			return nil
		}
		return fmt.Errorf("agentapi: TLS pin mismatch: agent leaf SHA-256 does not match the bootstrap fingerprint")
	}

	transport := &http.Transport{
		TLSClientConfig: &tls.Config{
			MinVersion:            tls.VersionTLS12,
			InsecureSkipVerify:    true, // self-signed leaf; verifyLeaf replaces chain verification
			VerifyPeerCertificate: verifyLeaf,
		},
	}

	client := &Client{
		baseURL: "https://" + endpoint,
		token:   token,
		hc: &http.Client{
			Timeout:   15 * time.Second,
			Transport: transport,
		},
	}
	return client, nil
}
// Storage fetches GET /storage and decodes this guest's mounts (used both as a connectivity
// check and as the controller's placement view).
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) Storage(ctx context.Context) (StorageResponse, error) {
	payload, err := c.get(ctx, "/storage")
	if err != nil {
		return StorageResponse{}, err
	}

	var resp StorageResponse
	if decodeErr := json.Unmarshal(payload, &resp); decodeErr != nil {
		return resp, fmt.Errorf("agentapi: decode /storage: %w", decodeErr)
	}
	return resp, nil
}
// ---- slice 8B: app-consistent backup (quiesce loop) -------------------------------------
// DueResponse mirrors the agent's GET /backup/due payload.
// It is the type Client.BackupDue decodes into; each field maps to the JSON key in its tag.
type DueResponse struct {
VMID int `json:"vmid"`
Due bool `json:"due"`
Reason string `json:"reason"`
}
// BackupResponse mirrors the agent's POST /backup payload.
// It is the type Client.StartBackup decodes into; JobID identifies the job to poll.
type BackupResponse struct {
VMID int `json:"vmid"`
JobID string `json:"job_id"`
Phase string `json:"phase"`
}
// StatusResponse mirrors the agent's GET /backup/status payload.
// It is the type Client.BackupStatus decodes into. The Phase values listed below match the
// Phase* constants declared in this package.
type StatusResponse struct {
VMID int `json:"vmid"`
Phase string `json:"phase"` // idle | running | done | failed
JobID string `json:"job_id"`
Error string `json:"error"`
}
// Backup status phases (mirror the agent's vocabulary).
// These are the string values documented for StatusResponse.Phase.
const (
PhaseIdle = "idle"
PhaseRunning = "running"
PhaseDone = "done"
PhaseFailed = "failed"
)
// BackupDue asks the agent (GET /backup/due) whether a policy-scheduled backup is due for this
// guest — the trigger for the quiesce loop.
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) BackupDue(ctx context.Context) (DueResponse, error) {
	payload, err := c.get(ctx, "/backup/due")
	if err != nil {
		return DueResponse{}, err
	}

	var due DueResponse
	if decodeErr := json.Unmarshal(payload, &due); decodeErr != nil {
		return due, fmt.Errorf("agentapi: decode /backup/due: %w", decodeErr)
	}
	return due, nil
}
// StartBackup enqueues a backup of this guest (the agent vzdump) via POST /backup, sending an
// empty JSON object as the request body, and returns the job to poll.
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) StartBackup(ctx context.Context) (BackupResponse, error) {
	payload, err := c.post(ctx, "/backup", struct{}{})
	if err != nil {
		return BackupResponse{}, err
	}

	var job BackupResponse
	if decodeErr := json.Unmarshal(payload, &job); decodeErr != nil {
		return job, fmt.Errorf("agentapi: decode POST /backup: %w", decodeErr)
	}
	return job, nil
}
// BackupStatus fetches GET /backup/status: the phase of this guest's current or most recent
// backup job.
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) BackupStatus(ctx context.Context) (StatusResponse, error) {
	payload, err := c.get(ctx, "/backup/status")
	if err != nil {
		return StatusResponse{}, err
	}

	var status StatusResponse
	if decodeErr := json.Unmarshal(payload, &status); decodeErr != nil {
		return status, fmt.Errorf("agentapi: decode /backup/status: %w", decodeErr)
	}
	return status, nil
}
// ---- slice 8C: disk management (execution is the agent's) --------------------------------
// DiskInfo mirrors one entry of the agent's GET /disks response.
type DiskInfo struct {
	Name          string `json:"name"`
	Type          string `json:"type"`
	State         string `json:"state"`
	BackingDevice string `json:"backing_device"`
	MountPath     string `json:"mount_path"`
	Class         string `json:"class"`
	DataBearing   bool   `json:"data_bearing"`
	DataReason    string `json:"data_reason"`
	// DurableID is the target's stable identity, e.g. "uuid:<fs-uuid>" for usb/local-dir. With
	// the "uuid:" prefix removed it is the fs UUID the controller hands to AssignDisk — the only
	// route by which the de-privileged controller learns a mount key it cannot read from the
	// device itself.
	DurableID string `json:"durable_id"`
}

// FSUUID extracts the bare filesystem UUID from a "uuid:<…>" DurableID. It returns "" when the
// identity is not a filesystem UUID (network/lvm targets, which are not assignable as a host
// mount). The prefix match is case-sensitive.
func (d DiskInfo) FSUUID() string {
	const prefix = "uuid:"
	if !strings.HasPrefix(d.DurableID, prefix) {
		return ""
	}
	return d.DurableID[len(prefix):]
}
// DisksResponse mirrors GET /disks: the guest's VMID plus one DiskInfo per listed drive.
// It is the type Client.Disks decodes into.
type DisksResponse struct {
VMID int `json:"vmid"`
Disks []DiskInfo `json:"disks"`
}
// FormatResult mirrors POST /disks/format (the success/refusal payload).
// When FormatDisk returns ErrFormatRefused it forces DataBearing to true and Formatted to false
// on the result it hands back; PendingOp is nil unless the payload contained "pending_op".
type FormatResult struct {
VMID int `json:"vmid"`
Device string `json:"device"`
Formatted bool `json:"formatted"`
DataBearing bool `json:"data_bearing"`
Reason string `json:"reason"`
PendingOp *PendingOp `json:"pending_op,omitempty"`
}
// PendingOp mirrors the destructive intent the agent binds when it refuses to format a
// data-bearing device. The controller can only display the matching `felhom-opsign` command
// built from it — it CANNOT carry out the destructive format on its own.
type PendingOp struct {
	Op        string `json:"op"`         // e.g. "storage_wipe"
	HostScope string `json:"host_scope"` // the agent's host id (anti-retarget)
	DurableID string `json:"durable_id"` // byid:…|byuuid:… — the device's stable identity
	FSType    string `json:"fstype"`     // the filesystem to mkfs after the wipe
}

// OpsignCommand renders the literal command line the operator has to run offline to authorize
// the wipe. Op, HostScope and DurableID are inserted verbatim (no quoting); FSType is not part
// of the command.
func (p PendingOp) OpsignCommand() string {
	const layout = "felhom-opsign -op %s -host %s -durable-id %s"
	return fmt.Sprintf(layout, p.Op, p.HostScope, p.DurableID)
}
// ErrFormatRefused is returned by FormatDisk when the agent refuses a data-bearing format
// (pending operator authorization — the 8C invariant). The UI surfaces this distinctly.
// FormatDisk returns this sentinel unwrapped, together with the decoded FormatResult, so callers
// can compare against it and still read the result's PendingOp.
var ErrFormatRefused = fmt.Errorf("agentapi: format refused — device is data-bearing (operator authorization required)")
// Disks fetches GET /disks: the host drives the agent manages, each carrying a data-bearing
// flag.
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) Disks(ctx context.Context) (DisksResponse, error) {
	payload, err := c.get(ctx, "/disks")
	if err != nil {
		return DisksResponse{}, err
	}

	var listing DisksResponse
	if decodeErr := json.Unmarshal(payload, &listing); decodeErr != nil {
		return listing, fmt.Errorf("agentapi: decode /disks: %w", decodeErr)
	}
	return listing, nil
}
// AssignDisk asks the agent (POST /disks/assign) to attach the drive identified by its fs-UUID
// as a host mount at where, with the given filesystem type and mount options (benign,
// self-serve). The response payload is discarded; only the request's error is reported.
func (c *Client) AssignDisk(ctx context.Context, uuid, where, fstype, options string) error {
	request := map[string]string{
		"uuid":    uuid,
		"where":   where,
		"fstype":  fstype,
		"options": options,
	}
	if _, err := c.post(ctx, "/disks/assign", request); err != nil {
		return err
	}
	return nil
}
// EjectResult mirrors POST /disks/eject (the dependent-guest warning).
// It is the type Client.EjectDisk decodes into; DependentGuests is decoded from the
// "dependent_guests" array of integers.
type EjectResult struct {
VMID int `json:"vmid"`
Ejected string `json:"ejected"`
DependentGuests []int `json:"dependent_guests"`
}
// EjectDisk asks the agent (POST /disks/eject) to safe-unmount the host mount at where (data
// preserved) and returns the agent's answer, including the guests that depend on the mount.
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) EjectDisk(ctx context.Context, where string) (EjectResult, error) {
	request := map[string]string{"where": where}
	payload, err := c.post(ctx, "/disks/eject", request)
	if err != nil {
		return EjectResult{}, err
	}

	var result EjectResult
	if decodeErr := json.Unmarshal(payload, &result); decodeErr != nil {
		return result, fmt.Errorf("agentapi: decode /disks/eject: %w", decodeErr)
	}
	return result, nil
}
// FormatDisk asks the agent (POST /disks/format) to format device with filesystem fstype. The
// AGENT inspects the device and decides data-bearing-ness — a data-bearing device is refused
// (ErrFormatRefused); the controller's claim is irrelevant. Only a device the agent reads as
// blank is formatted.
//
// Outcomes:
//   - transport or envelope failure: the error from postWithStatus, with a zero FormatResult.
//   - refusal (HTTP 403, or a payload reporting data_bearing without formatted): the decoded
//     result with DataBearing=true and Formatted=false, plus ErrFormatRefused. The result
//     carries PendingOp when the agent sent one, so the caller can show the opsign command.
//   - any other status that is not 200/202: an error naming the HTTP status. Without this check
//     a 401/404/500 that still carried a well-formed envelope would be reported as a nil-error
//     result, indistinguishable from a request the agent actually processed.
//   - otherwise: the decoded result and a nil error.
func (c *Client) FormatDisk(ctx context.Context, device, fstype string) (FormatResult, error) {
	var out FormatResult
	// Status-aware POST: the agent returns the FULL FormatResponse (incl. pending_op) even on the
	// 403 refusal, so the body must be read on non-2xx rather than discarded.
	data, status, err := c.postWithStatus(ctx, "/disks/format", map[string]string{"device": device, "fstype": fstype})
	if err != nil {
		return out, err
	}
	// data is the envelope's {data:…} payload (present on both success and the 403 refusal).
	if len(data) > 0 {
		_ = json.Unmarshal(data, &out) // deliberately best-effort; fields default on a missing/partial body
	}

	refused := status == http.StatusForbidden || (out.DataBearing && !out.Formatted)
	switch {
	case refused:
		out.DataBearing = true
		out.Formatted = false
		return out, ErrFormatRefused // carries PendingOp for the caller to surface the opsign command
	case status != http.StatusOK && status != http.StatusAccepted:
		// Same accepted-status set as post: anything else is a failed request, not a result.
		return out, fmt.Errorf("agentapi: POST /disks/format: HTTP %d", status)
	}
	return out, nil
}
// postWithStatus issues an authenticated JSON POST and returns the envelope's data payload plus
// the HTTP status, even on a non-2xx (so callers like FormatDisk can read a 403 refusal body).
//
// body is marshalled to JSON and sent with the bearer token. At most 1 MiB of the response is
// read. A transport failure, a failure while reading the body, or a body that is not a valid
// envelope is an error (the status is still returned once a response exists); an `ok:false`
// business refusal is NOT an error — the data carries it and the caller must inspect the status.
func (c *Client) postWithStatus(ctx context.Context, path string, body any) (json.RawMessage, int, error) {
	buf, err := json.Marshal(body)
	if err != nil {
		return nil, 0, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf))
	if err != nil {
		return nil, 0, err
	}
	req.Header.Set("Authorization", "Bearer "+c.token)
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, 0, fmt.Errorf("agentapi: POST %s: %w", path, err)
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
	if err != nil {
		// Report the real cause instead of letting a truncated body surface as "bad envelope".
		return nil, resp.StatusCode, fmt.Errorf("agentapi: POST %s: HTTP %d, read body: %w", path, resp.StatusCode, err)
	}

	var env apiResponse
	if err := json.Unmarshal(raw, &env); err != nil {
		return nil, resp.StatusCode, fmt.Errorf("agentapi: POST %s: HTTP %d, bad envelope: %w", path, resp.StatusCode, err)
	}
	return env.Data, resp.StatusCode, nil
}
// ---- slice 9: host metrics (the customer host-health view) -------------------------------
// HostMetrics mirrors the agent's GET /host/metrics `host` block (shared HostMetrics wire shape).
// CPUTempC is a pointer so a host with no temp sensor is null ("n/a"), distinct from a real 0.
// Each field is decoded from the JSON key named in its struct tag.
type HostMetrics struct {
Node string `json:"node"`
CPUPercent float64 `json:"cpu_percent"` // 0-100
MemoryTotalBytes int64 `json:"memory_total_bytes"`
MemoryUsedBytes int64 `json:"memory_used_bytes"`
MemoryPercent float64 `json:"memory_percent"`
DiskTotalBytes int64 `json:"disk_total_bytes"` // host root fs
DiskUsedBytes int64 `json:"disk_used_bytes"`
DiskPercent float64 `json:"disk_percent"`
LoadAvg []string `json:"loadavg"` // decoded as strings, not numbers
UptimeSeconds int64 `json:"uptime_seconds"`
CPUTempC *int `json:"cpu_temp_c"` // °C or null ("n/a")
}
// ThinPoolFill mirrors the agent's lvmthin pool fill (a full thin-pool corrupts every guest on it).
// MetadataUsedFraction is a pointer, so a JSON null (or a missing key) decodes to nil rather
// than 0.
type ThinPoolFill struct {
DataUsedFraction float64 `json:"data_used_fraction"`
MetadataUsedFraction *float64 `json:"metadata_used_fraction"`
}
// SmartSummary mirrors the agent's per-disk SMART health (only the fields the UI renders).
// The pointer fields are nil when the agent sends null, which it does when the device type does
// not expose that attribute.
type SmartSummary struct {
Health string `json:"health"` // PASSED | FAILING | UNKNOWN
TemperatureC *int `json:"temperature_c"`
PercentageUsed *int `json:"percentage_used"` // NVMe wear (%); null for SATA/USB
}
// StorageTarget mirrors the agent's GET /host/metrics storage_targets entry (the per-storage
// capacity + health the monitoring view renders). It is a SUBSET of the agent's wire shape — only
// the fields the UI reads; unknown JSON keys are ignored.
// ThinPool is a pointer and stays nil when the entry has no "thin_pool" object; Smart is a plain
// value, so it is the zero SmartSummary when the key is absent.
type StorageTarget struct {
Name string `json:"name"`
Type string `json:"type"`
State string `json:"state"`
Reachable bool `json:"reachable"`
TotalBytes int64 `json:"total_bytes"`
UsedBytes int64 `json:"used_bytes"`
AvailBytes int64 `json:"avail_bytes"`
UsedFraction float64 `json:"used_fraction"`
Content string `json:"content"`
MountPath string `json:"mount_path"`
ClassHint string `json:"class_hint"`
ThinPool *ThinPoolFill `json:"thin_pool,omitempty"`
Smart SmartSummary `json:"smart"`
}
// HostMetricsResponse mirrors the agent's GET /host/metrics payload (host-wide health + per-storage
// capacity). Host-wide and token-authed (one-customer-per-host); a fresh collect, not a snapshot.
// It is the type Client.HostMetrics decodes the envelope's data into.
type HostMetricsResponse struct {
VMID int `json:"vmid"`
Host HostMetrics `json:"host"`
StorageTargets []StorageTarget `json:"storage_targets"`
}
// HostMetrics fetches GET /host/metrics and returns the host's live health together with the
// per-storage capacity entries.
//
// A request failure is returned unchanged; a payload that does not decode is wrapped with the
// endpoint name.
func (c *Client) HostMetrics(ctx context.Context) (HostMetricsResponse, error) {
	payload, err := c.get(ctx, "/host/metrics")
	if err != nil {
		return HostMetricsResponse{}, err
	}

	var metrics HostMetricsResponse
	if decodeErr := json.Unmarshal(payload, &metrics); decodeErr != nil {
		return metrics, fmt.Errorf("agentapi: decode /host/metrics: %w", decodeErr)
	}
	return metrics, nil
}
// get issues an authenticated GET for path and unwraps the {ok,data,error} envelope, returning
// the envelope's raw data payload.
//
// At most 1 MiB of the response body is read. Errors:
//   - the request cannot be built or sent;
//   - the status is not 200 — the message names the status and, when the body is a parseable
//     envelope with a non-empty error, appends the agent's error text (previously that text was
//     read and then thrown away);
//   - the body cannot be read in full;
//   - the body is not a valid envelope;
//   - the envelope has ok:false (the envelope's error text is the message).
func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+c.token)

	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, fmt.Errorf("agentapi: GET %s: %w", path, err)
	}
	defer resp.Body.Close()

	raw, readErr := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
	if resp.StatusCode != http.StatusOK {
		// The status is the primary failure; the envelope's error text is attached best-effort.
		var env apiResponse
		if json.Unmarshal(raw, &env) == nil && env.Error != "" {
			return nil, fmt.Errorf("agentapi: GET %s: HTTP %d: %s", path, resp.StatusCode, env.Error)
		}
		return nil, fmt.Errorf("agentapi: GET %s: HTTP %d", path, resp.StatusCode)
	}
	if readErr != nil {
		// Report the real cause instead of letting a truncated body surface as "bad envelope".
		return nil, fmt.Errorf("agentapi: GET %s: read body: %w", path, readErr)
	}

	var env apiResponse
	if err := json.Unmarshal(raw, &env); err != nil {
		return nil, fmt.Errorf("agentapi: GET %s: bad envelope: %w", path, err)
	}
	if !env.OK {
		return nil, fmt.Errorf("agentapi: GET %s: %s", path, env.Error)
	}
	return env.Data, nil
}
// post issues an authenticated JSON POST for path and unwraps the {ok,data,error} envelope,
// returning the envelope's raw data payload. The agent returns 200 or 202 for accepted requests.
//
// body is marshalled to JSON. At most 1 MiB of the response body is read. Errors:
//   - body cannot be marshalled, or the request cannot be built or sent;
//   - the status is neither 200 nor 202 — the message names the status and, when the body is a
//     parseable envelope with a non-empty error, appends the agent's error text (previously that
//     text was read and then thrown away);
//   - the body cannot be read in full;
//   - the body is not a valid envelope;
//   - the envelope has ok:false (the envelope's error text is the message).
func (c *Client) post(ctx context.Context, path string, body any) (json.RawMessage, error) {
	buf, err := json.Marshal(body)
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+c.token)
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, fmt.Errorf("agentapi: POST %s: %w", path, err)
	}
	defer resp.Body.Close()

	raw, readErr := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
		// The status is the primary failure; the envelope's error text is attached best-effort.
		var env apiResponse
		if json.Unmarshal(raw, &env) == nil && env.Error != "" {
			return nil, fmt.Errorf("agentapi: POST %s: HTTP %d: %s", path, resp.StatusCode, env.Error)
		}
		return nil, fmt.Errorf("agentapi: POST %s: HTTP %d", path, resp.StatusCode)
	}
	if readErr != nil {
		// Report the real cause instead of letting a truncated body surface as "bad envelope".
		return nil, fmt.Errorf("agentapi: POST %s: read body: %w", path, readErr)
	}

	var env apiResponse
	if err := json.Unmarshal(raw, &env); err != nil {
		return nil, fmt.Errorf("agentapi: POST %s: bad envelope: %w", path, err)
	}
	if !env.OK {
		return nil, fmt.Errorf("agentapi: POST %s: %s", path, env.Error)
	}
	return env.Data, nil
}
// normalizeFingerprint canonicalizes a SHA-256 fingerprint: surrounding whitespace is trimmed,
// every ':', ' ' and tab separator is dropped, and the remainder is lowercased. The result must
// be exactly 64 characters of valid hex; otherwise an error describing the problem is returned
// together with an empty string.
func normalizeFingerprint(fp string) (string, error) {
	trimmed := strings.TrimSpace(fp)

	// Copy everything except the tolerated separators.
	var compact strings.Builder
	compact.Grow(len(trimmed))
	for i := 0; i < len(trimmed); i++ {
		switch ch := trimmed[i]; ch {
		case ':', ' ', '\t':
			// separator — skip
		default:
			compact.WriteByte(ch)
		}
	}

	digest := strings.ToLower(compact.String())
	if n := len(digest); n != 64 {
		return "", fmt.Errorf("agentapi: fingerprint must be a SHA-256 (64 hex chars), got %d", n)
	}
	if _, decodeErr := hex.DecodeString(digest); decodeErr != nil {
		return "", fmt.Errorf("agentapi: fingerprint is not valid hex: %w", decodeErr)
	}
	return digest, nil
}