Files
felhom-controller/controller/internal/agentapi/client.go
T
admin 13c6a0929a v0.57.0: stable host-storage list + per-app Tier-2 config panel
Part A of the UI-fixes/storage-spike spec.

A1: enrichHostStorageTargets sorts /api/host-metrics storage_targets
server-side and attaches friendly Hungarian labels + purpose, fixing the
#host-storage-bars reorder-on-poll bug. Display labels only — PVE storage
ids are never renamed.

A2: new GET/POST /stacks/{name}/backup Tier-2 config panel; the "2. mentés"
Beállítás button is repointed there from the dead-end deploy page. Customer
can pin a target drive or disable Tier 2; preference is preserved across the
runner's status writes. Always visible (single-SSD + non-HDD apps included).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 14:23:34 +02:00

573 lines
23 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package agentapi is the in-guest controller's client for the host agent's per-guest local
// API (doc 03 §6). It reaches the agent over the bridge, pinning the agent's self-signed leaf
// by SHA-256 (the same pin convention the agent uses for the Proxmox/PBS host certs), and
// authenticates with the per-guest bearer token. Slice 8A introduced GET /storage (connectivity
// + the controller learning its mounts); the later sections of this file add the /backup/due
// quiesce loop (8B), disk management (8C), host metrics (slice 9) and guest drive
// attach/reboot (slice 10).
package agentapi
import (
"bytes"
"context"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"encoding/hex"
"encoding/json"
"fmt"
"io"
"net/http"
"strings"
"time"
)
// Client talks to one agent local-API endpoint with a pinned leaf + bearer token.
// Build it with New: the zero value has a nil HTTP client and cannot issue requests.
type Client struct {
baseURL string // "https://" + the endpoint given to New; request paths are appended to it
token string // per-guest token, sent as "Authorization: Bearer <token>" on every request
hc *http.Client // HTTP client whose TLS config enforces the leaf-certificate pin
}
// MountInfo mirrors the agent's GET /storage mount entry (doc 03 §6). It is the element type of
// StorageResponse.Mounts.
type MountInfo struct {
Key string `json:"key"`
Storage string `json:"storage"`
MountPoint string `json:"mount_point"`
Class string `json:"class"` // fast | slow | ""
Backup bool `json:"backup"`
}
// StorageResponse mirrors the agent's GET /storage data payload. It is what (*Client).Storage
// decodes the envelope's data field into.
type StorageResponse struct {
VMID int `json:"vmid"`
Mounts []MountInfo `json:"mounts"`
}
// apiResponse is the agent's {ok,data,error} envelope. Every request helper in this file decodes
// the response body into it before handing Data to the endpoint-specific decoder.
type apiResponse struct {
OK bool `json:"ok"`
Data json.RawMessage `json:"data"` // kept raw so each caller can decode it into its own type
Error string `json:"error"` // message surfaced by get/post when OK is false
}
// New builds a pinned client for endpoint ("host:port") with the given per-guest token and the
// agent leaf-cert SHA-256 fingerprint (hex, ':'-separators tolerated). The pin is the trust
// anchor — the agent serves a self-signed cert, so chain verification is replaced by an exact
// leaf-DER SHA-256 match (fails closed on any mismatch).
//
// Both endpoint and token are trimmed of surrounding whitespace before use, so a value read from
// a file with a trailing newline still works and a whitespace-only value is rejected as missing.
// An error is returned when endpoint or token is empty, or when fingerprintHex does not
// normalize to a 64-hex-character SHA-256.
func New(endpoint, token, fingerprintHex string) (*Client, error) {
	endpoint = strings.TrimSpace(endpoint)
	if endpoint == "" {
		return nil, fmt.Errorf("agentapi: endpoint required")
	}
	// Trim the token exactly like the endpoint: an untrimmed trailing newline would end up in
	// the Authorization header value, which net/http refuses to send, failing every request.
	token = strings.TrimSpace(token)
	if token == "" {
		return nil, fmt.Errorf("agentapi: token required")
	}
	want, err := normalizeFingerprint(fingerprintHex)
	if err != nil {
		return nil, err
	}
	tlsCfg := &tls.Config{
		InsecureSkipVerify: true, // self-signed leaf — the pin below is the real check
		VerifyPeerCertificate: func(rawCerts [][]byte, _ [][]*x509.Certificate) error {
			if len(rawCerts) == 0 {
				return fmt.Errorf("agentapi: TLS pin: peer presented no certificate")
			}
			// rawCerts[0] is the leaf in DER form; want is already lowercase hex.
			got := sha256.Sum256(rawCerts[0])
			if hex.EncodeToString(got[:]) != want {
				return fmt.Errorf("agentapi: TLS pin mismatch: agent leaf SHA-256 does not match the bootstrap fingerprint")
			}
			return nil
		},
		MinVersion: tls.VersionTLS12,
	}
	return &Client{
		baseURL: "https://" + endpoint,
		token:   token,
		hc: &http.Client{
			Timeout:   15 * time.Second,
			Transport: &http.Transport{TLSClientConfig: tlsCfg},
		},
	}, nil
}
// Storage fetches GET /storage and decodes this guest's mount list. It serves both as a
// connectivity probe and as the controller's view of where its data is placed.
func (c *Client) Storage(ctx context.Context) (StorageResponse, error) {
	var resp StorageResponse
	raw, err := c.get(ctx, "/storage")
	if err == nil {
		if err = json.Unmarshal(raw, &resp); err != nil {
			err = fmt.Errorf("agentapi: decode /storage: %w", err)
		}
	}
	return resp, err
}
// ---- slice 8B: app-consistent backup (quiesce loop) -------------------------------------
// DueResponse mirrors the agent's GET /backup/due payload. AgeSecs is the age of the newest
// successful backup (nil when none has run yet). Returned by (*Client).BackupDue.
type DueResponse struct {
VMID int `json:"vmid"`
Due bool `json:"due"`
Reason string `json:"reason"`
AgeSecs *int64 `json:"age_seconds"` // pointer so an absent/null age decodes to nil, not 0
}
// BackupResponse mirrors the agent's POST /backup payload. Returned by (*Client).StartBackup;
// JobID identifies the job to poll.
type BackupResponse struct {
VMID int `json:"vmid"`
JobID string `json:"job_id"`
Phase string `json:"phase"`
}
// StatusResponse mirrors the agent's GET /backup/status payload. Backup is the latest RECORDED
// whole-guest backup (nil until one has run), surfaced to the controller's backup page for visibility.
// Returned by (*Client).BackupStatus.
type StatusResponse struct {
VMID int `json:"vmid"`
Phase string `json:"phase"` // idle | running | snapshotted | done | failed
JobID string `json:"job_id"`
Error string `json:"error"`
Backup *BackupRecord `json:"backup,omitempty"` // nil when the payload carries no backup record
}
// BackupRecord mirrors the agent's hub.Backup — one whole-guest vzdump/PBS backup result. The
// controller renders it read-only (it does NOT own whole-guest backup; the agent does). It reaches
// the controller as StatusResponse.Backup.
type BackupRecord struct {
TargetID string `json:"target_id"` // backup storage name (e.g. "local", "felhom-pbs")
VMID int `json:"vmid"`
Archive string `json:"archive"` // produced vzdump volid (e.g. "local:backup/vzdump-lxc-…")
Mode string `json:"mode"` // snapshot | stop
CrashConsistent bool `json:"crash_consistent"`
SizeBytes int64 `json:"size_bytes"`
Success bool `json:"success"`
Error string `json:"error,omitempty"`
StartedAt string `json:"started_at"` // RFC3339 timestamp, kept as the raw string (not parsed here)
DurationSeconds float64 `json:"duration_seconds"`
}
// RestoreTestRecord mirrors the agent's hub.RestoreTest — the latest self-restore-test (the "backup
// verified restorable" trust signal). Nil when none has run yet. Returned as a pointer by
// (*Client).RestoreTestStatus.
type RestoreTestRecord struct {
SourceArchive string `json:"source_archive"`
SourceTier string `json:"source_tier"` // "local" (pbs = Phase B)
Pass bool `json:"pass"`
Verified string `json:"verified"` // "boot+running" this slice
Error string `json:"error,omitempty"`
TestedAt string `json:"tested_at"` // RFC3339 timestamp, kept as the raw string (not parsed here)
DurationSeconds float64 `json:"duration_seconds"`
Warnings []string `json:"warnings,omitempty"`
}
// Backup status phases (mirror the agent's vocabulary). These are the string values compared
// against the Phase field of the backup responses.
// NOTE(review): the comment on StatusResponse.Phase also lists "snapshotted", which has no
// constant in this block — confirm whether one is needed.
const (
PhaseIdle = "idle"
PhaseRunning = "running"
PhaseDone = "done"
PhaseFailed = "failed"
)
// BackupDue asks the agent (GET /backup/due) whether a policy-scheduled backup is due for this
// guest; a due answer is the trigger for the quiesce loop.
func (c *Client) BackupDue(ctx context.Context) (DueResponse, error) {
	raw, err := c.get(ctx, "/backup/due")
	if err != nil {
		return DueResponse{}, err
	}
	var due DueResponse
	if decodeErr := json.Unmarshal(raw, &due); decodeErr != nil {
		return due, fmt.Errorf("agentapi: decode /backup/due: %w", decodeErr)
	}
	return due, nil
}
// StartBackup posts an empty JSON object to /backup, enqueueing a backup of this guest (the
// agent's vzdump), and returns the job descriptor the caller should poll.
func (c *Client) StartBackup(ctx context.Context) (BackupResponse, error) {
	var job BackupResponse
	raw, err := c.post(ctx, "/backup", struct{}{})
	if err == nil {
		if err = json.Unmarshal(raw, &job); err != nil {
			err = fmt.Errorf("agentapi: decode POST /backup: %w", err)
		}
	}
	return job, err
}
// BackupStatus fetches GET /backup/status: the phase of the current or most recent backup job
// for this guest.
func (c *Client) BackupStatus(ctx context.Context) (StatusResponse, error) {
	raw, err := c.get(ctx, "/backup/status")
	if err != nil {
		return StatusResponse{}, err
	}
	var st StatusResponse
	if decodeErr := json.Unmarshal(raw, &st); decodeErr != nil {
		return st, fmt.Errorf("agentapi: decode /backup/status: %w", decodeErr)
	}
	return st, nil
}
// RestoreTestStatus fetches GET /restore-test/status and returns the most recent
// self-restore-test result. The agent payload is {"restore_test": {...}|null}, so the returned
// pointer is nil when no test has run yet.
func (c *Client) RestoreTestStatus(ctx context.Context) (*RestoreTestRecord, error) {
	// wrapper matches the single-key object the agent sends.
	type wrapper struct {
		RestoreTest *RestoreTestRecord `json:"restore_test"`
	}
	raw, err := c.get(ctx, "/restore-test/status")
	if err != nil {
		return nil, err
	}
	var payload wrapper
	if err = json.Unmarshal(raw, &payload); err != nil {
		return nil, fmt.Errorf("agentapi: decode /restore-test/status: %w", err)
	}
	return payload.RestoreTest, nil
}
// ---- slice 8C: disk management (execution is the agent's) --------------------------------
// DiskInfo is one entry of the agent's GET /disks listing.
type DiskInfo struct {
	Name          string `json:"name"`
	Type          string `json:"type"`
	State         string `json:"state"`
	BackingDevice string `json:"backing_device"`
	MountPath     string `json:"mount_path"`
	Class         string `json:"class"`
	// Role is the protection tier as decided by the agent (authoritative): "system" | "backup" |
	// "user-data". The UI follows it: system/backup drives show a lock badge and offer NO
	// destructive controls, while user-data drives are customer-manageable (eject/wipe behind a
	// type-to-confirm step).
	Role         string  `json:"role"`
	DataBearing  bool    `json:"data_bearing"`
	DataReason   string  `json:"data_reason"`
	TotalBytes   int64   `json:"total_bytes"`
	UsedBytes    int64   `json:"used_bytes"`
	UsedFraction float64 `json:"used_fraction"`
	// DurableID is the stable identity of the target (e.g. "uuid:<fs-uuid>" for usb/local-dir).
	// With the "uuid:" prefix removed it is the fs UUID the controller hands to AssignDisk — the
	// de-privileged controller cannot read that key off the device, so this is its only source.
	DurableID string `json:"durable_id"`
}

// FSUUID extracts the bare filesystem UUID from a "uuid:<…>" DurableID. It returns "" when the
// disk's identity is not a filesystem UUID (network/lvm targets — not assignable as a host mount).
func (d DiskInfo) FSUUID() string {
	const prefix = "uuid:"
	if !strings.HasPrefix(d.DurableID, prefix) {
		return ""
	}
	return d.DurableID[len(prefix):]
}
// DisksResponse mirrors GET /disks. It is what (*Client).Disks decodes the envelope's data
// field into.
type DisksResponse struct {
VMID int `json:"vmid"`
Disks []DiskInfo `json:"disks"`
}
// FormatResult mirrors POST /disks/format (the success/refusal payload). It is returned by
// (*Client).FormatDisk on success AND alongside its refusal errors; FormatDisk forces DataBearing
// to true before returning ErrNeedsConfirmation or ErrFormatRefused.
type FormatResult struct {
VMID int `json:"vmid"`
Device string `json:"device"`
Formatted bool `json:"formatted"`
DataBearing bool `json:"data_bearing"`
Reason string `json:"reason"`
// Role is the agent's tier for this device (system | backup | user-data).
Role string `json:"role,omitempty"`
// NeedsConfirmation is set on a USER-DATA data-bearing refusal: re-submit with confirmed=true +
// DurableID after the type-to-confirm UI (NOT an operator signature).
NeedsConfirmation bool `json:"needs_confirmation,omitempty"`
DurableID string `json:"durable_id,omitempty"`
// PendingOp is set on a SYSTEM/BACKUP data-bearing refusal — the operator-signature op.
PendingOp *PendingOp `json:"pending_op,omitempty"`
}
// PendingOp is the destructive intent the agent binds when it refuses a data-bearing format.
// The controller only displays the exact `felhom-opsign` command derived from it — it CANNOT
// complete a destructive format on its own.
type PendingOp struct {
	Op        string `json:"op"`         // e.g. "storage_wipe"
	HostScope string `json:"host_scope"` // the agent's host id (anti-retarget)
	DurableID string `json:"durable_id"` // byid:…|byuuid:… — the device's stable identity
	FSType    string `json:"fstype"`     // the filesystem to mkfs after the wipe
}

// OpsignCommand renders the literal command line the operator has to run offline to authorize
// the wipe. FSType is not part of the command.
func (p PendingOp) OpsignCommand() string {
	const layout = "felhom-opsign -op %s -host %s -durable-id %s"
	return fmt.Sprintf(layout, p.Op, p.HostScope, p.DurableID)
}
// Sentinel errors returned by FormatDisk together with the populated FormatResult.
var (
	// ErrFormatRefused signals that the agent refused a data-bearing format on a SYSTEM/BACKUP
	// device: an operator signature is required, and the UI shows the pending opsign command.
	ErrFormatRefused = fmt.Errorf("agentapi: format refused — system/backup device (operator authorization required)")

	// ErrNeedsConfirmation signals that the agent refused a data-bearing format on a USER-DATA
	// device until the CUSTOMER gives informed confirmation (bound to FormatResult.DurableID).
	// The UI runs the type-to-confirm flow and re-submits with confirmed=true + that durable id.
	ErrNeedsConfirmation = fmt.Errorf("agentapi: format needs customer confirmation — user-data device")
)
// Disks fetches GET /disks: the host drives the agent manages, each carrying its data-bearing
// flag.
func (c *Client) Disks(ctx context.Context) (DisksResponse, error) {
	var listing DisksResponse
	raw, err := c.get(ctx, "/disks")
	if err == nil {
		if err = json.Unmarshal(raw, &listing); err != nil {
			err = fmt.Errorf("agentapi: decode /disks: %w", err)
		}
	}
	return listing, err
}
// AssignDisk posts to /disks/assign to attach a drive, identified by its fs-UUID, as a host
// mount (benign, self-serve). The response payload is discarded; only the error is reported.
func (c *Client) AssignDisk(ctx context.Context, uuid, where, fstype, options string) error {
	params := map[string]string{
		"uuid":    uuid,
		"where":   where,
		"fstype":  fstype,
		"options": options,
	}
	if _, err := c.post(ctx, "/disks/assign", params); err != nil {
		return err
	}
	return nil
}
// GuestAttach posts to /disks/guest-attach to bind an enrolled drive's felhom-data namespace
// into THIS guest (slice 10 P2, Model A). The drive has to be host-mounted at `where` already
// (the enroll flow's assign step does that). The agent treats the call as idempotent (an
// already-bound drive yields its existing slot). A nil return means success.
func (c *Client) GuestAttach(ctx context.Context, where string) error {
	params := map[string]string{"where": where}
	if _, err := c.post(ctx, "/disks/guest-attach", params); err != nil {
		return err
	}
	return nil
}
// GuestReboot posts an empty JSON object to /guest/reboot, rebooting THIS guest so that
// persisted-but-inactive drive binds become active (slice 10 P2 activation). Host-side live
// injection is blocked on an unprivileged guest, so a drive enrolled into a running guest only
// activates at the next boot. The agent detaches the reboot and answers 202; this guest (and the
// controller with it) restarts shortly afterwards. User-triggered ("Újraindítás most").
func (c *Client) GuestReboot(ctx context.Context) error {
	if _, err := c.post(ctx, "/guest/reboot", struct{}{}); err != nil {
		return err
	}
	return nil
}
// EjectResult mirrors POST /disks/eject (the dependent-guest warning). Returned by
// (*Client).EjectDisk.
type EjectResult struct {
VMID int `json:"vmid"`
Ejected string `json:"ejected"`
DependentGuests []int `json:"dependent_guests"`
}
// EjectDisk posts to /disks/eject to safe-unmount the host mount at `where` (data preserved)
// and returns the agent's answer, including the guests that depend on that mount.
func (c *Client) EjectDisk(ctx context.Context, where string) (EjectResult, error) {
	raw, err := c.post(ctx, "/disks/eject", map[string]string{"where": where})
	if err != nil {
		return EjectResult{}, err
	}
	var res EjectResult
	if decodeErr := json.Unmarshal(raw, &res); decodeErr != nil {
		return res, fmt.Errorf("agentapi: decode /disks/eject: %w", decodeErr)
	}
	return res, nil
}
// FormatDisk asks the agent to format a device. The AGENT inspects the device and decides by
// ROLE (its own classification, never something the controller claims). Outcomes:
//   - blank device → formatted, nil error.
//   - user-data, data-bearing, NOT confirmed → ErrNeedsConfirmation (result.DurableID is the id to confirm).
//   - user-data, data-bearing, confirmed + matching durable id → formatted, nil error.
//   - system/backup, data-bearing → ErrFormatRefused (result.PendingOp carries the operator opsign command).
//
// confirmed + durableID authorize a user-data wipe (durableID being the one the agent handed out
// with the earlier ErrNeedsConfirmation); for system/backup devices they have no effect.
//
// The FormatResult is populated on the refusal errors too, with DataBearing forced to true.
//
// NOTE(review): any other response (not formatted, not data-bearing, status other than 403)
// falls through to a nil error with Formatted=false — callers presumably must check Formatted;
// confirm that is intended.
func (c *Client) FormatDisk(ctx context.Context, device, fstype string, confirmed bool, durableID string) (FormatResult, error) {
	reqBody := map[string]any{
		"device":     device,
		"fstype":     fstype,
		"confirmed":  confirmed,
		"durable_id": durableID,
	}
	var res FormatResult
	// The status-aware POST is required here: the agent ships the FULL FormatResponse (incl.
	// pending_op / durable_id) on the 403 refusal as well, so a non-2xx body must not be dropped.
	payload, code, err := c.postWithStatus(ctx, "/disks/format", reqBody)
	if err != nil {
		return res, err
	}
	// payload is the envelope's {data:…} part, present on success and on the 403 refusal alike.
	if len(payload) != 0 {
		_ = json.Unmarshal(payload, &res) // best-effort; fields default on a missing/partial body
	}
	switch {
	case res.Formatted:
		return res, nil
	case res.NeedsConfirmation:
		// user-data: the UI runs the type-to-confirm flow
		res.DataBearing = true
		return res, ErrNeedsConfirmation
	case code == http.StatusForbidden || res.DataBearing:
		// system/backup: the UI shows the opsign command
		res.DataBearing = true
		return res, ErrFormatRefused
	default:
		return res, nil
	}
}
// postWithStatus issues an authenticated JSON POST and returns the envelope's data payload + the HTTP
// status, even on a non-2xx (so callers like FormatDisk can read a 403 refusal body). A transport,
// body-read or envelope-parse failure is still an error; an `ok:false` business refusal is NOT
// (the data carries it).
//
// body is JSON-encoded as the request payload. The returned status is 0 when the request never
// produced a response (marshal, request-build or transport failure) and the real HTTP status
// otherwise, including on the body-read and envelope-parse errors.
func (c *Client) postWithStatus(ctx context.Context, path string, body any) (json.RawMessage, int, error) {
	buf, err := json.Marshal(body)
	if err != nil {
		return nil, 0, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf))
	if err != nil {
		return nil, 0, err
	}
	req.Header.Set("Authorization", "Bearer "+c.token)
	req.Header.Set("Content-Type", "application/json")
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, 0, fmt.Errorf("agentapi: POST %s: %w", path, err)
	}
	defer resp.Body.Close()
	// Cap the body at 1 MiB. A failed read is reported as such instead of being passed on to the
	// JSON decoder, where it would surface as a misleading "bad envelope".
	raw, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
	if err != nil {
		return nil, resp.StatusCode, fmt.Errorf("agentapi: POST %s: HTTP %d, read body: %w", path, resp.StatusCode, err)
	}
	var env apiResponse
	if err := json.Unmarshal(raw, &env); err != nil {
		return nil, resp.StatusCode, fmt.Errorf("agentapi: POST %s: HTTP %d, bad envelope: %w", path, resp.StatusCode, err)
	}
	return env.Data, resp.StatusCode, nil
}
// ---- slice 9: host metrics (the customer host-health view) -------------------------------
// HostMetrics mirrors the agent's GET /host/metrics `host` block (shared HostMetrics wire shape).
// CPUTempC is a pointer so a host with no temp sensor is null ("n/a"), distinct from a real 0.
// It is carried in HostMetricsResponse.Host.
type HostMetrics struct {
Node string `json:"node"`
CPUPercent float64 `json:"cpu_percent"` // percentage, 0-100
MemoryTotalBytes int64 `json:"memory_total_bytes"`
MemoryUsedBytes int64 `json:"memory_used_bytes"`
MemoryPercent float64 `json:"memory_percent"`
DiskTotalBytes int64 `json:"disk_total_bytes"` // host root fs
DiskUsedBytes int64 `json:"disk_used_bytes"`
DiskPercent float64 `json:"disk_percent"`
LoadAvg []string `json:"loadavg"` // load averages, transported as strings (not parsed here)
UptimeSeconds int64 `json:"uptime_seconds"`
CPUTempC *int `json:"cpu_temp_c"` // °C or null ("n/a")
}
// ThinPoolFill mirrors the agent's lvmthin pool fill (a full thin-pool corrupts every guest on it).
// It is carried in StorageTarget.ThinPool.
type ThinPoolFill struct {
DataUsedFraction float64 `json:"data_used_fraction"`
MetadataUsedFraction *float64 `json:"metadata_used_fraction"` // pointer: null/absent decodes to nil
}
// SmartSummary mirrors the agent's per-disk SMART health (only the fields the UI renders). Pointers
// are null when the device type does not expose that attribute. It is carried in
// StorageTarget.Smart.
type SmartSummary struct {
Health string `json:"health"` // PASSED | FAILING | UNKNOWN
TemperatureC *int `json:"temperature_c"`
PercentageUsed *int `json:"percentage_used"` // NVMe wear (%); null for SATA/USB
}
// StorageTarget mirrors the agent's GET /host/metrics storage_targets entry (the per-storage
// capacity + health the monitoring view renders). It is a SUBSET of the agent's wire shape — only
// the fields the UI reads; unknown JSON keys are ignored. It is the element type of
// HostMetricsResponse.StorageTargets.
type StorageTarget struct {
Name string `json:"name"`
Type string `json:"type"`
State string `json:"state"`
Reachable bool `json:"reachable"`
TotalBytes int64 `json:"total_bytes"`
UsedBytes int64 `json:"used_bytes"`
AvailBytes int64 `json:"avail_bytes"`
UsedFraction float64 `json:"used_fraction"`
Content string `json:"content"`
MountPath string `json:"mount_path"`
ClassHint string `json:"class_hint"`
ThinPool *ThinPoolFill `json:"thin_pool,omitempty"` // nil when the entry carries no thin_pool object
Smart SmartSummary `json:"smart"`
// Label and Purpose are controller-side display enrichment (NOT from the agent): a friendly
// Hungarian name + one-line purpose so the customer understands what each storage holds. The
// raw PVE storage id stays in Name (display-only labels — we never rename the actual storage).
// Both are omitted from the encoded JSON while empty.
Label string `json:"label,omitempty"`
Purpose string `json:"purpose,omitempty"`
}
// HostMetricsResponse mirrors the agent's GET /host/metrics payload (host-wide health + per-storage
// capacity). Host-wide and token-authed (one-customer-per-host); a fresh collect, not a snapshot.
// Returned by (*Client).HostMetrics.
type HostMetricsResponse struct {
VMID int `json:"vmid"`
Host HostMetrics `json:"host"`
StorageTargets []StorageTarget `json:"storage_targets"`
}
// HostMetrics fetches GET /host/metrics: the host's live health figures together with the
// per-storage capacity entries.
func (c *Client) HostMetrics(ctx context.Context) (HostMetricsResponse, error) {
	raw, err := c.get(ctx, "/host/metrics")
	if err != nil {
		return HostMetricsResponse{}, err
	}
	var metrics HostMetricsResponse
	if decodeErr := json.Unmarshal(raw, &metrics); decodeErr != nil {
		return metrics, fmt.Errorf("agentapi: decode /host/metrics: %w", decodeErr)
	}
	return metrics, nil
}
// get issues an authenticated GET and unwraps the {ok,data,error} envelope.
//
// It returns the envelope's raw data payload on HTTP 200 with ok:true. Every other outcome is an
// error: a transport failure, a non-200 status (with the agent's envelope error text appended
// when the body carries one), a failed body read, an undecodable envelope, or ok:false.
func (c *Client) get(ctx context.Context, path string) (json.RawMessage, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+c.token)
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, fmt.Errorf("agentapi: GET %s: %w", path, err)
	}
	defer resp.Body.Close()
	// Cap the body at 1 MiB. The read error is held back until after the status check so a
	// non-200 is always reported as such, whatever happened to its body.
	raw, readErr := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
	if resp.StatusCode != http.StatusOK {
		// Best-effort: surface the agent's own explanation instead of a bare status code.
		var env apiResponse
		if json.Unmarshal(raw, &env) == nil && env.Error != "" {
			return nil, fmt.Errorf("agentapi: GET %s: HTTP %d: %s", path, resp.StatusCode, env.Error)
		}
		return nil, fmt.Errorf("agentapi: GET %s: HTTP %d", path, resp.StatusCode)
	}
	if readErr != nil {
		// Report the read failure directly; decoding a partial body would blame the envelope.
		return nil, fmt.Errorf("agentapi: GET %s: read body: %w", path, readErr)
	}
	var env apiResponse
	if err := json.Unmarshal(raw, &env); err != nil {
		return nil, fmt.Errorf("agentapi: GET %s: bad envelope: %w", path, err)
	}
	if !env.OK {
		return nil, fmt.Errorf("agentapi: GET %s: %s", path, env.Error)
	}
	return env.Data, nil
}
// post issues an authenticated JSON POST and unwraps the {ok,data,error} envelope. The agent
// returns 200 or 202 for accepted requests.
//
// body is JSON-encoded as the request payload. It returns the envelope's raw data payload on
// HTTP 200/202 with ok:true. Every other outcome is an error: a marshal or transport failure, any
// other status (with the agent's envelope error text appended when the body carries one), a
// failed body read, an undecodable envelope, or ok:false.
func (c *Client) post(ctx context.Context, path string, body any) (json.RawMessage, error) {
	buf, err := json.Marshal(body)
	if err != nil {
		return nil, err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(buf))
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+c.token)
	req.Header.Set("Content-Type", "application/json")
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, fmt.Errorf("agentapi: POST %s: %w", path, err)
	}
	defer resp.Body.Close()
	// Cap the body at 1 MiB. The read error is held back until after the status check so an
	// unaccepted status is always reported as such, whatever happened to its body.
	raw, readErr := io.ReadAll(io.LimitReader(resp.Body, 1<<20))
	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
		// Best-effort: surface the agent's own explanation instead of a bare status code.
		var env apiResponse
		if json.Unmarshal(raw, &env) == nil && env.Error != "" {
			return nil, fmt.Errorf("agentapi: POST %s: HTTP %d: %s", path, resp.StatusCode, env.Error)
		}
		return nil, fmt.Errorf("agentapi: POST %s: HTTP %d", path, resp.StatusCode)
	}
	if readErr != nil {
		// Report the read failure directly; decoding a partial body would blame the envelope.
		return nil, fmt.Errorf("agentapi: POST %s: read body: %w", path, readErr)
	}
	var env apiResponse
	if err := json.Unmarshal(raw, &env); err != nil {
		return nil, fmt.Errorf("agentapi: POST %s: bad envelope: %w", path, err)
	}
	if !env.OK {
		return nil, fmt.Errorf("agentapi: POST %s: %s", path, env.Error)
	}
	return env.Data, nil
}
// normalizeFingerprint canonicalizes a certificate fingerprint: surrounding whitespace is
// trimmed, every ':', ' ' and tab separator is dropped, and the rest is lowercased. The result
// must be exactly 64 hex characters (a SHA-256); anything else is an error.
func normalizeFingerprint(fp string) (string, error) {
	trimmed := strings.TrimSpace(fp)
	var b strings.Builder
	b.Grow(len(trimmed))
	for i := 0; i < len(trimmed); i++ {
		switch ch := trimmed[i]; ch {
		case ':', ' ', '\t':
			// separator — dropped
		default:
			b.WriteByte(ch)
		}
	}
	digest := strings.ToLower(b.String())
	if n := len(digest); n != 64 {
		return "", fmt.Errorf("agentapi: fingerprint must be a SHA-256 (64 hex chars), got %d", n)
	}
	if _, err := hex.DecodeString(digest); err != nil {
		return "", fmt.Errorf("agentapi: fingerprint is not valid hex: %w", err)
	}
	return digest, nil
}