From a042316d6d555b3c8c41f9188905411647032f5b Mon Sep 17 00:00:00 2001 From: kisfenyo Date: Mon, 8 Jun 2026 14:34:32 +0200 Subject: [PATCH] feat(agent): scaffold + proxmox interaction layer (slice 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand up the felhom-agent project (module gitea.dooplex.hu/admin/felhom-agent, binary felhom-agent) and the internal/proxmox package: the typed library every other agent module calls to talk to Proxmox. - API-first Client (hand-rolled REST over net/http, PVEAPIToken auth) with typed read ops (version/nodes/status/lxc/config/storage) and async mutating ops (restore/vzdump/snapshot/rollback/delete-snapshot/setconfig/start/stop), each returning a UPID. WaitTask polls task status until stopped and asserts exitstatus OK (authz can surface at task exec, not the POST — phase1-2 §1.3). - Fenced Privileged (root-CLI) backend for the THREE proven exceptions only (keyctl pct create, USB mount/fstab, SMART/sensors); each cites why it can't be the API. Fence is structural (Client never shells out, Privileged never HTTPs) and asserted in routing_test.go. - TLS: SHA-256 leaf-cert pinning or CA file; insecure mode explicit + off by default. No blanket verification disable. - 403 -> privilege-named APIError; failed task -> privilege-named TaskError. - JSON config + env overrides (token never logged); slog logging. - cmd/felhom-agent --selftest (read-only health report) + gated --selftest=task (reversible snapshot/rollback/delete exercise of WaitTask). No daemon loop yet. - Types grounded in the spike findings and exact JSON shapes captured live from demo-felhom (PVE 9.2.2). Unit tests use a mock transport + runner. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 11 ++ README.md | 126 +++++++++++++++++ cmd/felhom-agent/main.go | 234 +++++++++++++++++++++++++++++++ configs/agent.example.json | 17 +++ go.mod | 3 + internal/config/config.go | 145 +++++++++++++++++++ internal/config/config_test.go | 59 ++++++++ internal/log/log.go | 28 ++++ internal/proxmox/client.go | 154 ++++++++++++++++++++ internal/proxmox/client_test.go | 102 ++++++++++++++ internal/proxmox/doc.go | 62 ++++++++ internal/proxmox/errors.go | 81 +++++++++++ internal/proxmox/mock_test.go | 50 +++++++ internal/proxmox/mutate.go | 148 +++++++++++++++++++ internal/proxmox/privileged.go | 203 +++++++++++++++++++++++++++ internal/proxmox/query.go | 78 +++++++++++ internal/proxmox/routing_test.go | 97 +++++++++++++ internal/proxmox/task.go | 141 +++++++++++++++++++ internal/proxmox/task_test.go | 81 +++++++++++ internal/proxmox/tls.go | 88 ++++++++++++ internal/proxmox/tls_test.go | 46 ++++++ internal/proxmox/types.go | 164 ++++++++++++++++++++++ internal/proxmox/upid.go | 63 +++++++++ internal/proxmox/upid_test.go | 59 ++++++++ 24 files changed, 2240 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 cmd/felhom-agent/main.go create mode 100644 configs/agent.example.json create mode 100644 go.mod create mode 100644 internal/config/config.go create mode 100644 internal/config/config_test.go create mode 100644 internal/log/log.go create mode 100644 internal/proxmox/client.go create mode 100644 internal/proxmox/client_test.go create mode 100644 internal/proxmox/doc.go create mode 100644 internal/proxmox/errors.go create mode 100644 internal/proxmox/mock_test.go create mode 100644 internal/proxmox/mutate.go create mode 100644 internal/proxmox/privileged.go create mode 100644 internal/proxmox/query.go create mode 100644 internal/proxmox/routing_test.go create mode 100644 internal/proxmox/task.go create mode 100644 internal/proxmox/task_test.go create mode 100644 
internal/proxmox/tls.go create mode 100644 internal/proxmox/tls_test.go create mode 100644 internal/proxmox/types.go create mode 100644 internal/proxmox/upid.go create mode 100644 internal/proxmox/upid_test.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8465bde --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +# build output +/felhom-agent +/felhom-agent.exe +/dist/ + +# local config that may carry a real token secret +/agent.json +*.local.json + +# go +/vendor/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..e27b353 --- /dev/null +++ b/README.md @@ -0,0 +1,126 @@ +# felhom-agent + +The **host agent** for the Felhom platform — the operator-tier component that runs on each +Proxmox host and owns *all* Proxmox interaction (provision/restore guests, host storage, +backups, host+tunnel monitoring, hub control loop, per-guest local API). Design: +[`felhom.eu/documentation/architecture/03-host-agent.md`](https://gitea.dooplex.hu/admin/felhom.eu/raw/branch/main/documentation/architecture/03-host-agent.md). + +> **Status — slice 1 of N.** This repo currently contains the project scaffold and the +> **`internal/proxmox`** interaction layer (the typed library every other module will call to +> talk to Proxmox), plus a runnable read-only `--selftest`. **No** reconcile loop, hub client, +> signing, or storage/backup orchestration yet — those are later slices. + +Module: `gitea.dooplex.hu/admin/felhom-agent` · binary: `felhom-agent` · Go 1.24. 
+ +## Layout + +``` +cmd/felhom-agent/ # entry point + --selftest (wiring only; no daemon loop yet) +internal/proxmox/ # the Proxmox interaction layer (API-first + fenced root-CLI) +internal/config/ # JSON config + env overrides (secrets never logged) +internal/log/ # slog setup +configs/agent.example.json +``` + +## The `proxmox` package — model + +Two backends, one fixed routing policy (the fence is structural — `Client` never shells out, +`Privileged` never makes an HTTP call; asserted in `routing_test.go`): + +| | Backend | Used for | +|---|---|---| +| **API (default)** | `proxmox.Client` | everything the scoped **FelhomAgent** token can do | +| **root-CLI (fenced)** | `proxmox.Privileged` | the **three** proven OS-root exceptions only | + +Grounded entirely in the spike findings (`felhom.eu/documentation/proxmox-platform.md`, +`tests/phase{0,1-2,3}-findings.md`). Every mutating API op is **async**: it returns a UPID and +the caller `WaitTask`s until the task stops, then asserts `exitstatus == "OK"` — authorization +can surface at task execution, not the HTTP POST (phase1-2 §1.3). + +### Public surface + +`Client` (API): + +- Read: `Version`, `Nodes`, `NodeStatus`, `ListLXC`, `GuestStatus`, `GuestConfig`, + `ListStorage`, `NodeStorage`, `StorageContent`. +- Async mutating (return UPID): `RestoreLXC` (primary create path), `Vzdump`, `Snapshot`, + `Rollback`, `DeleteSnapshot`, `SetConfig`, `Start`, `Stop`. +- Tasks: `WaitTask`, `TaskStatusOnce`, `TaskLogTail`. +- Errors: `*APIError` (parses the offending privilege from a 403), `*TaskError` (parses it from + a failed task `exitstatus`). + +`Privileged` (fenced root-CLI) — each method documents *why it can't be the API*: + +- `CreateGoldenLXC` — `pct create` with `keyctl=1` (root@pam-only; the only root-fenced create — + the per-customer path provisions by **restore**, which preserves keyctl). +- `MountUSBByUUID` — host mount-by-UUID (not a Proxmox API op). +- `SMART`, `Sensors` — hardware reads (not API-exposed). 
+ +### API-vs-root routing table + +See the table in [`internal/proxmox/doc.go`](internal/proxmox/doc.go). Summary: the entire guest +lifecycle **including restore** is API-token-covered; OS-root is confined to golden-image +`keyctl` create, host mounts, and SMART/sensors (phase3 §B3). + +### TLS trust + +The host serves a self-signed cert. Verification is **not** blanket-disabled. Pick one in +config: `ca_file` (PEM, full verify), `fingerprint` (SHA-256 of the host leaf cert — pinned +exact-cert match; the `/nodes` API returns each node's `ssl_fingerprint` to pin), or the +explicitly-named `insecure_skip_verify` (off by default; selftest-against-127.0.0.1 only). + +## Provisioning the token (out-of-band, operator side) + +The agent only **consumes** a privilege-separated API token; role setup is a provisioning step. +The role must be granted on **both the user AND the token** for the same path, or the +intersection is empty and every call 403s (phase1-2 §1.2): + +```bash +pveum role add FelhomAgent -privs "VM.Allocate VM.Audit VM.Config.Disk VM.Config.CPU \ + VM.Config.Memory VM.Config.Network VM.Config.Options VM.PowerMgmt VM.Snapshot \ + VM.Snapshot.Rollback VM.Backup Datastore.Allocate Datastore.AllocateSpace \ + Datastore.Audit Sys.Audit SDN.Use" # 16 privileges, validated Phase 3 B3 +pveum user add felhom-agent@pve +pveum user token add felhom-agent@pve agent --privsep 1 # capture the secret (shown once) +pveum acl modify / -user 'felhom-agent@pve' -role FelhomAgent +pveum acl modify / -token 'felhom-agent@pve!agent' -role FelhomAgent +``` + +(`VM.Config.CPUMemory` is **not** a real privilege; `SDN.Use` **is** required for bridge use.) + +## Run + +```bash +go build ./... 
+# read-only health check against the host: +./felhom-agent --config configs/agent.example.json --selftest +# or via env (keeps the secret off disk): +FELHOM_AGENT_PROXMOX_TOKEN='felhom-agent@pve!agent=SECRET' \ +FELHOM_AGENT_PROXMOX_NODE=demo-felhom \ +FELHOM_AGENT_PROXMOX_ENDPOINT=https://192.168.0.162:8006 \ +FELHOM_AGENT_PROXMOX_TLS_FINGERPRINT='BA:7C:...:CF' \ + ./felhom-agent --selftest +``` + +`--selftest` (read-only) loads config, builds the API client, and runs the read queries (version, +nodes, node status, guests, storage), printing a short health report. It mutates nothing and says +so cleanly if the token/endpoint isn't configured. + +`--selftest=task --vmid N` (explicitly gated) exercises `WaitTask` on a **reversible** op +(snapshot → rollback → delete-snapshot) against guest `N`. Default `--selftest` never mutates. + +## Process model (proposed, not finalized — see 03 §3/§12) + +Native Go binary, systemd service, **non-root** service user holding the scoped token, with a +**narrow sudoers allowlist** for the three fenced ops. `privileged.mode: "sudo"` matches this; +`"direct"` is for dev/CI where the agent is already root. + +## Test + +```bash +go vet ./... && go test ./... +``` + +Unit tests use a mock HTTP transport + mock runner (no live host): UPID parse, `WaitTask` +(running→OK / running→failed-403 / timeout / ctx-cancel), 403→privilege-named error, response +decoding against the captured live shapes, and the API-vs-root routing fence. diff --git a/cmd/felhom-agent/main.go b/cmd/felhom-agent/main.go new file mode 100644 index 0000000..cc55f6d --- /dev/null +++ b/cmd/felhom-agent/main.go @@ -0,0 +1,234 @@ +// Command felhom-agent is the host agent (slice 1: scaffold + proxmox layer). +// +// This slice is wiring only: it has no daemon/reconcile loop yet (slice 3/4). 
It +// exposes a read-only --selftest that exercises the proxmox package against a live +// host, and an explicitly-gated --selftest=task that exercises WaitTask on a +// reversible op (snapshot -> rollback -> delete-snapshot). +package main + +import ( + "context" + "errors" + "flag" + "fmt" + "log/slog" + "os" + "os/signal" + "syscall" + "time" + + "gitea.dooplex.hu/admin/felhom-agent/internal/config" + applog "gitea.dooplex.hu/admin/felhom-agent/internal/log" + "gitea.dooplex.hu/admin/felhom-agent/internal/proxmox" +) + +func main() { + var ( + cfgPath string + selftest selftestFlag + vmid int + ) + flag.StringVar(&cfgPath, "config", envOr("FELHOM_AGENT_CONFIG", "/etc/felhom-agent/agent.json"), "path to the agent config file (JSON)") + flag.Var(&selftest, "selftest", "run a self-test and exit: bare/`read` = read-only queries; `task` = reversible mutating exercise (needs -vmid)") + flag.IntVar(&vmid, "vmid", 0, "guest VMID for --selftest=task (the reversible snapshot/rollback exercise)") + flag.Parse() + + cfg, err := config.Load(cfgPath) + if err != nil { + // A missing default config file is fine if env provides the values; only a + // present-but-unreadable/invalid file is fatal here. + if !(os.IsNotExist(errors.Unwrap(err)) && cfgPath == flag.Lookup("config").DefValue) { + fmt.Fprintln(os.Stderr, "config error:", err) + os.Exit(2) + } + cfg = config.Default() + } + logger := applog.New(cfg.LogLevel) + + switch selftest.mode { + case "": + // No daemon loop yet. + logger.Info("felhom-agent slice-1 scaffold; no run loop yet", + "hint", "use --selftest (read-only) or --selftest=task --vmid N") + // TODO: poll loop — slice 3/4. + return + case "read": + os.Exit(runSelftestRead(context.Background(), cfg, logger)) + case "task": + os.Exit(runSelftestTask(context.Background(), cfg, logger, vmid)) + } +} + +// runSelftestRead loads config, builds the API client, and runs the read-only +// queries against the live host, printing a short health report. 
It mutates +// nothing. Missing/invalid config is reported cleanly (no panic). +func runSelftestRead(ctx context.Context, cfg config.Config, logger *slog.Logger) int { + if err := cfg.Validate(); err != nil { + fmt.Fprintln(os.Stderr, "selftest: not configured:", err) + return 1 + } + logger.Info("selftest (read-only) starting", "config", fmt.Sprintf("%+v", cfg.Redacted().Proxmox)) + + client, err := proxmox.NewClient(proxmox.Config{ + Endpoint: cfg.Proxmox.Endpoint, + Node: cfg.Proxmox.Node, + Token: cfg.Proxmox.Token, + TLS: proxmox.TLSConfig{ + CAFile: cfg.Proxmox.TLS.CAFile, + Fingerprint: cfg.Proxmox.TLS.Fingerprint, + InsecureSkipVerify: cfg.Proxmox.TLS.InsecureSkipVerify, + }, + }) + if err != nil { + fmt.Fprintln(os.Stderr, "selftest: client init:", err) + return 1 + } + + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + fmt.Println("=== felhom-agent selftest (read-only) ===") + fmt.Printf("endpoint : %s node=%s\n", cfg.Proxmox.Endpoint, cfg.Proxmox.Node) + + fail := 0 + report := func(label string, err error) bool { + if err != nil { + fmt.Printf(" [FAIL] %-14s %v\n", label, err) + fail++ + return false + } + return true + } + + if v, err := client.Version(ctx); report("version", err) { + fmt.Printf(" [ ok ] %-14s PVE %s (release %s)\n", "version", v.Version, v.Release) + } + if nodes, err := client.Nodes(ctx); report("nodes", err) { + fmt.Printf(" [ ok ] %-14s %d node(s)\n", "nodes", len(nodes)) + for _, n := range nodes { + marker := " " + if n.Node == cfg.Proxmox.Node { + marker = "* " + } + fmt.Printf(" %s%s status=%s fp=%s…\n", marker, n.Node, n.Status, head(n.SSLFingerprint, 17)) + } + } + if s, err := client.NodeStatus(ctx); report("node status", err) { + fmt.Printf(" [ ok ] %-14s up %s, load %v, mem %s/%s, root %s/%s\n", "node status", + dur(s.Uptime), s.LoadAvg, + gib(s.Memory.Used), gib(s.Memory.Total), gib(s.RootFS.Used), gib(s.RootFS.Total)) + } + if gs, err := client.ListLXC(ctx); report("list lxc", err) { + 
fmt.Printf(" [ ok ] %-14s %d guest(s)\n", "list lxc", len(gs)) + for _, g := range gs { + fmt.Printf(" - %d %q status=%s\n", g.VMID, g.Name, g.Status) + } + } + if ss, err := client.NodeStorage(ctx); report("storage", err) { + fmt.Printf(" [ ok ] %-14s %d store(s)\n", "storage", len(ss)) + for _, s := range ss { + fmt.Printf(" - %-10s type=%-8s content=%s used=%s/%s\n", + s.Storage, s.Type, s.Content, gib(s.Used), gib(s.Total)) + } + } + + if fail > 0 { + fmt.Printf("=== selftest FAILED (%d check(s)) ===\n", fail) + return 1 + } + fmt.Println("=== selftest OK ===") + return 0 +} + +// runSelftestTask exercises WaitTask on a reversible op against -vmid: snapshot -> +// rollback -> delete-snapshot. Explicitly gated; never runs under bare --selftest. +func runSelftestTask(ctx context.Context, cfg config.Config, logger *slog.Logger, vmid int) int { + if err := cfg.Validate(); err != nil { + fmt.Fprintln(os.Stderr, "selftest: not configured:", err) + return 1 + } + if vmid == 0 { + fmt.Fprintln(os.Stderr, "selftest=task requires -vmid N (a guest safe to snapshot/rollback)") + return 2 + } + client, err := proxmox.NewClient(proxmox.Config{ + Endpoint: cfg.Proxmox.Endpoint, Node: cfg.Proxmox.Node, Token: cfg.Proxmox.Token, + TLS: proxmox.TLSConfig{ + CAFile: cfg.Proxmox.TLS.CAFile, Fingerprint: cfg.Proxmox.TLS.Fingerprint, + InsecureSkipVerify: cfg.Proxmox.TLS.InsecureSkipVerify, + }, + }) + if err != nil { + fmt.Fprintln(os.Stderr, "selftest: client init:", err) + return 1 + } + + // Ctrl-C aborts the wait cleanly. 
+ ctx, stop := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM) + defer stop() + + const snap = "felhom-selftest" + steps := []struct { + name string + do func() (string, error) + }{ + {"snapshot", func() (string, error) { return client.Snapshot(ctx, vmid, snap, "felhom-agent selftest") }}, + {"rollback", func() (string, error) { return client.Rollback(ctx, vmid, snap) }}, + {"delete-snapshot", func() (string, error) { return client.DeleteSnapshot(ctx, vmid, snap) }}, + } + fmt.Printf("=== felhom-agent selftest=task (vmid %d, snapshot %q) ===\n", vmid, snap) + for _, st := range steps { + upid, err := st.do() + if err != nil { + fmt.Printf(" [FAIL] %-16s %v\n", st.name, err) + return 1 + } + fmt.Printf(" .... %-16s upid=%s\n", st.name, upid) + status, err := client.WaitTask(ctx, upid, proxmox.WaitOptions{}) + if err != nil { + fmt.Printf(" [FAIL] %-16s %v\n", st.name, err) + return 1 + } + fmt.Printf(" [ ok ] %-16s exitstatus=%s\n", st.name, status.ExitStatus) + } + fmt.Println("=== selftest=task OK ===") + return 0 +} + +// --- small helpers / flag type --- + +func envOr(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func head(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] +} + +func dur(seconds int64) string { return (time.Duration(seconds) * time.Second).String() } + +func gib(bytes int64) string { return fmt.Sprintf("%.1fGiB", float64(bytes)/(1<<30)) } + +// selftestFlag is a flag.Value that also satisfies IsBoolFlag, so `--selftest` +// works bare (read-only) and `--selftest=task` / `--selftest=read` set the mode. 
// selftestFlag is a flag.Value that also satisfies IsBoolFlag, so `--selftest`
// works bare (read-only) and `--selftest=task` / `--selftest=read` set the mode.
// mode is "" (no self-test requested), "read" or "task".
type selftestFlag struct{ mode string }

// String returns the current mode ("" when no self-test was requested).
func (f *selftestFlag) String() string { return f.mode }

// IsBoolFlag tells the flag package the flag may appear without a value; a bare
// `--selftest` is then delivered to Set as "true".
func (f *selftestFlag) IsBoolFlag() bool { return true }

// Set parses the flag value. "true" (the bare form), "" and "read" select the
// read-only self-test and "task" selects the mutating exercise. Because the
// flag presents itself as boolean, the standard `--selftest=false` form is
// accepted too and clears the mode. Any other value is an error and leaves the
// mode unchanged.
func (f *selftestFlag) Set(v string) error {
	switch v {
	case "true", "", "read":
		f.mode = "read"
	case "task":
		f.mode = "task"
	case "false":
		f.mode = ""
	default:
		return fmt.Errorf("invalid --selftest value %q (want read|task)", v)
	}
	return nil
}
// Config is the agent configuration. Only the fields the proxmox interaction
// layer needs are present in this slice.
type Config struct {
	Proxmox    ProxmoxConfig    `json:"proxmox"`
	Privileged PrivilegedConfig `json:"privileged"`
	LogLevel   string           `json:"log_level"` // debug|info|warn|error (default info)
}

// ProxmoxConfig configures the API client.
type ProxmoxConfig struct {
	// Endpoint defaults to https://127.0.0.1:8006 (agent runs on the host).
	Endpoint string `json:"endpoint"`
	// Node is the Proxmox node name; confirm on the box (GET /nodes).
	Node string `json:"node"`
	// Token is the full API token "USER@REALM!TOKENID=SECRET".
	//
	// Provisioning note: this is a privilege-SEPARATED token. Its role
	// (FelhomAgent, 16 privileges) must be granted on BOTH the user AND the token
	// for the same path, or the intersection is empty and every call 403s
	// (phase1-2 §1.2). Role setup is out-of-band; the agent only consumes the token.
	Token string `json:"token"`
	// TLS trust to the host's (self-signed) cert.
	TLS TLSTrust `json:"tls"`
}

// TLSTrust mirrors proxmox.TLSConfig (kept dependency-free here).
type TLSTrust struct {
	CAFile             string `json:"ca_file"`
	Fingerprint        string `json:"fingerprint"`          // SHA-256 of the host leaf cert
	InsecureSkipVerify bool   `json:"insecure_skip_verify"` // off by default; selftest-only
}

// PrivilegedConfig configures the fenced root-CLI runner.
type PrivilegedConfig struct {
	// Mode: "sudo" (default — non-root agent + narrow sudoers) or "direct".
	Mode string `json:"mode"`
	// SudoPath overrides the sudo binary (default "sudo").
	SudoPath string `json:"sudo_path"`
}

// Default returns a Config pre-populated with sane defaults: the loopback API
// endpoint, the "sudo" privileged mode and the "info" log level.
func Default() Config {
	return Config{
		Proxmox:    ProxmoxConfig{Endpoint: "https://127.0.0.1:8006"},
		Privileged: PrivilegedConfig{Mode: "sudo"},
		LogLevel:   "info",
	}
}

// Load reads the config file at path (if non-empty) over the defaults, then
// applies environment overrides. An empty path with all-env config is allowed.
//
// When the file cannot be read or parsed, Load returns the error together with
// the config as it stood at that point; environment overrides have NOT been
// applied to it, so callers should not use it.
func Load(path string) (Config, error) {
	cfg := Default()
	if path != "" {
		b, err := os.ReadFile(path)
		if err != nil {
			return cfg, fmt.Errorf("config: reading %s: %w", path, err)
		}
		if err := json.Unmarshal(b, &cfg); err != nil {
			return cfg, fmt.Errorf("config: parsing %s: %w", path, err)
		}
	}
	applyEnv(&cfg)
	return cfg, nil
}

// applyEnv overlays FELHOM_AGENT_* environment variables. Useful for the token in
// particular (keep the secret out of the file on disk if desired). Unset or
// empty variables leave the existing value untouched.
func applyEnv(cfg *Config) {
	stringOverrides := []struct {
		name string
		dst  *string
	}{
		{"FELHOM_AGENT_PROXMOX_ENDPOINT", &cfg.Proxmox.Endpoint},
		{"FELHOM_AGENT_PROXMOX_NODE", &cfg.Proxmox.Node},
		{"FELHOM_AGENT_PROXMOX_TOKEN", &cfg.Proxmox.Token},
		{"FELHOM_AGENT_PROXMOX_TLS_CA_FILE", &cfg.Proxmox.TLS.CAFile},
		{"FELHOM_AGENT_PROXMOX_TLS_FINGERPRINT", &cfg.Proxmox.TLS.Fingerprint},
		{"FELHOM_AGENT_LOG_LEVEL", &cfg.LogLevel},
	}
	for _, o := range stringOverrides {
		if v := os.Getenv(o.name); v != "" {
			*o.dst = v
		}
	}
	if v := os.Getenv("FELHOM_AGENT_PROXMOX_TLS_INSECURE"); v != "" {
		// NOTE(review): a value strconv.ParseBool cannot parse is ignored and the
		// file/default setting is kept — confirm this is the intended behaviour.
		if b, err := strconv.ParseBool(v); err == nil {
			cfg.Proxmox.TLS.InsecureSkipVerify = b
		}
	}
}

// Validate checks the config is usable for talking to the API: endpoint, node
// and token must be set, and the token must contain both "!" and "=".
func (c Config) Validate() error {
	if c.Proxmox.Endpoint == "" {
		return fmt.Errorf("config: proxmox.endpoint is required")
	}
	if c.Proxmox.Node == "" {
		return fmt.Errorf("config: proxmox.node is required (confirm with `pvesh get /nodes`)")
	}
	if c.Proxmox.Token == "" {
		return fmt.Errorf("config: proxmox.token is required (set proxmox.token or FELHOM_AGENT_PROXMOX_TOKEN)")
	}
	if !strings.Contains(c.Proxmox.Token, "!") || !strings.Contains(c.Proxmox.Token, "=") {
		return fmt.Errorf("config: proxmox.token must be USER@REALM!TOKENID=SECRET")
	}
	return nil
}

// Redacted returns a copy safe to log: the token secret is masked. The receiver
// is a value, so the caller's Config is not modified.
func (c Config) Redacted() Config {
	if c.Proxmox.Token != "" {
		c.Proxmox.Token = redactToken(c.Proxmox.Token)
	}
	return c
}

// redactToken keeps the public "USER@REALM!TOKENID=" prefix and masks the secret.
//
// The cut is made at the FIRST "=": everything after it is treated as secret,
// so a secret that itself contains "=" is masked in full. A token without any
// "=" is masked entirely.
func redactToken(tok string) string {
	const mask = "********"
	if prefix, _, found := strings.Cut(tok, "="); found {
		return prefix + "=" + mask
	}
	return mask
}
+ if !strings.Contains(c.Proxmox.Token, "b6547d9d") { + t.Errorf("Redacted mutated the original config") + } +} + +func TestValidate(t *testing.T) { + c := Default() + c.Proxmox.Node = "demo-felhom" + c.Proxmox.Token = "felhom-agent@pve!agent=secret" + if err := c.Validate(); err != nil { + t.Fatalf("valid config rejected: %v", err) + } + c.Proxmox.Token = "no-bang-no-eq" + if err := c.Validate(); err == nil { + t.Errorf("malformed token accepted") + } +} + +func TestLoadFileThenEnvOverride(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "agent.json") + if err := os.WriteFile(path, []byte(`{"proxmox":{"node":"file-node","token":"u@pve!t=filesecret"}}`), 0o600); err != nil { + t.Fatal(err) + } + t.Setenv("FELHOM_AGENT_PROXMOX_NODE", "env-node") + cfg, err := Load(path) + if err != nil { + t.Fatalf("Load: %v", err) + } + if cfg.Proxmox.Node != "env-node" { + t.Errorf("env did not override node: %q", cfg.Proxmox.Node) + } + if cfg.Proxmox.Token != "u@pve!t=filesecret" { + t.Errorf("token from file lost: %q", cfg.Proxmox.Token) + } + if cfg.Proxmox.Endpoint != "https://127.0.0.1:8006" { + t.Errorf("default endpoint lost: %q", cfg.Proxmox.Endpoint) + } +} diff --git a/internal/log/log.go b/internal/log/log.go new file mode 100644 index 0000000..2ed70a4 --- /dev/null +++ b/internal/log/log.go @@ -0,0 +1,28 @@ +// Package log builds the agent's slog logger. Kept tiny on purpose; the agent is +// a host service, so logs go to stderr (journald-friendly). Secrets must never be +// passed to the logger — config is logged only via Config.Redacted (see config). +package log + +import ( + "log/slog" + "os" + "strings" +) + +// New returns a text slog.Logger at the given level ("debug"|"info"|"warn"| +// "error"; unknown falls back to info), writing to stderr. 
// New returns a text slog.Logger writing to stderr at the requested level.
// Recognised names (case-insensitive) are "debug", "info", "warn"/"warning"
// and "error"; anything else falls back to info.
func New(level string) *slog.Logger {
	named := map[string]slog.Level{
		"debug":   slog.LevelDebug,
		"warn":    slog.LevelWarn,
		"warning": slog.LevelWarn,
		"error":   slog.LevelError,
	}
	threshold, known := named[strings.ToLower(level)]
	if !known {
		threshold = slog.LevelInfo
	}
	opts := &slog.HandlerOptions{Level: threshold}
	return slog.New(slog.NewTextHandler(os.Stderr, opts))
}
+func NewClient(cfg Config) (*Client, error) { + if cfg.Endpoint == "" { + return nil, fmt.Errorf("proxmox: endpoint is required") + } + if cfg.Node == "" { + return nil, fmt.Errorf("proxmox: node is required") + } + if cfg.Token == "" { + return nil, fmt.Errorf("proxmox: API token is required") + } + tlsCfg, err := cfg.TLS.build() + if err != nil { + return nil, err + } + timeout := cfg.HTTPTimeout + if timeout == 0 { + timeout = 30 * time.Second + } + hc := &http.Client{ + Timeout: timeout, + Transport: &http.Transport{TLSClientConfig: tlsCfg}, + } + return &Client{ + base: strings.TrimRight(cfg.Endpoint, "/") + "/api2/json", + node: cfg.Node, + token: cfg.Token, + http: hc, + }, nil +} + +// Node returns the configured node name. +func (c *Client) Node() string { return c.node } + +// get performs GET and decodes the {"data": ...} envelope into out. +func (c *Client) get(ctx context.Context, path string, out any) error { + return c.do(ctx, http.MethodGet, path, nil, out) +} + +// postForm performs a form-encoded POST/PUT and decodes the envelope into out. +// out may be nil when the caller does not need the body. +func (c *Client) postForm(ctx context.Context, method, path string, params url.Values, out any) error { + var body io.Reader + if params != nil { + body = strings.NewReader(params.Encode()) + } + return c.doBody(ctx, method, path, body, "application/x-www-form-urlencoded", out) +} + +func (c *Client) do(ctx context.Context, method, path string, body io.Reader, out any) error { + return c.doBody(ctx, method, path, body, "", out) +} + +// doBody is the single HTTP chokepoint: builds the request, sets auth, executes, +// maps non-2xx to APIError, and decodes the data envelope. 
+func (c *Client) doBody(ctx context.Context, method, path string, body io.Reader, contentType string, out any) error { + req, err := http.NewRequestWithContext(ctx, method, c.base+path, body) + if err != nil { + return fmt.Errorf("proxmox: building request: %w", err) + } + req.Header.Set("Authorization", "PVEAPIToken="+c.token) + req.Header.Set("Accept", "application/json") + if contentType != "" { + req.Header.Set("Content-Type", contentType) + } + + resp, err := c.http.Do(req) + if err != nil { + return fmt.Errorf("proxmox: %s %s: %w", method, path, err) + } + defer resp.Body.Close() + + raw, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("proxmox: reading %s %s response: %w", method, path, err) + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return newAPIError(resp.StatusCode, method, path, string(raw)) + } + if out == nil { + return nil + } + var env struct { + Data json.RawMessage `json:"data"` + } + if err := json.Unmarshal(raw, &env); err != nil { + return fmt.Errorf("proxmox: decoding %s %s envelope: %w", method, path, err) + } + if len(env.Data) == 0 || bytes.Equal(env.Data, []byte("null")) { + return nil // no data (e.g. a sync PUT /config) + } + if err := json.Unmarshal(env.Data, out); err != nil { + return fmt.Errorf("proxmox: decoding %s %s data: %w", method, path, err) + } + return nil +} + +// dataString runs a request expecting the "data" field to be a bare string +// (the UPID returned by async mutating ops). Returns "" with no error when the +// response carries no data (some sync ops). 
+func (c *Client) dataString(ctx context.Context, method, path string, params url.Values) (string, error) { + var s string + if err := c.postForm(ctx, method, path, params, &s); err != nil { + return "", err + } + return s, nil +} diff --git a/internal/proxmox/client_test.go b/internal/proxmox/client_test.go new file mode 100644 index 0000000..98c45f6 --- /dev/null +++ b/internal/proxmox/client_test.go @@ -0,0 +1,102 @@ +package proxmox + +import ( + "context" + "errors" + "net/http" + "testing" +) + +func TestAPIError_403ExtractsPrivilege(t *testing.T) { + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + return jsonResp(403, `{"message":"Permission check failed (/nodes/demo-felhom, Sys.Audit)\n"}`), nil + }} + _, err := newTestClient(d).NodeStatus(context.Background()) + var ae *APIError + if !errors.As(err, &ae) { + t.Fatalf("want *APIError, got %T: %v", err, err) + } + if !ae.IsForbidden() { + t.Errorf("IsForbidden = false") + } + if ae.Privilege != "Sys.Audit" { + t.Errorf("privilege = %q, want Sys.Audit", ae.Privilege) + } + if ae.DeniedPath != "/nodes/demo-felhom" { + t.Errorf("denied path = %q", ae.DeniedPath) + } +} + +func TestDecode_ListLXC(t *testing.T) { + // Exact shape captured from the live host. 
+ body := `{"data":[{"cpu":0,"cpus":2,"disk":0,"maxdisk":10737418240,"maxmem":2147483648,"mem":0,"name":"spike-lxc","status":"stopped","type":"lxc","uptime":0,"vmid":9001}]}` + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { return jsonResp(200, body), nil }} + gs, err := newTestClient(d).ListLXC(context.Background()) + if err != nil { + t.Fatalf("ListLXC: %v", err) + } + if len(gs) != 1 { + t.Fatalf("len = %d", len(gs)) + } + g := gs[0] + if g.VMID != 9001 || g.Name != "spike-lxc" || g.Status != "stopped" || g.CPUs != 2 { + t.Errorf("decoded guest wrong: %+v", g) + } +} + +func TestDecode_NodeStatus(t *testing.T) { + body := `{"data":{"cpu":0.0057,"uptime":73078,"loadavg":["0.11","0.09","0.05"],"pveversion":"pve-manager/9.2.2","memory":{"total":16537989120,"used":2043027456,"free":13587857408,"available":14494961664},"rootfs":{"total":100861726720,"used":4943888384,"free":95917838336,"avail":90747101184},"cpuinfo":{"cores":4,"cpus":4,"sockets":1,"model":"Intel(R) N100"}}}` + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { return jsonResp(200, body), nil }} + s, err := newTestClient(d).NodeStatus(context.Background()) + if err != nil { + t.Fatalf("NodeStatus: %v", err) + } + if len(s.LoadAvg) != 3 || s.LoadAvg[0] != "0.11" { + t.Errorf("loadavg = %v", s.LoadAvg) + } + if s.Memory.Total != 16537989120 || s.CPUInfo.Cores != 4 { + t.Errorf("decoded node status wrong: %+v", s) + } +} + +func TestDecode_GuestConfig_FeaturesAndExtra(t *testing.T) { + // keyctl must survive as a string; mpN/netN land in Extra. 
+ body := `{"data":{"arch":"amd64","cores":2,"features":"nesting=1,keyctl=1","hostname":"spike-lxc","memory":2048,"net0":"name=eth0,bridge=vmbr0,hwaddr=BC:24:11:D1:6D:CB,ip=dhcp,type=veth","rootfs":"local-lvm:vm-9001-disk-0,size=10G","unprivileged":1,"mp0":"local-lvm:1,mp=/mnt/bulk,backup=0"}}` + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { return jsonResp(200, body), nil }} + cfg, err := newTestClient(d).GuestConfig(context.Background(), 9001) + if err != nil { + t.Fatalf("GuestConfig: %v", err) + } + if cfg.Features != "nesting=1,keyctl=1" { + t.Errorf("features = %q", cfg.Features) + } + if cfg.Unprivileged != 1 { + t.Errorf("unprivileged = %d", cfg.Unprivileged) + } + if mp := cfg.MountPoints(); mp["mp0"] != "local-lvm:1,mp=/mnt/bulk,backup=0" { + t.Errorf("mountpoints = %v", mp) + } + if nets := cfg.Nets(); nets["net0"] == "" { + t.Errorf("nets = %v", nets) + } + // "memory" must NOT be misread as an mp/net prefix match. + if mp := cfg.MountPoints(); len(mp) != 1 { + t.Errorf("expected exactly 1 mountpoint, got %v", mp) + } +} + +func TestDataString_ReturnsUPID(t *testing.T) { + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + if r.Method != http.MethodPost { + t.Errorf("method = %s", r.Method) + } + return jsonResp(200, `{"data":"`+testUPID+`"}`), nil + }} + upid, err := newTestClient(d).Snapshot(context.Background(), 9001, "s1", "") + if err != nil { + t.Fatalf("Snapshot: %v", err) + } + if upid != testUPID { + t.Errorf("upid = %q", upid) + } +} diff --git a/internal/proxmox/doc.go b/internal/proxmox/doc.go new file mode 100644 index 0000000..6d82043 --- /dev/null +++ b/internal/proxmox/doc.go @@ -0,0 +1,62 @@ +// Package proxmox is the typed interaction layer the host agent uses to talk to +// a single Proxmox VE host. Every other agent module calls this package; it owns +// the API-first + fenced-root-CLI model the spikes proved +// (felhom.eu/documentation/proxmox-platform.md and tests/phase{0,1-2,3}-findings.md). 
+// +// # Two backends, one routing policy +// +// The package has two independent backends. Which path an operation takes is a +// fixed policy, not a per-call choice: +// +// - Client (API backend) — the default for everything the scoped FelhomAgent +// token can do. A hand-rolled REST client over https://HOST:8006/api2/json, +// auth header "Authorization: PVEAPIToken=USER@REALM!TOKENID=SECRET". Every +// mutating call is async: it returns a UPID and the caller polls the task with +// WaitTask until it stops, then asserts exitstatus == "OK". Authorization can +// surface at task execution, not the HTTP POST (phase1-2 §1.3) — so the POST's +// 200 is never trusted. +// +// - Privileged (root-CLI backend) — fenced to the three proven exceptions ONLY: +// (a) keyctl `pct create` for golden-image builds, (b) USB mount-by-UUID / +// fstab, (c) SMART / sensors reads. Each method cites why it cannot be the API. +// +// Client never shells out and Privileged never makes an HTTP call: the fence is +// structural (separate types, separate dependencies), and asserted in +// routing_test.go. 
+// +// # API-vs-root routing table (phase3-findings.md §B3 boundary) +// +// Operation Backend Why +// ------------------------------------------------- ----------- ---------------------------------- +// node status / resources / metrics Client (API) Sys.Audit +// list guests + per-guest status/config Client (API) VM.Audit +// storage list + content Client (API) Datastore.Audit +// task status / log Client (API) task owner can read own task +// restore LXC from archive (PRIMARY create path) Client (API) VM.Allocate; restore preserves keyctl +// vzdump backup (stop/snapshot mode) Client (API) VM.Backup (stop-mode needs no PowerMgmt) +// snapshot / rollback / delete-snapshot Client (API) VM.Snapshot / VM.Snapshot.Rollback +// set config (mem/cpu/net/options/mountpoint) Client (API) VM.Config.* +// start / stop guest Client (API) VM.PowerMgmt +// ------------------------------------------------- ----------- ---------------------------------- +// golden-image `pct create` with keyctl=1 Privileged keyctl is root@pam-only; no token qualifies +// USB mount-by-UUID / systemd mount unit / fstab Privileged host-level mount, not a Proxmox API op +// SMART / hardware sensors Privileged not API-exposed +// +// # Grounding notes for later slices (do not act on these here) +// +// - Provision-by-restore is the primary create path: a token-authorized restore +// preserves features=nesting=1,keyctl=1 (phase3 §B3); fresh `pct create` with +// keyctl is the only root-fenced create. +// - A Docker NAMED volume lives in the LXC rootfs (/var/lib/docker/volumes//_data) +// and is ALWAYS captured by vzdump. The backup= flag is honoured only for +// *volume* mount points; a bulk volume must be a dedicated backup=0 mountpoint or +// it is silently swept into the whole-guest image (phase3 §B2). +// - `pct restore` preserves the source MAC + hostname — reset network identity +// before starting alongside the original (phase1-2 §2.2). 
+// - An LXC has no guest agent, so snapshot-mode vzdump does NOT fsfreeze: an +// agent-initiated backup is crash-consistent only; app-consistency is the +// controller's job (quiesce, then POST /backup) (proxmox-platform.md §4.2). +// +// This slice (slice 1) wraps only the proven, read-tested op set. No reconcile +// loop, hub client, or signing — those are later slices. +package proxmox diff --git a/internal/proxmox/errors.go b/internal/proxmox/errors.go new file mode 100644 index 0000000..dd5ddce --- /dev/null +++ b/internal/proxmox/errors.go @@ -0,0 +1,81 @@ +package proxmox + +import ( + "fmt" + "regexp" +) + +// permRe extracts the offending privilege (and path) from a Proxmox permission +// message, e.g. "Permission check failed (/vms/9000, VM.Backup)" or +// "403 Permission check failed (/sdn/zones/localnetwork/vmbr0, SDN.Use)". +var permRe = regexp.MustCompile(`Permission check failed \(([^,]+),\s*([^)]+)\)`) + +// APIError is returned for a non-2xx HTTP response from the Proxmox API. On a 403 +// it parses the offending path + privilege so a role misconfiguration is +// diagnosable (the FelhomAgent role is exactly 16 privileges — see doc.go). +type APIError struct { + StatusCode int + Method string + Path string // request path + Body string // response body (trimmed) + // Populated from a permission-check message when present: + DeniedPath string // ACL path, e.g. "/vms/9000" + Privilege string // e.g. "VM.Backup" +} + +func (e *APIError) Error() string { + if e.Privilege != "" { + return fmt.Sprintf("proxmox: %s %s -> HTTP %d: permission denied at %s (missing privilege %s)", + e.Method, e.Path, e.StatusCode, e.DeniedPath, e.Privilege) + } + return fmt.Sprintf("proxmox: %s %s -> HTTP %d: %s", e.Method, e.Path, e.StatusCode, e.Body) +} + +// IsForbidden reports whether this was an HTTP 403. +func (e *APIError) IsForbidden() bool { return e.StatusCode == 403 } + +// newAPIError builds an APIError, extracting privilege info from the body. 
+func newAPIError(statusCode int, method, path, body string) *APIError { + e := &APIError{StatusCode: statusCode, Method: method, Path: path, Body: trimBody(body)} + if m := permRe.FindStringSubmatch(body); m != nil { + e.DeniedPath = m[1] + e.Privilege = m[2] + } + return e +} + +// TaskError is returned by WaitTask when a task stops with a non-OK exitstatus. +// The authorization failure for a mutating op surfaces here (in the task +// exitstatus), not at the HTTP POST — so callers must always WaitTask. +type TaskError struct { + UPID string + ExitStatus string // e.g. "403 Permission check failed (/vms/9000, VM.Backup)" + LogTail []string // last lines of the task log, for diagnosis + DeniedPath string + Privilege string +} + +func (e *TaskError) Error() string { + if e.Privilege != "" { + return fmt.Sprintf("proxmox: task %s failed: permission denied at %s (missing privilege %s)", + e.UPID, e.DeniedPath, e.Privilege) + } + return fmt.Sprintf("proxmox: task %s failed: exitstatus %q", e.UPID, e.ExitStatus) +} + +func newTaskError(upid, exitStatus string, logTail []string) *TaskError { + e := &TaskError{UPID: upid, ExitStatus: exitStatus, LogTail: logTail} + if m := permRe.FindStringSubmatch(exitStatus); m != nil { + e.DeniedPath = m[1] + e.Privilege = m[2] + } + return e +} + +func trimBody(s string) string { + const max = 512 + if len(s) > max { + return s[:max] + "…" + } + return s +} diff --git a/internal/proxmox/mock_test.go b/internal/proxmox/mock_test.go new file mode 100644 index 0000000..0bb8665 --- /dev/null +++ b/internal/proxmox/mock_test.go @@ -0,0 +1,50 @@ +package proxmox + +import ( + "context" + "io" + "net/http" + "strings" +) + +// mockDoer is an injectable HTTP transport for the API client. It records call +// count and routes each request to fn. 
+type mockDoer struct { + calls int + fn func(*http.Request) (*http.Response, error) +} + +func (m *mockDoer) Do(r *http.Request) (*http.Response, error) { + m.calls++ + return m.fn(r) +} + +// jsonResp builds an HTTP response with a JSON body. +func jsonResp(code int, body string) *http.Response { + return &http.Response{ + StatusCode: code, + Body: io.NopCloser(strings.NewReader(body)), + Header: http.Header{"Content-Type": []string{"application/json"}}, + } +} + +// newTestClient wraps a mockDoer in a Client (bypassing NewClient's real transport). +func newTestClient(d doer) *Client { + return &Client{base: "https://host:8006/api2/json", node: "demo-felhom", token: "u@pve!t=secret", http: d} +} + +// mockRunner records privileged command invocations and returns canned output. +type mockRunner struct { + calls int + lastCmd string + lastArg []string + out []byte + err error +} + +func (m *mockRunner) Run(_ context.Context, name string, args ...string) ([]byte, []byte, error) { + m.calls++ + m.lastCmd = name + m.lastArg = args + return m.out, nil, m.err +} diff --git a/internal/proxmox/mutate.go b/internal/proxmox/mutate.go new file mode 100644 index 0000000..c40c441 --- /dev/null +++ b/internal/proxmox/mutate.go @@ -0,0 +1,148 @@ +package proxmox + +import ( + "context" + "fmt" + "net/http" + "net/url" + "strconv" +) + +// Async mutating operations. Each is API-token-covered (the FelhomAgent role) and +// returns a UPID string; the caller MUST WaitTask on it and assert exitstatus OK. +// The HTTP 200 here is not proof of success (phase1-2 §1.3). + +// BackupMode is the vzdump mode. +type BackupMode string + +const ( + // ModeStop: orderly guest shutdown -> backup -> restart. Highest consistency. + // For LXC the shutdown/restart is internal to vzdump and needs only VM.Backup + // (NOT VM.PowerMgmt) — phase1-2 §1.4. 
+ ModeStop BackupMode = "stop" + // ModeSnapshot: lowest downtime; for an LXC this is crash-consistent only (no + // fsfreeze) — app-consistency is the controller's job (proxmox-platform.md §4.2). + ModeSnapshot BackupMode = "snapshot" +) + +// RestoreLXCOptions parameterizes a restore. This is the PRIMARY create path: +// a token-authorized restore preserves features=nesting=1,keyctl=1 from the +// archive, so it needs no root (phase3 §B3). Fresh `pct create` with keyctl is +// the only root-fenced create (see Privileged.CreateGoldenLXC). +type RestoreLXCOptions struct { + VMID int // target VMID (fresh id) + Archive string // source archive volid, e.g. "local:backup/vzdump-lxc-9001-...tar.zst" + Storage string // target storage for the rootfs, e.g. "local-lvm" + Force bool // overwrite an existing VMID (destructive — caller must have authority) +} + +// RestoreLXC restores an LXC from a vzdump/PBS archive via POST /nodes/{node}/lxc +// (restore=1). Returns the UPID. NOTE: pct restore preserves the source MAC + +// hostname — reset network identity before starting alongside the original +// (phase1-2 §2.2). Identity reset is a SetConfig call the caller makes after. +func (c *Client) RestoreLXC(ctx context.Context, opts RestoreLXCOptions) (string, error) { + if opts.VMID == 0 || opts.Archive == "" || opts.Storage == "" { + return "", fmt.Errorf("proxmox: RestoreLXC needs vmid, archive and storage") + } + v := url.Values{} + v.Set("vmid", strconv.Itoa(opts.VMID)) + v.Set("ostemplate", opts.Archive) // pct restore source + v.Set("restore", "1") + v.Set("storage", opts.Storage) + if opts.Force { + v.Set("force", "1") + } + return c.dataString(ctx, http.MethodPost, "/nodes/"+c.node+"/lxc", v) +} + +// VzdumpOptions parameterizes a backup. +type VzdumpOptions struct { + VMID int + Storage string // a storage whose content includes "backup" (e.g. 
"local") — NOT local-lvm + Mode BackupMode // ModeStop | ModeSnapshot + Compress string // "zstd" (default), "lzo", "gzip", or "" for none +} + +// Vzdump starts a backup via POST /nodes/{node}/vzdump. Returns the UPID. An +// agent-initiated vzdump is crash-consistent only for an LXC (no fsfreeze). +func (c *Client) Vzdump(ctx context.Context, opts VzdumpOptions) (string, error) { + if opts.VMID == 0 || opts.Storage == "" || opts.Mode == "" { + return "", fmt.Errorf("proxmox: Vzdump needs vmid, storage and mode") + } + v := url.Values{} + v.Set("vmid", strconv.Itoa(opts.VMID)) + v.Set("storage", opts.Storage) + v.Set("mode", string(opts.Mode)) + if opts.Compress == "" { + opts.Compress = "zstd" + } + v.Set("compress", opts.Compress) + return c.dataString(ctx, http.MethodPost, "/nodes/"+c.node+"/vzdump", v) +} + +// Snapshot creates an LXC snapshot via POST /nodes/{node}/lxc/{vmid}/snapshot. +// A running, unprivileged LXC can be snapshotted on LVM-thin with no stop +// (phase1-2 §1.6) — this is the snapshot-before-change primitive. +func (c *Client) Snapshot(ctx context.Context, vmid int, snapname, description string) (string, error) { + if vmid == 0 || snapname == "" { + return "", fmt.Errorf("proxmox: Snapshot needs vmid and snapname") + } + v := url.Values{} + v.Set("snapname", snapname) + if description != "" { + v.Set("description", description) + } + path := fmt.Sprintf("/nodes/%s/lxc/%d/snapshot", c.node, vmid) + return c.dataString(ctx, http.MethodPost, path, v) +} + +// Rollback rolls an LXC back to a snapshot via +// POST /nodes/{node}/lxc/{vmid}/snapshot/{snap}/rollback. 
+func (c *Client) Rollback(ctx context.Context, vmid int, snapname string) (string, error) { + if vmid == 0 || snapname == "" { + return "", fmt.Errorf("proxmox: Rollback needs vmid and snapname") + } + path := fmt.Sprintf("/nodes/%s/lxc/%d/snapshot/%s/rollback", c.node, vmid, url.PathEscape(snapname)) + return c.dataString(ctx, http.MethodPost, path, url.Values{}) +} + +// DeleteSnapshot removes an LXC snapshot via +// DELETE /nodes/{node}/lxc/{vmid}/snapshot/{snap}. +func (c *Client) DeleteSnapshot(ctx context.Context, vmid int, snapname string) (string, error) { + if vmid == 0 || snapname == "" { + return "", fmt.Errorf("proxmox: DeleteSnapshot needs vmid and snapname") + } + path := fmt.Sprintf("/nodes/%s/lxc/%d/snapshot/%s", c.node, vmid, url.PathEscape(snapname)) + return c.dataString(ctx, http.MethodDelete, path, nil) +} + +// SetConfig applies config changes via PUT /nodes/{node}/lxc/{vmid}/config +// (e.g. memory, cores, net0, mpN with a backup flag). PVE may apply this +// synchronously (no UPID) — the returned string is empty in that case, and "" is +// not an error. When a UPID is returned, WaitTask on it. +// +// Identity reset after a restore (phase1-2 §2.2) is a SetConfig with +// params{"net0": "name=eth0,bridge=vmbr0,ip=dhcp"} (regenerates the MAC). +func (c *Client) SetConfig(ctx context.Context, vmid int, params map[string]string) (string, error) { + if vmid == 0 || len(params) == 0 { + return "", fmt.Errorf("proxmox: SetConfig needs vmid and at least one param") + } + v := url.Values{} + for k, val := range params { + v.Set(k, val) + } + path := fmt.Sprintf("/nodes/%s/lxc/%d/config", c.node, vmid) + return c.dataString(ctx, http.MethodPut, path, v) +} + +// Start starts a guest via POST /nodes/{node}/lxc/{vmid}/status/start (VM.PowerMgmt). 
+func (c *Client) Start(ctx context.Context, vmid int) (string, error) { + path := fmt.Sprintf("/nodes/%s/lxc/%d/status/start", c.node, vmid) + return c.dataString(ctx, http.MethodPost, path, url.Values{}) +} + +// Stop stops a guest via POST /nodes/{node}/lxc/{vmid}/status/stop (VM.PowerMgmt). +func (c *Client) Stop(ctx context.Context, vmid int) (string, error) { + path := fmt.Sprintf("/nodes/%s/lxc/%d/status/stop", c.node, vmid) + return c.dataString(ctx, http.MethodPost, path, url.Values{}) +} diff --git a/internal/proxmox/privileged.go b/internal/proxmox/privileged.go new file mode 100644 index 0000000..c5bcf35 --- /dev/null +++ b/internal/proxmox/privileged.go @@ -0,0 +1,203 @@ +package proxmox + +import ( + "context" + "encoding/json" + "fmt" + "os/exec" + "strconv" +) + +// The Privileged backend is fenced to the THREE proven OS-root exceptions only +// (phase3 §B3 boundary, doc.go routing table): +// +// (a) keyctl `pct create` for golden-image builds, +// (b) USB mount-by-UUID / fstab, +// (c) SMART / sensors reads. +// +// It runs host commands through a Runner (direct exec or sudo). It makes NO HTTP +// call — the fence between API ops and root ops is structural: Client owns the +// API, Privileged owns the shell. routing_test.go asserts neither crosses over. +// +// Everything else — the entire guest lifecycle including restore — goes through +// the API Client. Do NOT add non-exception methods here. + +// Runner executes a host command and returns its stdout/stderr. *ExecRunner is the +// production implementation; tests inject a mock to assert which commands ran. +type Runner interface { + Run(ctx context.Context, name string, args ...string) (stdout, stderr []byte, err error) +} + +// RunnerMode selects how privileged commands are executed. +type RunnerMode string + +const ( + // RunnerDirect: exec the binary directly (agent already runs as root — not the + // recommended uid model, see README; useful in dev/CI). 
+ RunnerDirect RunnerMode = "direct" + // RunnerSudo: prefix with sudo (the intended model — agent runs as a non-root + // service user with a narrow sudoers allowlist, 03 §3/§12). + RunnerSudo RunnerMode = "sudo" +) + +// ExecRunner runs commands via os/exec, optionally through sudo. +type ExecRunner struct { + Mode RunnerMode + SudoPath string // defaults to "sudo" when Mode == RunnerSudo +} + +// Run implements Runner. +func (r *ExecRunner) Run(ctx context.Context, name string, args ...string) ([]byte, []byte, error) { + var cmd *exec.Cmd + if r.Mode == RunnerSudo { + sudo := r.SudoPath + if sudo == "" { + sudo = "sudo" + } + cmd = exec.CommandContext(ctx, sudo, append([]string{"-n", name}, args...)...) + } else { + cmd = exec.CommandContext(ctx, name, args...) + } + var stdout, stderr capBuf + cmd.Stdout = &stdout + cmd.Stderr = &stderr + err := cmd.Run() + return stdout.b, stderr.b, err +} + +// Privileged is the root-CLI backend. +type Privileged struct { + runner Runner + node string +} + +// NewPrivileged builds the fenced root backend. +func NewPrivileged(runner Runner, node string) *Privileged { + return &Privileged{runner: runner, node: node} +} + +// GoldenLXCSpec describes a golden-base CT to build fresh. +type GoldenLXCSpec struct { + VMID int + OSTemplate string // CT template volid, e.g. "local:vztmpl/debian-13-standard_..._amd64.tar.zst" + Storage string // rootfs storage, e.g. "local-lvm" + RootFSGB int + Cores int + MemoryMB int + Hostname string + // Features is forced to "nesting=1,keyctl=1" — keyctl is exactly why this is + // root-fenced. +} + +// CreateGoldenLXC builds a Docker-capable golden base CT with keyctl=1. +// +// WHY THIS CANNOT BE THE API: setting feature flags other than `nesting` on +// create is `root@pam`-only — `changing feature flags (except nesting) is only +// allowed for root@pam`. No API token qualifies, not even a non-privsep root@pam +// token (same 403). 
This is the ONLY root-fenced create; the per-customer path +// provisions by restore, which preserves keyctl with no root (phase3 §B3). +// +// This is a one-time/maintenance op at enrollment (03 §9), off the per-customer path. +func (p *Privileged) CreateGoldenLXC(ctx context.Context, spec GoldenLXCSpec) error { + if spec.VMID == 0 || spec.OSTemplate == "" || spec.Storage == "" { + return fmt.Errorf("proxmox: CreateGoldenLXC needs vmid, ostemplate and storage") + } + rootfs := spec.Storage + if spec.RootFSGB > 0 { + rootfs = fmt.Sprintf("%s:%d", spec.Storage, spec.RootFSGB) + } + args := []string{ + "create", strconv.Itoa(spec.VMID), spec.OSTemplate, + "--unprivileged", "1", + "--features", "nesting=1,keyctl=1", + "--rootfs", rootfs, + } + if spec.Cores > 0 { + args = append(args, "--cores", strconv.Itoa(spec.Cores)) + } + if spec.MemoryMB > 0 { + args = append(args, "--memory", strconv.Itoa(spec.MemoryMB)) + } + if spec.Hostname != "" { + args = append(args, "--hostname", spec.Hostname) + } + return p.run(ctx, "pct", args...) +} + +// MountUSBByUUID mounts a filesystem by UUID at target (creating the mountpoint). +// +// WHY THIS CANNOT BE THE API: a physical host mount is not a Proxmox API op; it is +// a host-level mount handled by OS root / a narrow sudoers entry (phase3 §B3). +// fstab persistence is a later-slice concern (03 §7 storage manifest). +func (p *Privileged) MountUSBByUUID(ctx context.Context, uuid, target string) error { + if uuid == "" || target == "" { + return fmt.Errorf("proxmox: MountUSBByUUID needs uuid and target") + } + if err := p.run(ctx, "mkdir", "-p", target); err != nil { + return err + } + return p.run(ctx, "mount", "UUID="+uuid, target) +} + +// SMART returns parsed `smartctl -a -j` JSON for a device. +// +// WHY THIS CANNOT BE THE API: disk SMART data is not exposed by the Proxmox API; +// it is read with OS root via smartctl (phase3 §B3). 
+func (p *Privileged) SMART(ctx context.Context, device string) (map[string]any, error) { + if device == "" { + return nil, fmt.Errorf("proxmox: SMART needs a device") + } + out, stderr, err := p.runner.Run(ctx, "smartctl", "-a", "-j", device) + if err != nil { + // smartctl uses nonzero exit codes as bitmask warnings even on success; + // trust parseable JSON output over the exit code. + if len(out) == 0 { + return nil, fmt.Errorf("proxmox: smartctl %s: %w: %s", device, err, stderr) + } + } + var m map[string]any + if err := json.Unmarshal(out, &m); err != nil { + return nil, fmt.Errorf("proxmox: parsing smartctl JSON: %w", err) + } + return m, nil +} + +// Sensors returns parsed `sensors -j` JSON (hardware temperatures/fans). +// +// WHY THIS CANNOT BE THE API: hardware sensors are not API-exposed (phase3 §B3). +func (p *Privileged) Sensors(ctx context.Context) (map[string]any, error) { + out, stderr, err := p.runner.Run(ctx, "sensors", "-j") + if err != nil && len(out) == 0 { + return nil, fmt.Errorf("proxmox: sensors: %w: %s", err, stderr) + } + var m map[string]any + if err := json.Unmarshal(out, &m); err != nil { + return nil, fmt.Errorf("proxmox: parsing sensors JSON: %w", err) + } + return m, nil +} + +// run executes a command and wraps a nonzero exit with its stderr. +func (p *Privileged) run(ctx context.Context, name string, args ...string) error { + _, stderr, err := p.runner.Run(ctx, name, args...) + if err != nil { + return fmt.Errorf("proxmox: %s %v: %w: %s", name, args, err, trimBody(string(stderr))) + } + return nil +} + +// capBuf is a tiny capped buffer so a runaway command can't blow memory. +type capBuf struct{ b []byte } + +func (c *capBuf) Write(p []byte) (int, error) { + const max = 1 << 20 // 1 MiB + if len(c.b) < max { + room := max - len(c.b) + if room >= len(p) { + c.b = append(c.b, p...) + } else { + c.b = append(c.b, p[:room]...) 
+ } + } + return len(p), nil // always report full consumption +} diff --git a/internal/proxmox/query.go b/internal/proxmox/query.go new file mode 100644 index 0000000..60aa082 --- /dev/null +++ b/internal/proxmox/query.go @@ -0,0 +1,78 @@ +package proxmox + +import ( + "context" + "fmt" + "net/url" +) + +// Read-only query operations. All API-backed (Datastore.Audit / VM.Audit / +// Sys.Audit). These are what `felhom-agent --selftest` exercises against a live +// host — they mutate nothing. + +// Version returns GET /version. +func (c *Client) Version(ctx context.Context) (Version, error) { + var v Version + return v, c.get(ctx, "/version", &v) +} + +// Nodes returns GET /nodes. Use this to confirm the node name and read each +// node's ssl_fingerprint (which is what to pin in TLSConfig). +func (c *Client) Nodes(ctx context.Context) ([]Node, error) { + var ns []Node + return ns, c.get(ctx, "/nodes", &ns) +} + +// NodeStatus returns GET /nodes/{node}/status (host metrics; needs Sys.Audit). +func (c *Client) NodeStatus(ctx context.Context) (NodeStatus, error) { + var s NodeStatus + return s, c.get(ctx, "/nodes/"+c.node+"/status", &s) +} + +// ListLXC returns GET /nodes/{node}/lxc (the guests on this node). +func (c *Client) ListLXC(ctx context.Context) ([]Guest, error) { + var gs []Guest + return gs, c.get(ctx, "/nodes/"+c.node+"/lxc", &gs) +} + +// GuestStatus returns GET /nodes/{node}/lxc/{vmid}/status/current. The API body +// has no vmid field (it is in the path), so it is set from the argument. +func (c *Client) GuestStatus(ctx context.Context, vmid int) (Guest, error) { + var g Guest + path := fmt.Sprintf("/nodes/%s/lxc/%d/status/current", c.node, vmid) + if err := c.get(ctx, path, &g); err != nil { + return Guest{}, err + } + g.VMID = vmid + return g, nil +} + +// GuestConfig returns GET /nodes/{node}/lxc/{vmid}/config. 
+func (c *Client) GuestConfig(ctx context.Context, vmid int) (GuestConfig, error) { + var cfg GuestConfig + path := fmt.Sprintf("/nodes/%s/lxc/%d/config", c.node, vmid) + return cfg, c.get(ctx, path, &cfg) +} + +// ListStorage returns GET /storage (cluster-wide storage definitions). +func (c *Client) ListStorage(ctx context.Context) ([]Storage, error) { + var ss []Storage + return ss, c.get(ctx, "/storage", &ss) +} + +// NodeStorage returns GET /nodes/{node}/storage (storage with live usage). +func (c *Client) NodeStorage(ctx context.Context) ([]Storage, error) { + var ss []Storage + return ss, c.get(ctx, "/nodes/"+c.node+"/storage", &ss) +} + +// StorageContent returns GET /nodes/{node}/storage/{store}/content (e.g. vzdump +// archives + CT templates available for a restore). +func (c *Client) StorageContent(ctx context.Context, store string) ([]StorageContent, error) { + var cs []StorageContent + path := fmt.Sprintf("/nodes/%s/storage/%s/content", c.node, url.PathEscape(store)) + return cs, c.get(ctx, path, &cs) +} + +// urlEscape escapes a path segment (a UPID contains ':' and '@'). +func urlEscape(s string) string { return url.PathEscape(s) } diff --git a/internal/proxmox/routing_test.go b/internal/proxmox/routing_test.go new file mode 100644 index 0000000..de6f167 --- /dev/null +++ b/internal/proxmox/routing_test.go @@ -0,0 +1,97 @@ +package proxmox + +import ( + "context" + "net/http" + "testing" +) + +// TestRouting_APIOpsNeverShellOut asserts the API path never invokes the +// privileged runner: API ops (read + mutating) go only through the HTTP doer. +func TestRouting_APIOpsNeverShellOut(t *testing.T) { + runner := &mockRunner{} + // If any API op tried to use a runner, it would have to be wired here — it + // cannot be, because Client has no runner field. We still assert structurally: + // run a batch of API ops with a recording doer and confirm the runner is idle. 
+ d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + // Generic OK responses sufficient for the calls below. + if r.Method == http.MethodGet { + return jsonResp(200, `{"data":[]}`), nil + } + return jsonResp(200, `{"data":"`+testUPID+`"}`), nil + }} + c := newTestClient(d) + ctx := context.Background() + + _, _ = c.Version(ctx) + _, _ = c.Nodes(ctx) + _, _ = c.ListLXC(ctx) + _, _ = c.NodeStorage(ctx) + _, _ = c.Snapshot(ctx, 9001, "s1", "") + _, _ = c.Rollback(ctx, 9001, "s1") + _, _ = c.Vzdump(ctx, VzdumpOptions{VMID: 9001, Storage: "local", Mode: ModeStop}) + _, _ = c.RestoreLXC(ctx, RestoreLXCOptions{VMID: 9100, Archive: "local:backup/a.tar.zst", Storage: "local-lvm"}) + _, _ = c.Start(ctx, 9001) + _, _ = c.Stop(ctx, 9001) + + if runner.calls != 0 { + t.Fatalf("API ops invoked the privileged runner %d time(s) — fence broken", runner.calls) + } + if d.calls == 0 { + t.Fatalf("expected API ops to use the HTTP doer") + } +} + +// TestRouting_PrivilegedOpsNeverHTTP asserts the fenced root path never makes an +// HTTP call: Privileged ops go only through the runner. +func TestRouting_PrivilegedOpsNeverHTTP(t *testing.T) { + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + t.Fatalf("privileged op made an HTTP call to %s — fence broken", r.URL) + return nil, nil + }} + _ = d // a Privileged has no doer field; this doer is unreachable by construction. 
+ + runner := &mockRunner{out: []byte(`{"ok":true}`)} + p := NewPrivileged(runner, "demo-felhom") + ctx := context.Background() + + if err := p.CreateGoldenLXC(ctx, GoldenLXCSpec{VMID: 9999, OSTemplate: "local:vztmpl/x.tar.zst", Storage: "local-lvm"}); err != nil { + t.Fatalf("CreateGoldenLXC: %v", err) + } + if err := p.MountUSBByUUID(ctx, "1234-ABCD", "/mnt/usb"); err != nil { + t.Fatalf("MountUSBByUUID: %v", err) + } + if _, err := p.SMART(ctx, "/dev/sda"); err != nil { + t.Fatalf("SMART: %v", err) + } + if _, err := p.Sensors(ctx); err != nil { + t.Fatalf("Sensors: %v", err) + } + if runner.calls == 0 { + t.Fatalf("expected privileged ops to use the runner") + } +} + +// TestPrivileged_CreateGoldenForcesKeyctl asserts the golden create always carries +// the keyctl feature flag (the whole reason it is root-fenced). +func TestPrivileged_CreateGoldenForcesKeyctl(t *testing.T) { + runner := &mockRunner{} + p := NewPrivileged(runner, "demo-felhom") + if err := p.CreateGoldenLXC(context.Background(), GoldenLXCSpec{ + VMID: 9999, OSTemplate: "local:vztmpl/x.tar.zst", Storage: "local-lvm", RootFSGB: 8, + }); err != nil { + t.Fatalf("CreateGoldenLXC: %v", err) + } + if runner.lastCmd != "pct" { + t.Errorf("cmd = %q, want pct", runner.lastCmd) + } + var sawFeatures bool + for i, a := range runner.lastArg { + if a == "--features" && i+1 < len(runner.lastArg) && runner.lastArg[i+1] == "nesting=1,keyctl=1" { + sawFeatures = true + } + } + if !sawFeatures { + t.Errorf("pct create args missing keyctl features: %v", runner.lastArg) + } +} diff --git a/internal/proxmox/task.go b/internal/proxmox/task.go new file mode 100644 index 0000000..abf91ec --- /dev/null +++ b/internal/proxmox/task.go @@ -0,0 +1,141 @@ +package proxmox + +import ( + "context" + "fmt" + "time" +) + +// TaskStatus is GET /nodes/{node}/tasks/{upid}/status. 
// TaskStatus is the decoded body of GET /nodes/{node}/tasks/{upid}/status.
//
// While the task runs, Status == "running" and ExitStatus is empty; once it
// stops, Status == "stopped" and ExitStatus is "OK" or an error string (e.g. a
// 403 permission message).
type TaskStatus struct {
	UPID       string `json:"upid"`
	ID         string `json:"id"`
	Node       string `json:"node"`
	Type       string `json:"type"`
	User       string `json:"user"`
	Status     string `json:"status"`     // "running" | "stopped"
	ExitStatus string `json:"exitstatus"` // present once stopped
	PID        int64  `json:"pid"`
	StartTime  int64  `json:"starttime"`
}

// Running reports whether the task is still executing.
func (t TaskStatus) Running() bool { return t.Status == "running" }

// OK reports whether the task stopped successfully, i.e. it is stopped AND its
// exit status is exactly "OK".
func (t TaskStatus) OK() bool { return t.Status == "stopped" && t.ExitStatus == "OK" }

// taskLogLine is one entry of GET /nodes/{node}/tasks/{upid}/log:
// {"n":N,"t":"..."} — N is the line number, T the line text.
type taskLogLine struct {
	N int    `json:"n"`
	T string `json:"t"`
}

// WaitOptions tunes WaitTask polling. The zero value yields sane defaults.
type WaitOptions struct {
	// Interval is the first poll gap (default 1s).
	Interval time.Duration
	// MaxInterval caps the backed-off gap (default 5s). It is never allowed to
	// be smaller than Interval.
	MaxInterval time.Duration
	// Timeout bounds the whole wait (default 10m). Restore/vzdump can be slow;
	// callers may raise it. A zero/elapsed context deadline also stops the wait.
	Timeout time.Duration
}

// withDefaults returns a copy of o with every non-positive field replaced by its
// default, and with MaxInterval raised to Interval when it would otherwise be
// smaller.
func (o WaitOptions) withDefaults() WaitOptions {
	if o.Interval <= 0 {
		o.Interval = 1 * time.Second
	}
	if o.MaxInterval <= 0 {
		o.MaxInterval = 5 * time.Second
	}
	// A cap below the starting gap is contradictory: a caller who asks for a 10s
	// Interval and leaves MaxInterval unset must not be silently polled at the
	// 5s default. The explicit Interval wins.
	if o.MaxInterval < o.Interval {
		o.MaxInterval = o.Interval
	}
	if o.Timeout <= 0 {
		o.Timeout = 10 * time.Minute
	}
	return o
}
+func (c *Client) TaskStatusOnce(ctx context.Context, upid string) (TaskStatus, error) { + u, err := ParseUPID(upid) + if err != nil { + return TaskStatus{}, err + } + var st TaskStatus + path := fmt.Sprintf("/nodes/%s/tasks/%s/status", u.Node, urlEscape(upid)) + if err := c.get(ctx, path, &st); err != nil { + return TaskStatus{}, err + } + return st, nil +} + +// TaskLogTail fetches up to limit trailing log lines for a task (for diagnosis). +func (c *Client) TaskLogTail(ctx context.Context, upid string, limit int) ([]string, error) { + u, err := ParseUPID(upid) + if err != nil { + return nil, err + } + if limit <= 0 { + limit = 20 + } + var lines []taskLogLine + path := fmt.Sprintf("/nodes/%s/tasks/%s/log?limit=%d", u.Node, urlEscape(upid), limit) + if err := c.get(ctx, path, &lines); err != nil { + return nil, err + } + out := make([]string, 0, len(lines)) + for _, l := range lines { + out = append(out, l.T) + } + return out, nil +} + +// WaitTask polls a task until it stops, then asserts exitstatus == "OK". On any +// non-OK exit it returns a *TaskError carrying the exitstatus, the parsed +// privilege (if it was a permission failure), and a tail of the task log. +// +// This is the contract for EVERY mutating op: the POST's HTTP 200 is not proof of +// success — authorization can fail at task execution (phase1-2 §1.3). 
+func (c *Client) WaitTask(ctx context.Context, upid string, opts WaitOptions) (TaskStatus, error) { + opts = opts.withDefaults() + if _, err := ParseUPID(upid); err != nil { + return TaskStatus{}, err + } + + ctx, cancel := context.WithTimeout(ctx, opts.Timeout) + defer cancel() + + interval := opts.Interval + timer := time.NewTimer(0) // first poll immediately + defer timer.Stop() + + for { + select { + case <-ctx.Done(): + return TaskStatus{}, fmt.Errorf("proxmox: waiting for task %s: %w", upid, ctx.Err()) + case <-timer.C: + } + + st, err := c.TaskStatusOnce(ctx, upid) + if err != nil { + return TaskStatus{}, err + } + if st.Running() || st.Status == "" { + // back off, capped + interval *= 2 + if interval > opts.MaxInterval { + interval = opts.MaxInterval + } + timer.Reset(interval) + continue + } + // stopped + if st.ExitStatus == "OK" { + return st, nil + } + tail, _ := c.TaskLogTail(ctx, upid, 20) // best-effort + return st, newTaskError(upid, st.ExitStatus, tail) + } +} diff --git a/internal/proxmox/task_test.go b/internal/proxmox/task_test.go new file mode 100644 index 0000000..40fb6e3 --- /dev/null +++ b/internal/proxmox/task_test.go @@ -0,0 +1,81 @@ +package proxmox + +import ( + "context" + "errors" + "net/http" + "strings" + "testing" + "time" +) + +const testUPID = "UPID:demo-felhom:00026454:004E3431:6A265E53:vzsnapshot:9001:root@pam:" + +// fastWait keeps tests quick. 
+var fastWait = WaitOptions{Interval: time.Millisecond, MaxInterval: 2 * time.Millisecond, Timeout: time.Second} + +func TestWaitTask_RunningThenOK(t *testing.T) { + var n int + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + n++ + if n == 1 { + return jsonResp(200, `{"data":{"upid":"`+testUPID+`","status":"running"}}`), nil + } + return jsonResp(200, `{"data":{"upid":"`+testUPID+`","status":"stopped","exitstatus":"OK"}}`), nil + }} + st, err := newTestClient(d).WaitTask(context.Background(), testUPID, fastWait) + if err != nil { + t.Fatalf("WaitTask: %v", err) + } + if !st.OK() { + t.Errorf("status not OK: %+v", st) + } +} + +func TestWaitTask_FailedSurfacesPrivilege(t *testing.T) { + // vzdump against an unauthorized vmid: 200+UPID, then the 403 in exitstatus. + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + if strings.Contains(r.URL.Path, "/log") { + return jsonResp(200, `{"data":[{"n":1,"t":"TASK ERROR: 403 Permission check failed (/vms/9000, VM.Backup)"}]}`), nil + } + return jsonResp(200, `{"data":{"upid":"`+testUPID+`","status":"stopped","exitstatus":"403 Permission check failed (/vms/9000, VM.Backup)"}}`), nil + }} + _, err := newTestClient(d).WaitTask(context.Background(), testUPID, fastWait) + var te *TaskError + if !errors.As(err, &te) { + t.Fatalf("want *TaskError, got %T: %v", err, err) + } + if te.Privilege != "VM.Backup" { + t.Errorf("privilege = %q, want VM.Backup", te.Privilege) + } + if te.DeniedPath != "/vms/9000" { + t.Errorf("denied path = %q", te.DeniedPath) + } + if len(te.LogTail) == 0 { + t.Errorf("expected a log tail") + } +} + +func TestWaitTask_Timeout(t *testing.T) { + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + return jsonResp(200, `{"data":{"upid":"`+testUPID+`","status":"running"}}`), nil + }} + opts := WaitOptions{Interval: time.Millisecond, MaxInterval: time.Millisecond, Timeout: 30 * time.Millisecond} + _, err := newTestClient(d).WaitTask(context.Background(), 
testUPID, opts) + if err == nil || !errors.Is(err, context.DeadlineExceeded) { + t.Fatalf("want deadline-exceeded, got %v", err) + } +} + +func TestWaitTask_CtxCancel(t *testing.T) { + d := &mockDoer{fn: func(r *http.Request) (*http.Response, error) { + return jsonResp(200, `{"data":{"upid":"`+testUPID+`","status":"running"}}`), nil + }} + ctx, cancel := context.WithCancel(context.Background()) + go func() { time.Sleep(20 * time.Millisecond); cancel() }() + opts := WaitOptions{Interval: time.Millisecond, MaxInterval: time.Millisecond, Timeout: time.Minute} + _, err := newTestClient(d).WaitTask(ctx, testUPID, opts) + if err == nil || !errors.Is(err, context.Canceled) { + t.Fatalf("want canceled, got %v", err) + } +} diff --git a/internal/proxmox/tls.go b/internal/proxmox/tls.go new file mode 100644 index 0000000..b64ff1d --- /dev/null +++ b/internal/proxmox/tls.go @@ -0,0 +1,88 @@ +package proxmox + +import ( + "crypto/sha256" + "crypto/tls" + "crypto/x509" + "encoding/hex" + "fmt" + "os" + "strings" +) + +// TLSConfig describes how the client trusts the Proxmox host's certificate. The +// host serves a self-signed cert by default (proxmox-platform.md §3.1); we do NOT +// blanket-disable verification. Pick exactly one trust mechanism: +// +// - CAFile: path to a PEM bundle (the PVE CA / a real cert chain) — full verify. +// - Fingerprint: SHA-256 of the leaf cert (hex, colons optional). Verification is +// pinned to that exact cert — strong trust for a self-signed host without a CA. +// The /nodes API returns each node's ssl_fingerprint, which is what to pin. +// - InsecureSkipVerify: explicitly off by default. Only acceptable for a +// --selftest against 127.0.0.1; it is named honestly, not hidden behind a flag +// that sounds benign. +// +// If none is set, standard system verification applies (which will fail on a +// self-signed host — that is the safe default; the operator must pin). 
// TLSConfig describes how the client trusts the Proxmox host's certificate. The
// host serves a self-signed cert by default (proxmox-platform.md §3.1); we do NOT
// blanket-disable verification. Pick exactly one trust mechanism:
//
//   - CAFile: path to a PEM bundle (the PVE CA / a real cert chain) — full verify.
//   - Fingerprint: SHA-256 of the leaf cert (hex; colons and whitespace optional).
//     Verification is pinned to that exact cert — strong trust for a self-signed
//     host without a CA. The /nodes API returns each node's ssl_fingerprint,
//     which is what to pin.
//   - InsecureSkipVerify: explicitly off by default. Only acceptable for a
//     --selftest against 127.0.0.1; it is named honestly, not hidden behind a flag
//     that sounds benign.
//
// If none is set, standard system verification applies (which will fail on a
// self-signed host — that is the safe default; the operator must pin). Setting
// more than one is a configuration error and is rejected by build.
type TLSConfig struct {
	CAFile             string
	Fingerprint        string
	InsecureSkipVerify bool
}

// build turns the trust settings into a *tls.Config.
//
// It returns an error when more than one trust mechanism is set, when the
// fingerprint is not a valid SHA-256, or when the CA file cannot be read or
// holds no usable certificate.
func (t TLSConfig) build() (*tls.Config, error) {
	// Enforce "exactly one". Without this check a leftover InsecureSkipVerify
	// would silently win over a configured pin or CA and disable verification
	// while the config still looks pinned. Fail closed instead.
	set := 0
	if t.InsecureSkipVerify {
		set++
	}
	if t.Fingerprint != "" {
		set++
	}
	if t.CAFile != "" {
		set++
	}
	if set > 1 {
		return nil, fmt.Errorf("proxmox: TLS config sets %d trust mechanisms; pick exactly one of CAFile, Fingerprint, InsecureSkipVerify", set)
	}

	switch {
	case t.InsecureSkipVerify:
		// Caller opted in explicitly and by an honestly-named field.
		return &tls.Config{InsecureSkipVerify: true}, nil //nolint:gosec // documented, config-gated, off by default

	case t.Fingerprint != "":
		want, err := normalizeFingerprint(t.Fingerprint)
		if err != nil {
			return nil, err
		}
		// Pin to the leaf cert's SHA-256. We disable the default chain check (a
		// self-signed cert has no CA) but enforce an exact-cert match instead, so
		// this is pinning, not "skip verify".
		return &tls.Config{
			InsecureSkipVerify: true, //nolint:gosec // replaced by the pin check below
			VerifyPeerCertificate: func(rawCerts [][]byte, _ [][]*x509.Certificate) error {
				if len(rawCerts) == 0 {
					return fmt.Errorf("proxmox: TLS pin: peer presented no certificate")
				}
				// rawCerts[0] is the leaf; want is lowercase hex, as is the
				// output of EncodeToString.
				got := sha256.Sum256(rawCerts[0])
				if hex.EncodeToString(got[:]) != want {
					return fmt.Errorf("proxmox: TLS pin mismatch: server cert sha256 does not match configured fingerprint")
				}
				return nil
			},
		}, nil

	case t.CAFile != "":
		pem, err := os.ReadFile(t.CAFile)
		if err != nil {
			return nil, fmt.Errorf("proxmox: reading TLS CA file: %w", err)
		}
		pool := x509.NewCertPool()
		if !pool.AppendCertsFromPEM(pem) {
			return nil, fmt.Errorf("proxmox: TLS CA file %q contained no usable certificates", t.CAFile)
		}
		return &tls.Config{RootCAs: pool}, nil

	default:
		return &tls.Config{}, nil // system roots; safe default
	}
}

// normalizeFingerprint lowercases fp and strips colons and ALL whitespace
// (spaces, tabs, CR, LF, ...), validating that the result is a 64-char (32-byte)
// hex SHA-256. Stripping line endings matters for fingerprints pasted or read
// from a file, which routinely carry a trailing newline.
func normalizeFingerprint(fp string) (string, error) {
	// strings.Fields splits on every Unicode whitespace rune, so re-joining its
	// pieces removes whitespace wherever it occurs.
	compact := strings.Join(strings.Fields(fp), "")
	s := strings.ToLower(strings.ReplaceAll(compact, ":", ""))
	if len(s) != 64 {
		return "", fmt.Errorf("proxmox: fingerprint must be a SHA-256 (64 hex chars), got %d", len(s))
	}
	if _, err := hex.DecodeString(s); err != nil {
		return "", fmt.Errorf("proxmox: fingerprint is not valid hex: %w", err)
	}
	return s, nil
}
// Types mirror the exact JSON shapes captured from the live demo host
// (demo-felhom, PVE 9.2.2, 2026-06-08) via `pvesh get ... --output-format json`.
// Decoding ignores unknown fields, so we depend only on the fields we use.

// Version is GET /version.
type Version struct {
	Release string `json:"release"` // "9.2"
	RepoID  string `json:"repoid"`
	Version string `json:"version"` // "9.2.2"
}

// Node is one entry of GET /nodes.
type Node struct {
	Node           string  `json:"node"`   // node name, e.g. "demo-felhom"
	Status         string  `json:"status"` // "online"
	CPU            float64 `json:"cpu"`    // load fraction 0..1
	MaxCPU         int     `json:"maxcpu"`
	Mem            int64   `json:"mem"`
	MaxMem         int64   `json:"maxmem"`
	Disk           int64   `json:"disk"`
	MaxDisk        int64   `json:"maxdisk"`
	Uptime         int64   `json:"uptime"`
	SSLFingerprint string  `json:"ssl_fingerprint"`
}

// NodeStatus is GET /nodes/{node}/status (host metrics; needs Sys.Audit).
type NodeStatus struct {
	CPU        float64  `json:"cpu"` // load fraction 0..1
	Uptime     int64    `json:"uptime"`
	LoadAvg    []string `json:"loadavg"` // 1/5/15-min, as strings in the API
	PVEVersion string   `json:"pveversion"`
	KVersion   string   `json:"kversion"`
	Memory     struct {
		Total     int64 `json:"total"`
		Used      int64 `json:"used"`
		Free      int64 `json:"free"`
		Available int64 `json:"available"`
	} `json:"memory"`
	RootFS struct {
		Total int64 `json:"total"`
		Used  int64 `json:"used"`
		Free  int64 `json:"free"`
		Avail int64 `json:"avail"`
	} `json:"rootfs"`
	Swap struct {
		Total int64 `json:"total"`
		Used  int64 `json:"used"`
		Free  int64 `json:"free"`
	} `json:"swap"`
	CPUInfo struct {
		Cores   int    `json:"cores"`
		CPUs    int    `json:"cpus"`
		Sockets int    `json:"sockets"`
		Model   string `json:"model"`
	} `json:"cpuinfo"`
}

// Guest is one entry of GET /nodes/{node}/lxc and the body of
// GET /nodes/{node}/lxc/{vmid}/status/current. The status/current response has no
// vmid field (it is in the path), so callers set VMID from the request argument.
type Guest struct {
	VMID    int     `json:"vmid"`
	Name    string  `json:"name"`
	Status  string  `json:"status"` // "running" | "stopped"
	Type    string  `json:"type"`   // "lxc"
	CPUs    int     `json:"cpus"`
	CPU     float64 `json:"cpu"`
	Mem     int64   `json:"mem"`
	MaxMem  int64   `json:"maxmem"`
	Disk    int64   `json:"disk"`
	MaxDisk int64   `json:"maxdisk"`
	Uptime  int64   `json:"uptime"`
}

// GuestConfig is GET /nodes/{node}/lxc/{vmid}/config. The config surface is
// dynamic (net0..netN, mp0..mpN, unusedN), so known fields are typed and the full
// raw map is preserved in Extra for the dynamic ones.
type GuestConfig struct {
	Hostname     string `json:"hostname"`
	Arch         string `json:"arch"`
	Cores        int    `json:"cores"`
	Memory       int64  `json:"memory"`
	Swap         int64  `json:"swap"`
	OSType       string `json:"ostype"`
	RootFS       string `json:"rootfs"`
	Features     string `json:"features"`     // e.g. "nesting=1,keyctl=1"
	Unprivileged int    `json:"unprivileged"` // 1 if unprivileged
	Digest       string `json:"digest"`

	// Extra holds every field as raw JSON, including the dynamic netN/mpN/unusedN
	// keys not promoted above.
	Extra map[string]json.RawMessage `json:"-"`
}

// UnmarshalJSON fills both the typed known fields and the raw Extra map. The
// input is decoded twice: once into the typed fields, once into Extra.
func (g *GuestConfig) UnmarshalJSON(b []byte) error {
	type alias GuestConfig // same fields, no methods: avoids recursing into UnmarshalJSON
	var a alias
	if err := json.Unmarshal(b, &a); err != nil {
		return err
	}
	*g = GuestConfig(a)
	return json.Unmarshal(b, &g.Extra)
}

// MountPoints returns the mpN entries (e.g. "mp0" -> "local-lvm:1,mp=/mnt/mp1,backup=0")
// pulled from Extra. Relevant for later slices' bulk-volume placement.
func (g *GuestConfig) MountPoints() map[string]string {
	return g.prefixed("mp")
}

// Nets returns the netN entries from Extra.
func (g *GuestConfig) Nets() map[string]string {
	return g.prefixed("net")
}

// prefixed returns the Extra entries whose key is prefix followed by a decimal
// index (prefix "mp" matches "mp0" and "mp12", but not "mp", "memory" or
// "mp1x") and whose value is a JSON string. Entries with a non-string value are
// skipped. The result is never nil.
func (g *GuestConfig) prefixed(prefix string) map[string]string {
	out := map[string]string{}
	for k, raw := range g.Extra {
		if len(k) <= len(prefix) || k[:len(prefix)] != prefix {
			continue
		}
		// The whole suffix must be digits. Checking only its first character
		// would let a key such as "mp1x" pass as a mount point.
		if !isDecimalIndex(k[len(prefix):]) {
			continue
		}
		var s string
		if json.Unmarshal(raw, &s) == nil {
			out[k] = s
		}
	}
	return out
}

// isDecimalIndex reports whether s is non-empty and consists only of ASCII digits.
func isDecimalIndex(s string) bool {
	if s == "" {
		return false
	}
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}

// Storage is one entry of GET /storage (cluster) and GET /nodes/{node}/storage
// (the latter adds usage fields). Unused fields stay zero.
type Storage struct {
	Storage      string  `json:"storage"`
	Type         string  `json:"type"`    // "dir" | "lvmthin" | "nfs" | "cifs" | "pbs"
	Content      string  `json:"content"` // comma list, e.g. "vztmpl,backup,iso,import"
	Path         string  `json:"path,omitempty"`
	Total        int64   `json:"total,omitempty"`
	Used         int64   `json:"used,omitempty"`
	Avail        int64   `json:"avail,omitempty"`
	Active       int     `json:"active,omitempty"`
	Enabled      int     `json:"enabled,omitempty"`
	Shared       int     `json:"shared,omitempty"`
	UsedFraction float64 `json:"used_fraction,omitempty"`
}

// StorageContent is one entry of GET /nodes/{node}/storage/{store}/content
// (e.g. vzdump archives, CT templates, guest volumes).
type StorageContent struct {
	VolID   string `json:"volid"` // e.g. "local:backup/vzdump-lxc-9001-...tar.zst"
	Content string `json:"content"`
	Format  string `json:"format"`
	Size    int64  `json:"size"`
	CTime   int64  `json:"ctime"`
	VMID    int    `json:"vmid,omitempty"`
}
// UPID is a parsed Proxmox task identifier. Long operations (vzdump, restore,
// snapshot, ...) return a UPID rather than a result; the caller polls the task.
//
// Wire format (captured live, demo-felhom):
//
//	UPID:demo-felhom:00026454:004E3431:6A265E53:vzdestroy:9021:root@pam:
//	     |node       |pid-hex |pstart-hx|start-hex|worker  |id  |user    |(trailing)
type UPID struct {
	Raw       string
	Node      string
	PID       uint64 // decoded from hex
	PStart    uint64 // decoded from hex
	StartTime uint64 // decoded from hex (unix seconds)
	Worker    string // task type, e.g. "vzdump", "vzdestroy", "vzsnapshot"
	ID        string // worker target, e.g. the vmid as a string; may be empty
	User      string // e.g. "root@pam" or "felhom-agent@pve!agent"
}

// ParseUPID parses a Proxmox UPID string. The user field may contain '@' and '!'
// but never ':', so a plain colon-split is correct.
//
// It returns an error when s lacks the "UPID:" prefix, has too few fields, has a
// non-hex pid/pstart/starttime, or has an empty node, worker or user. The id
// field is allowed to be empty.
func ParseUPID(s string) (UPID, error) {
	if !strings.HasPrefix(s, "UPID:") {
		return UPID{}, fmt.Errorf("proxmox: not a UPID: %q", s)
	}
	// UPID:node:pid:pstart:starttime:worker:id:user: splits into 9 parts, the
	// last one empty. Only the first 8 are needed, so a missing trailing colon
	// is tolerated.
	parts := strings.Split(s, ":")
	if len(parts) < 8 {
		return UPID{}, fmt.Errorf("proxmox: malformed UPID (%d fields): %q", len(parts), s)
	}
	pid, err := strconv.ParseUint(parts[2], 16, 64)
	if err != nil {
		return UPID{}, fmt.Errorf("proxmox: bad UPID pid %q: %w", parts[2], err)
	}
	pstart, err := strconv.ParseUint(parts[3], 16, 64)
	if err != nil {
		return UPID{}, fmt.Errorf("proxmox: bad UPID pstart %q: %w", parts[3], err)
	}
	start, err := strconv.ParseUint(parts[4], 16, 64)
	if err != nil {
		return UPID{}, fmt.Errorf("proxmox: bad UPID starttime %q: %w", parts[4], err)
	}
	// Node is spliced into request paths (/nodes/{node}/tasks/...), so an empty
	// one must be caught here rather than surface as a confusing API error.
	// Worker and user are never empty in a real UPID either.
	if parts[1] == "" {
		return UPID{}, fmt.Errorf("proxmox: malformed UPID (empty node): %q", s)
	}
	if parts[5] == "" {
		return UPID{}, fmt.Errorf("proxmox: malformed UPID (empty worker): %q", s)
	}
	if parts[7] == "" {
		return UPID{}, fmt.Errorf("proxmox: malformed UPID (empty user): %q", s)
	}
	return UPID{
		Raw:       s,
		Node:      parts[1],
		PID:       pid,
		PStart:    pstart,
		StartTime: start,
		Worker:    parts[5],
		ID:        parts[6],
		User:      parts[7],
	}, nil
}

// String returns the original wire form.
func (u UPID) String() string { return u.Raw }
+ const raw = "UPID:demo-felhom:00026454:004E3431:6A265E53:vzdestroy:9021:root@pam:" + u, err := ParseUPID(raw) + if err != nil { + t.Fatalf("ParseUPID: %v", err) + } + if u.Node != "demo-felhom" { + t.Errorf("node = %q", u.Node) + } + if u.Worker != "vzdestroy" { + t.Errorf("worker = %q", u.Worker) + } + if u.ID != "9021" { + t.Errorf("id = %q", u.ID) + } + if u.User != "root@pam" { + t.Errorf("user = %q", u.User) + } + if u.PID != 0x00026454 { + t.Errorf("pid = %#x, want 0x26454", u.PID) + } + if u.StartTime != 0x6A265E53 { + t.Errorf("starttime = %#x", u.StartTime) + } + if u.String() != raw { + t.Errorf("String() round-trip = %q", u.String()) + } +} + +func TestParseUPID_PrivsepTokenUser(t *testing.T) { + // The user field can contain '@' and '!' (a privsep token) but never ':'. + const raw = "UPID:demo-felhom:00001234:00005678:6A265E53:vzdump:9001:felhom-agent@pve!agent:" + u, err := ParseUPID(raw) + if err != nil { + t.Fatalf("ParseUPID: %v", err) + } + if u.User != "felhom-agent@pve!agent" { + t.Errorf("user = %q", u.User) + } +} + +func TestParseUPID_Invalid(t *testing.T) { + cases := []string{ + "", + "not-a-upid", + "UPID:node:nothex:00:00:t:1:u:", // bad pid hex + "UPID:node:00:00", // too few fields + } + for _, c := range cases { + if _, err := ParseUPID(c); err == nil { + t.Errorf("ParseUPID(%q) = nil error, want error", c) + } + } +}