Files
admin a042316d6d feat(agent): scaffold + proxmox interaction layer (slice 1)
Stand up the felhom-agent project (module gitea.dooplex.hu/admin/felhom-agent,
binary felhom-agent) and the internal/proxmox package: the typed library every
other agent module calls to talk to Proxmox.

- API-first Client (hand-rolled REST over net/http, PVEAPIToken auth) with typed
  read ops (version/nodes/status/lxc/config/storage) and async mutating ops
  (restore/vzdump/snapshot/rollback/delete-snapshot/setconfig/start/stop), each
  returning a UPID. WaitTask polls task status until stopped and asserts
  exitstatus OK (authz can surface at task exec, not the POST — phase1-2 §1.3).
- Fenced Privileged (root-CLI) backend for the THREE proven exceptions only
  (keyctl pct create, USB mount/fstab, SMART/sensors); each cites why it can't be
  the API. Fence is structural (Client never shells out, Privileged never HTTPs)
  and asserted in routing_test.go.
- TLS: SHA-256 leaf-cert pinning or CA file; insecure mode explicit + off by
  default. No blanket verification disable.
- 403 -> privilege-named APIError; failed task -> privilege-named TaskError.
- JSON config + env overrides (token never logged); slog logging.
- cmd/felhom-agent --selftest (read-only health report) + gated --selftest=task
  (reversible snapshot/rollback/delete exercise of WaitTask). No daemon loop yet.
- Types grounded in the spike findings and exact JSON shapes captured live from
  demo-felhom (PVE 9.2.2). Unit tests use a mock transport + runner.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-08 14:34:32 +02:00

142 lines
4.0 KiB
Go

package proxmox
import (
"context"
"fmt"
"time"
)
// TaskStatus mirrors the payload of GET /nodes/{node}/tasks/{upid}/status.
//
// A task that is still executing reports Status "running" with an empty
// ExitStatus. After it finishes, Status becomes "stopped" and ExitStatus holds
// either "OK" or an error string (for example a 403 permission message).
type TaskStatus struct {
	UPID       string `json:"upid"`
	ID         string `json:"id"`
	Node       string `json:"node"`
	Type       string `json:"type"`
	User       string `json:"user"`
	Status     string `json:"status"`     // "running" | "stopped"
	ExitStatus string `json:"exitstatus"` // only populated after the task stopped
	PID        int64  `json:"pid"`
	StartTime  int64  `json:"starttime"`
}

// Running reports whether Status says the task is still executing.
func (t TaskStatus) Running() bool {
	switch t.Status {
	case "running":
		return true
	default:
		return false
	}
}

// OK reports whether the task has stopped AND its exit status is exactly "OK".
// Any other combination (still running, unknown status, error exit) is false.
func (t TaskStatus) OK() bool {
	if t.Status != "stopped" {
		return false
	}
	return t.ExitStatus == "OK"
}
// taskLogLine models a single element of the array returned by
// GET /nodes/{node}/tasks/{upid}/log, whose wire shape is {"n":N,"t":"..."}.
type taskLogLine struct {
	N int    `json:"n"` // decoded from "n" (presumably the line number; confirm against the API)
	T string `json:"t"` // decoded from "t"; the log text handed back to callers
}
// WaitOptions configures how WaitTask polls. Every field is optional: a zero
// (or negative) value is replaced by its default in withDefaults.
type WaitOptions struct {
	// Interval is the base poll gap the backoff starts from (default 1s).
	Interval time.Duration
	// MaxInterval is the upper bound for the backed-off gap (default 5s).
	MaxInterval time.Duration
	// Timeout limits the total duration of the wait (default 10m). Slow
	// operations such as restore or vzdump may need a larger value. The
	// caller's context can still end the wait earlier.
	Timeout time.Duration
}

// withDefaults returns a copy of o in which every non-positive field has been
// replaced by its default (1s / 5s / 10m); positive fields are kept as given.
func (o WaitOptions) withDefaults() WaitOptions {
	positiveOr := func(v, fallback time.Duration) time.Duration {
		if v > 0 {
			return v
		}
		return fallback
	}

	o.Interval = positiveOr(o.Interval, 1*time.Second)
	o.MaxInterval = positiveOr(o.MaxInterval, 5*time.Second)
	o.Timeout = positiveOr(o.Timeout, 10*time.Minute)
	return o
}
// TaskStatusOnce performs a single GET of the task's status endpoint and
// returns the decoded result; it never polls.
//
// The node segment of the request path is taken from the parsed UPID. An
// unparsable UPID or a failed request yields a zero TaskStatus and the error.
func (c *Client) TaskStatusOnce(ctx context.Context, upid string) (TaskStatus, error) {
	parsed, err := ParseUPID(upid)
	if err != nil {
		return TaskStatus{}, err
	}

	endpoint := fmt.Sprintf("/nodes/%s/tasks/%s/status", parsed.Node, urlEscape(upid))

	var status TaskStatus
	if err = c.get(ctx, endpoint, &status); err != nil {
		return TaskStatus{}, err
	}
	return status, nil
}
// TaskLogTail requests at most limit lines of a task's log (used for
// diagnosis) and returns just their text, in the order the API delivered them.
// A non-positive limit falls back to 20.
//
// An unparsable UPID or a failed request yields a nil slice and the error; a
// successful call always returns a non-nil (possibly empty) slice.
//
// NOTE(review): the request passes only limit, no start offset. Confirm that
// the endpoint then returns the trailing lines rather than the first ones;
// otherwise long logs would be cut off before the interesting part.
func (c *Client) TaskLogTail(ctx context.Context, upid string, limit int) ([]string, error) {
	parsed, err := ParseUPID(upid)
	if err != nil {
		return nil, err
	}
	if limit < 1 {
		limit = 20
	}

	endpoint := fmt.Sprintf("/nodes/%s/tasks/%s/log?limit=%d", parsed.Node, urlEscape(upid), limit)

	var entries []taskLogLine
	if err = c.get(ctx, endpoint, &entries); err != nil {
		return nil, err
	}

	// Keep only the text of each entry.
	texts := make([]string, len(entries))
	for i := range entries {
		texts[i] = entries[i].T
	}
	return texts, nil
}
// WaitTask polls a task until it stops, then asserts exitstatus == "OK". On any
// non-OK exit it returns the final status together with a *TaskError carrying
// the exitstatus, the parsed privilege (if it was a permission failure), and a
// tail of the task log.
//
// This is the contract for EVERY mutating op: the POST's HTTP 200 is not proof of
// success — authorization can fail at task execution (phase1-2 §1.3).
//
// Polling schedule: the first poll happens immediately; the gap before the
// second poll is opts.Interval, and each later gap doubles up to
// opts.MaxInterval. The whole wait is bounded by opts.Timeout and by ctx,
// whichever ends first; in that case a zero TaskStatus and an error wrapping
// ctx.Err() are returned. An invalid UPID or a failed status request aborts the
// wait with that error.
//
// NOTE(review): only the literal exit status "OK" counts as success. If the
// server can report a "finished with warnings" exit status, it is treated as a
// failure here — confirm that this is intended.
func (c *Client) WaitTask(ctx context.Context, upid string, opts WaitOptions) (TaskStatus, error) {
	opts = opts.withDefaults()
	if _, err := ParseUPID(upid); err != nil {
		return TaskStatus{}, err
	}

	ctx, cancel := context.WithTimeout(ctx, opts.Timeout)
	defer cancel()

	// gap is the delay before the NEXT poll. It is grown only after it has been
	// used, so the first real gap equals opts.Interval as WaitOptions documents
	// (doubling before the first Reset would silently start at 2*Interval).
	gap := opts.Interval
	timer := time.NewTimer(0) // first poll immediately
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return TaskStatus{}, fmt.Errorf("proxmox: waiting for task %s: %w", upid, ctx.Err())
		case <-timer.C:
		}

		st, err := c.TaskStatusOnce(ctx, upid)
		if err != nil {
			return TaskStatus{}, err
		}

		// An empty status is treated like "running": keep polling.
		if st.Running() || st.Status == "" {
			if gap > opts.MaxInterval {
				gap = opts.MaxInterval
			}
			// The timer has fired and its channel was drained by the select
			// above, so Reset is safe here.
			timer.Reset(gap)

			// Exponential backoff, capped; the comparison against half the cap
			// avoids overflowing the duration when doubling.
			if gap <= opts.MaxInterval/2 {
				gap *= 2
			} else {
				gap = opts.MaxInterval
			}
			continue
		}

		// The task has stopped.
		if st.ExitStatus == "OK" {
			return st, nil
		}

		// Best-effort: a failure to fetch the log must not mask the task error,
		// so the error from TaskLogTail is deliberately ignored.
		tail, _ := c.TaskLogTail(ctx, upid, 20)
		return st, newTaskError(upid, st.ExitStatus, tail)
	}
}