feat(agent): scaffold + proxmox interaction layer (slice 1)
Stand up the felhom-agent project (module gitea.dooplex.hu/admin/felhom-agent, binary felhom-agent) and the internal/proxmox package: the typed library every other agent module calls to talk to Proxmox. - API-first Client (hand-rolled REST over net/http, PVEAPIToken auth) with typed read ops (version/nodes/status/lxc/config/storage) and async mutating ops (restore/vzdump/snapshot/rollback/delete-snapshot/setconfig/start/stop), each returning a UPID. WaitTask polls task status until stopped and asserts exitstatus OK (authz can surface at task exec, not the POST — phase1-2 §1.3). - Fenced Privileged (root-CLI) backend for the THREE proven exceptions only (keyctl pct create, USB mount/fstab, SMART/sensors); each cites why it can't be the API. Fence is structural (Client never shells out, Privileged never HTTPs) and asserted in routing_test.go. - TLS: SHA-256 leaf-cert pinning or CA file; insecure mode explicit + off by default. No blanket verification disable. - 403 -> privilege-named APIError; failed task -> privilege-named TaskError. - JSON config + env overrides (token never logged); slog logging. - cmd/felhom-agent --selftest (read-only health report) + gated --selftest=task (reversible snapshot/rollback/delete exercise of WaitTask). No daemon loop yet. - Types grounded in the spike findings and exact JSON shapes captured live from demo-felhom (PVE 9.2.2). Unit tests use a mock transport + runner. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,141 @@
|
||||
package proxmox
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// TaskStatus mirrors the payload of GET /nodes/{node}/tasks/{upid}/status.
//
// A task that is still executing reports Status "running" with an empty
// ExitStatus. After it finishes, Status becomes "stopped" and ExitStatus holds
// either "OK" or an error string (for example a 403 permission message).
type TaskStatus struct {
	// Identity of the task as reported by the API.
	UPID string `json:"upid"`
	ID   string `json:"id"`
	Node string `json:"node"`
	Type string `json:"type"`
	User string `json:"user"`

	// Lifecycle state.
	Status     string `json:"status"`     // "running" | "stopped"
	ExitStatus string `json:"exitstatus"` // only populated after the task stopped

	// Process details, decoded from the "pid" and "starttime" keys.
	PID       int64 `json:"pid"`
	StartTime int64 `json:"starttime"`
}
|
||||
|
||||
// Running reports whether the task is still executing.
|
||||
func (t TaskStatus) Running() bool { return t.Status == "running" }
|
||||
|
||||
// OK reports whether the task stopped successfully.
|
||||
func (t TaskStatus) OK() bool { return t.Status == "stopped" && t.ExitStatus == "OK" }
|
||||
|
||||
// taskLogLine models a single element of the array returned by
// GET /nodes/{node}/tasks/{upid}/log, which has the shape {"n":N,"t":"..."}.
type taskLogLine struct {
	// N is decoded from the "n" key (presumably the line number — confirm
	// against the API).
	N int `json:"n"`
	// T is decoded from the "t" key and carries the log text.
	T string `json:"t"`
}
|
||||
|
||||
// WaitOptions controls how WaitTask polls for task completion. The zero value
// is usable: every unset (or non-positive) field is replaced by its default.
type WaitOptions struct {
	// Interval is the starting poll gap that the backoff grows from
	// (default 1s).
	Interval time.Duration

	// MaxInterval is the upper bound for the backed-off poll gap (default 5s).
	MaxInterval time.Duration

	// Timeout limits the total time spent waiting (default 10m). Restore and
	// vzdump tasks can be slow, so callers may want to raise it. Independently
	// of this value, cancellation or expiry of the caller's context also ends
	// the wait.
	Timeout time.Duration
}
|
||||
|
||||
func (o WaitOptions) withDefaults() WaitOptions {
|
||||
if o.Interval <= 0 {
|
||||
o.Interval = 1 * time.Second
|
||||
}
|
||||
if o.MaxInterval <= 0 {
|
||||
o.MaxInterval = 5 * time.Second
|
||||
}
|
||||
if o.Timeout <= 0 {
|
||||
o.Timeout = 10 * time.Minute
|
||||
}
|
||||
return o
|
||||
}
|
||||
|
||||
// TaskStatusOnce fetches the current task status (one HTTP call).
|
||||
func (c *Client) TaskStatusOnce(ctx context.Context, upid string) (TaskStatus, error) {
|
||||
u, err := ParseUPID(upid)
|
||||
if err != nil {
|
||||
return TaskStatus{}, err
|
||||
}
|
||||
var st TaskStatus
|
||||
path := fmt.Sprintf("/nodes/%s/tasks/%s/status", u.Node, urlEscape(upid))
|
||||
if err := c.get(ctx, path, &st); err != nil {
|
||||
return TaskStatus{}, err
|
||||
}
|
||||
return st, nil
|
||||
}
|
||||
|
||||
// TaskLogTail fetches up to limit trailing log lines for a task (for diagnosis).
|
||||
func (c *Client) TaskLogTail(ctx context.Context, upid string, limit int) ([]string, error) {
|
||||
u, err := ParseUPID(upid)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if limit <= 0 {
|
||||
limit = 20
|
||||
}
|
||||
var lines []taskLogLine
|
||||
path := fmt.Sprintf("/nodes/%s/tasks/%s/log?limit=%d", u.Node, urlEscape(upid), limit)
|
||||
if err := c.get(ctx, path, &lines); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := make([]string, 0, len(lines))
|
||||
for _, l := range lines {
|
||||
out = append(out, l.T)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// WaitTask polls a task until it stops, then asserts exitstatus == "OK". On any
|
||||
// non-OK exit it returns a *TaskError carrying the exitstatus, the parsed
|
||||
// privilege (if it was a permission failure), and a tail of the task log.
|
||||
//
|
||||
// This is the contract for EVERY mutating op: the POST's HTTP 200 is not proof of
|
||||
// success — authorization can fail at task execution (phase1-2 §1.3).
|
||||
func (c *Client) WaitTask(ctx context.Context, upid string, opts WaitOptions) (TaskStatus, error) {
|
||||
opts = opts.withDefaults()
|
||||
if _, err := ParseUPID(upid); err != nil {
|
||||
return TaskStatus{}, err
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, opts.Timeout)
|
||||
defer cancel()
|
||||
|
||||
interval := opts.Interval
|
||||
timer := time.NewTimer(0) // first poll immediately
|
||||
defer timer.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return TaskStatus{}, fmt.Errorf("proxmox: waiting for task %s: %w", upid, ctx.Err())
|
||||
case <-timer.C:
|
||||
}
|
||||
|
||||
st, err := c.TaskStatusOnce(ctx, upid)
|
||||
if err != nil {
|
||||
return TaskStatus{}, err
|
||||
}
|
||||
if st.Running() || st.Status == "" {
|
||||
// back off, capped
|
||||
interval *= 2
|
||||
if interval > opts.MaxInterval {
|
||||
interval = opts.MaxInterval
|
||||
}
|
||||
timer.Reset(interval)
|
||||
continue
|
||||
}
|
||||
// stopped
|
||||
if st.ExitStatus == "OK" {
|
||||
return st, nil
|
||||
}
|
||||
tail, _ := c.TaskLogTail(ctx, upid, 20) // best-effort
|
||||
return st, newTaskError(upid, st.ExitStatus, tail)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user