a042316d6d
Stand up the felhom-agent project (module gitea.dooplex.hu/admin/felhom-agent, binary felhom-agent) and the internal/proxmox package: the typed library every other agent module calls to talk to Proxmox. - API-first Client (hand-rolled REST over net/http, PVEAPIToken auth) with typed read ops (version/nodes/status/lxc/config/storage) and async mutating ops (restore/vzdump/snapshot/rollback/delete-snapshot/setconfig/start/stop), each returning a UPID. WaitTask polls task status until stopped and asserts exitstatus OK (authz can surface at task exec, not the POST — phase1-2 §1.3). - Fenced Privileged (root-CLI) backend for the THREE proven exceptions only (keyctl pct create, USB mount/fstab, SMART/sensors); each cites why it can't be the API. Fence is structural (Client never shells out, Privileged never HTTPs) and asserted in routing_test.go. - TLS: SHA-256 leaf-cert pinning or CA file; insecure mode explicit + off by default. No blanket verification disable. - 403 -> privilege-named APIError; failed task -> privilege-named TaskError. - JSON config + env overrides (token never logged); slog logging. - cmd/felhom-agent --selftest (read-only health report) + gated --selftest=task (reversible snapshot/rollback/delete exercise of WaitTask). No daemon loop yet. - Types grounded in the spike findings and exact JSON shapes captured live from demo-felhom (PVE 9.2.2). Unit tests use a mock transport + runner. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
142 lines
4.0 KiB
Go
142 lines
4.0 KiB
Go
package proxmox
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
)
|
|
|
|
// TaskStatus mirrors the payload of GET /nodes/{node}/tasks/{upid}/status.
//
// A task that is still executing reports Status "running" with an empty
// ExitStatus. After it finishes, Status becomes "stopped" and ExitStatus holds
// either "OK" or an error string (for example a 403 permission message).
type TaskStatus struct {
	UPID       string `json:"upid"`
	ID         string `json:"id"`
	Node       string `json:"node"`
	Type       string `json:"type"`
	User       string `json:"user"`
	Status     string `json:"status"`     // either "running" or "stopped"
	ExitStatus string `json:"exitstatus"` // only populated after the task stopped
	PID        int64  `json:"pid"`
	StartTime  int64  `json:"starttime"`
}

// Running reports whether the task is still executing, i.e. its Status is
// exactly "running".
func (t TaskStatus) Running() bool {
	return t.Status == "running"
}

// OK reports whether the task has stopped and finished with exit status "OK".
func (t TaskStatus) OK() bool {
	if t.Status != "stopped" {
		return false
	}
	return t.ExitStatus == "OK"
}
|
|
|
|
// taskLogLine mirrors a single element of the array returned by
// GET /nodes/{node}/tasks/{upid}/log, whose entries look like {"n":N,"t":"..."}.
type taskLogLine struct {
	N int    `json:"n"` // presumably the line number within the log — confirm against the API
	T string `json:"t"` // text of the log line
}
|
|
|
|
// WaitOptions tunes WaitTask polling. The zero value yields sane defaults
// (see withDefaults).
type WaitOptions struct {
	// Interval is the first poll gap (default 1s).
	Interval time.Duration
	// MaxInterval caps the backed-off gap (default 5s, or Interval when
	// Interval is larger than that and MaxInterval is left unset).
	MaxInterval time.Duration
	// Timeout bounds the whole wait (default 10m). Restore/vzdump can be slow;
	// callers may raise it. The wait also ends early if the caller's context is
	// cancelled or its deadline passes.
	Timeout time.Duration
}

// withDefaults returns a copy of o in which every non-positive field has been
// replaced by its default: Interval 1s, MaxInterval 5s, Timeout 10m.
//
// When MaxInterval is defaulted, it is raised to Interval if Interval is
// larger. Otherwise a caller who sets only Interval: 10s would have the 5s
// default cap silently shorten every poll gap below what was asked for. An
// explicitly set MaxInterval is never modified.
func (o WaitOptions) withDefaults() WaitOptions {
	if o.Interval <= 0 {
		o.Interval = 1 * time.Second
	}
	if o.MaxInterval <= 0 {
		o.MaxInterval = 5 * time.Second
		// The default cap must not undercut an explicitly larger Interval.
		if o.MaxInterval < o.Interval {
			o.MaxInterval = o.Interval
		}
	}
	if o.Timeout <= 0 {
		o.Timeout = 10 * time.Minute
	}
	return o
}
|
|
|
|
// TaskStatusOnce fetches the current task status (one HTTP call).
|
|
func (c *Client) TaskStatusOnce(ctx context.Context, upid string) (TaskStatus, error) {
|
|
u, err := ParseUPID(upid)
|
|
if err != nil {
|
|
return TaskStatus{}, err
|
|
}
|
|
var st TaskStatus
|
|
path := fmt.Sprintf("/nodes/%s/tasks/%s/status", u.Node, urlEscape(upid))
|
|
if err := c.get(ctx, path, &st); err != nil {
|
|
return TaskStatus{}, err
|
|
}
|
|
return st, nil
|
|
}
|
|
|
|
// TaskLogTail fetches up to limit trailing log lines for a task (for diagnosis).
|
|
func (c *Client) TaskLogTail(ctx context.Context, upid string, limit int) ([]string, error) {
|
|
u, err := ParseUPID(upid)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if limit <= 0 {
|
|
limit = 20
|
|
}
|
|
var lines []taskLogLine
|
|
path := fmt.Sprintf("/nodes/%s/tasks/%s/log?limit=%d", u.Node, urlEscape(upid), limit)
|
|
if err := c.get(ctx, path, &lines); err != nil {
|
|
return nil, err
|
|
}
|
|
out := make([]string, 0, len(lines))
|
|
for _, l := range lines {
|
|
out = append(out, l.T)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// WaitTask polls a task until it stops, then asserts exitstatus == "OK". On any
|
|
// non-OK exit it returns a *TaskError carrying the exitstatus, the parsed
|
|
// privilege (if it was a permission failure), and a tail of the task log.
|
|
//
|
|
// This is the contract for EVERY mutating op: the POST's HTTP 200 is not proof of
|
|
// success — authorization can fail at task execution (phase1-2 §1.3).
|
|
func (c *Client) WaitTask(ctx context.Context, upid string, opts WaitOptions) (TaskStatus, error) {
|
|
opts = opts.withDefaults()
|
|
if _, err := ParseUPID(upid); err != nil {
|
|
return TaskStatus{}, err
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, opts.Timeout)
|
|
defer cancel()
|
|
|
|
interval := opts.Interval
|
|
timer := time.NewTimer(0) // first poll immediately
|
|
defer timer.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return TaskStatus{}, fmt.Errorf("proxmox: waiting for task %s: %w", upid, ctx.Err())
|
|
case <-timer.C:
|
|
}
|
|
|
|
st, err := c.TaskStatusOnce(ctx, upid)
|
|
if err != nil {
|
|
return TaskStatus{}, err
|
|
}
|
|
if st.Running() || st.Status == "" {
|
|
// back off, capped
|
|
interval *= 2
|
|
if interval > opts.MaxInterval {
|
|
interval = opts.MaxInterval
|
|
}
|
|
timer.Reset(interval)
|
|
continue
|
|
}
|
|
// stopped
|
|
if st.ExitStatus == "OK" {
|
|
return st, nil
|
|
}
|
|
tail, _ := c.TaskLogTail(ctx, upid, 20) // best-effort
|
|
return st, newTaskError(upid, st.ExitStatus, tail)
|
|
}
|
|
}
|