package reconcile

import (
	"context"
	"time"
)

// Recover drains the journal's in-flight set at startup, applying resume-or-rollback
// to every op that was mid-execution when the agent crashed (doc 03 §10). It MUST run
// before the engine starts issuing new mutations.
//
// Why this matters for signed destructive ops (and why it ships with the gate): the
// idempotency-key store only dedupes a COMPLETED op. An op that crashed AFTER the
// Proxmox POST but BEFORE its terminal record (OpTaskRunning) falls outside that
// protection — its nonce is already consumed, so a redelivery is rejected as a replay,
// yet the op never reached a terminal state. This startup consumer is the only thing
// that can resolve it, by re-checking the Proxmox task and recording the real outcome.
//
// How each in-flight entry is resolved:
//   - Task id present (OpTaskRunning): the task status is read exactly once.
//     Stopped → the real terminal state is recorded (OK → succeeded, otherwise failed).
//     Still running → the entry stays in-flight (a later Recover or the task's own
//     completion resolves it). Unreadable → the entry stays in-flight, because no safe
//     decision can be made.
//   - Task id absent (OpStarted only): the Proxmox POST was never confirmed, so the op
//     never took effect and is recorded as failed (fail-safe, the documented
//     FileNonceStore direction). A convergent reconcile op is simply re-issued on the
//     next pass; a one-shot op did NOT mark its idempotency key applied, so it is not
//     falsely deduped.
//
// The returned RecoverResult counts how many entries fell into each outcome. A nil
// journal yields the zero RecoverResult without touching the API or the logger.
func (e *Engine) Recover(ctx context.Context) RecoverResult {
	var summary RecoverResult
	if e.journal == nil {
		return summary
	}

	for _, op := range e.journal.InFlight() {
		summary.Examined++

		// Without a task id the POST was never confirmed: abandon the op (fail-safe).
		if op.UPID == "" {
			e.append(terminal(op, OpFailed))
			summary.RolledBack++
			e.logger.Warn("recover: in-flight op had no task id; marked failed (fail-safe)",
				"op_id", op.OpID, "vmid", op.VMID, "kind", op.Kind)
			continue
		}

		// Cases are evaluated top to bottom, so the status value is only inspected
		// once the read is known to have succeeded.
		status, err := e.api.TaskStatusOnce(ctx, op.UPID)
		switch {
		case err != nil:
			summary.Unresolved++
			e.logger.Warn("recover: cannot read in-flight task status; left in-flight",
				"op_id", op.OpID, "upid", op.UPID, "err", err)

		case status.Running():
			summary.StillRunning++
			e.logger.Info("recover: in-flight task still running; left in-flight",
				"op_id", op.OpID, "upid", op.UPID)

		case status.OK():
			// Task stopped cleanly: record the real outcome.
			e.append(terminal(op, OpSucceeded))
			summary.Resumed++
			e.logger.Info("recover: in-flight task completed OK; marked succeeded",
				"op_id", op.OpID, "upid", op.UPID)

		default:
			// Task stopped with a non-OK exit status: record the real outcome.
			e.append(terminal(op, OpFailed))
			summary.Failed++
			e.logger.Warn("recover: in-flight task ended non-OK; marked failed",
				"op_id", op.OpID, "upid", op.UPID, "exitstatus", status.ExitStatus)
		}
	}

	// Stay quiet when there was nothing in flight.
	if summary.Examined == 0 {
		return summary
	}
	e.logger.Info("recover: in-flight journal reconciled", "result", summary)
	return summary
}

// RecoverResult is the tally produced by one startup recovery pass.
type RecoverResult struct {
	Examined     int // in-flight entries visited
	Resumed      int // task had completed OK; recorded as succeeded
	Failed       int // task had ended non-OK; recorded as failed
	RolledBack   int // entry had no task id; abandoned (fail-safe)
	StillRunning int // task was still executing; entry left in-flight
	Unresolved   int // task status could not be read; entry left in-flight
}

// terminal derives a terminal journal record from an in-flight entry. The op's
// identity (OpID, VMID, Kind, UPID) is copied over unchanged, and the idempotency key
// is carried through so a SUCCEEDED one-shot op marks its key applied. The record is
// stamped with the current UTC time.
func terminal(e JournalEntry, state OpState) JournalEntry {
	rec := JournalEntry{
		OpID:     e.OpID,
		VMID:     e.VMID,
		Kind:     e.Kind,
		UPID:     e.UPID,
		IdempKey: e.IdempKey,
	}
	rec.State = state
	rec.At = time.Now().UTC()
	return rec
}