Files
felhom-controller/controller/internal/backup/tier2.go
T
admin 13c6a0929a v0.57.0: stable host-storage list + per-app Tier-2 config panel
Part A of the UI-fixes/storage-spike spec.

A1: enrichHostStorageTargets sorts /api/host-metrics storage_targets
server-side and attaches friendly Hungarian labels + purpose, fixing the
#host-storage-bars reorder-on-poll bug. Display labels only — PVE storage
ids are never renamed.

A2: new GET/POST /stacks/{name}/backup Tier-2 config panel; the "2. mentés"
Beállítás button is repointed there from the dead-end deploy page. Customer
can pin a target drive or disable Tier 2; preference is preserved across the
runner's status writes. Always visible (single-SSD + non-HDD apps included).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 14:23:34 +02:00

394 lines
15 KiB
Go

package backup
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// Tier 2 = an off-drive (different physical disk) copy of an HDD app's recovery unit + bulk userdata.
// It is the ONLY off-drive protection that browsable HDD userdata can get — PBS can't reach bind
// mounts. Auto-enabled for every HDD app; the target is auto-picked: prefer another registered
// user-data drive (can hold bulk), else the internal SSD for SMALL units only — and the SSD is the
// guest rootfs (~8 GB), so we REFUSE rather than fill it (a size-aware headroom guard). When no
// off-drive target fits, we record an honest "needs a 2nd HDD" status instead of silently doing
// nothing useful.
// gibibyte is the number of bytes in one GiB (2^30). It is used to convert raw byte counts into
// the GB figures the headroom guard works with.
const gibibyte = 1 << 30

// Sentinel errors returned by selectTier2Target when no off-drive destination can be used.
// Callers distinguish them with errors.Is (see tier2NoTargetReason).
var (
	// errNoOffDiskTarget: there is no system data path, or it sits on the same physical device
	// as the app's source drive, and no other data drive qualified.
	errNoOffDiskTarget = errors.New("no off-drive target (single drive, app already on the system disk)")

	// errSSDNoHeadroom: the internal SSD is the only candidate, but the headroom guard refused
	// the copy for this unit size.
	errSSDNoHeadroom = errors.New("the internal SSD lacks headroom for this app's data — a 2nd drive is required for off-drive backup")
)
// Tier2Target describes the off-drive destination that was resolved for one app's Tier 2 copy.
type Tier2Target struct {
	// NamespaceRoot is the felhom-data namespace root on the target drive; the copy is written
	// beneath it.
	NamespaceRoot string
	// Label is the human-readable name of the target, shown in the UI.
	Label string
	// IsSystemDrive is true when the target is the internal SSD/system drive (DB/config only).
	IsSystemDrive bool
	// Reason explains why this target was chosen (Hungarian text, for UI and logs).
	Reason string
}
// tier2FitsHeadroom reports whether a unit of unitGB can be placed on a system/rootfs drive that
// has availGB free out of totalGB while still leaving a reserve untouched.
//
// The reserve is 20% of the drive's total size, but never less than 2 GB. This is the rule that
// keeps a small guest rootfs from being filled by a Tier 2 copy. The function has no side effects.
func tier2FitsHeadroom(availGB, totalGB, unitGB float64) bool {
	const (
		reserveFraction = 0.20 // share of the total size that must stay free
		minReserveGB    = 2.0  // floor for the reserve on small drives
	)

	mustStayFree := totalGB * reserveFraction
	if mustStayFree < minReserveGB {
		mustStayFree = minReserveGB
	}

	freeAfterCopy := availGB - unitGB
	return freeAfterCopy >= mustStayFree
}
// selectTier2Target resolves the off-drive destination for stackName's Tier 2 copy.
//
// Resolution order:
//  0. A customer-pinned target (PreferredTarget) wins if it is still among the schedulable
//     storage paths and is not on the source's physical disk. An unusable pin is ignored
//     silently and the auto-pick below takes over.
//  1. The first schedulable storage path that lives on a different physical disk.
//  2. The internal SSD (system data path), only if the headroom guard accepts unitSizeBytes.
//
// It returns an error when the app has no source drive, errNoOffDiskTarget when there is no
// distinct system disk to fall back to, and errSSDNoHeadroom when the SSD would be overfilled.
func (m *Manager) selectTier2Target(stackName string, unitSizeBytes int64) (*Tier2Target, error) {
	src := m.GetAppDrivePath(stackName)
	if src == "" {
		return nil, fmt.Errorf("no source drive for %s", stackName)
	}

	// offDrive is true when candidate is neither the source path itself nor on its physical disk.
	offDrive := func(candidate string) bool {
		return candidate != src && !system.SamePhysicalDevice(src, candidate)
	}
	// dataDriveTarget builds the result for a registered user-data drive; an empty label falls
	// back to the last path element.
	dataDriveTarget := func(path, label, reason string) *Tier2Target {
		if label == "" {
			label = filepath.Base(path)
		}
		return &Tier2Target{
			NamespaceRoot: NamespaceRoot(path, true), // Model A: in-guest mount IS the namespace root
			Label:         label,
			IsSystemDrive: false,
			Reason:        reason,
		}
	}

	if m.settings != nil {
		// Step 0: customer pin.
		if pin := m.settings.GetCrossDriveConfig(stackName); pin != nil && pin.PreferredTarget != "" {
			for _, drive := range m.settings.GetSchedulableStoragePaths() {
				if drive.Path != pin.PreferredTarget {
					continue
				}
				if !offDrive(drive.Path) {
					break // pinned drive shares the source's physical disk — fall through to auto-pick
				}
				return dataDriveTarget(drive.Path, drive.Label, "kézi választás"), nil
			}
		}

		// Step 1: first other registered drive on a different physical disk (can hold bulk userdata).
		for _, drive := range m.settings.GetSchedulableStoragePaths() {
			if offDrive(drive.Path) {
				return dataDriveTarget(drive.Path, drive.Label, "másik adatmeghajtó"), nil
			}
		}
	}

	// Step 2: internal SSD fallback — small units only.
	ssd := m.systemDataPath
	switch {
	case ssd == "" || system.SamePhysicalDevice(src, ssd):
		return nil, errNoOffDiskTarget // single drive / app already on the system disk
	case !m.tier2FitsSystemDrive(ssd, unitSizeBytes):
		return nil, errSSDNoHeadroom // refuse rather than fill the rootfs
	}
	return &Tier2Target{
		NamespaceRoot: NamespaceRoot(ssd, false), // system path is a real root → felhom-data appended
		Label:         "belső SSD (rendszer)",
		IsSystemDrive: true,
		Reason:        "nincs 2. adatmeghajtó — csak az adatbázis/konfiguráció fér a belső SSD-re; a nagy fájlokhoz 2. meghajtó kell",
	}, nil
}
// tier2FitsSystemDrive applies the size-aware headroom guard to the system drive at sys for a
// unit of unitSizeBytes. When disk usage cannot be read it answers false, so an unknown amount
// of free space is treated as "does not fit" (fail-closed for the rootfs).
func (m *Manager) tier2FitsSystemDrive(sys string, unitSizeBytes int64) bool {
	if usage := system.GetDiskUsage(sys); usage != nil {
		unitGB := float64(unitSizeBytes) / gibibyte
		return tier2FitsHeadroom(usage.AvailGB, usage.TotalGB, unitGB)
	}
	return false
}
// RunTier2 creates or refreshes the off-drive copy of one app's recovery unit and app data.
//
// The copy is an rsync mirror, so repeated runs converge on the same result. Outcomes are recorded
// into settings for the UI. An error is returned only when the app has no source drive or a copy
// actually fails; "customer disabled it", "no recovery unit yet" and "no usable target" all
// return nil (the last one is recorded as a status rather than reported as an error).
func (m *Manager) RunTier2(stackName string) error {
	// Customer switched Tier 2 off for this app — skip and leave the stored status alone.
	if m.settings != nil {
		prefs := m.settings.GetCrossDriveConfig(stackName)
		if prefs != nil && prefs.UserDisabled {
			m.logger.Printf("[INFO] [backup] Tier 2 for %s skipped — disabled by customer", stackName)
			return nil
		}
	}

	srcDrive := m.GetAppDrivePath(stackName)
	if srcDrive == "" {
		return fmt.Errorf("no source drive for %s", stackName)
	}
	srcRoot := m.namespaceRoot(srcDrive)
	unitDir := RecoveryUnitPath(srcRoot, stackName)
	appDataDir := AppDataDir(srcRoot, stackName)
	if _, statErr := os.Stat(unitDir); statErr != nil {
		return nil // no recovery unit yet — nothing to copy
	}

	totalBytes := dirSizeBytes(unitDir) + dirSizeBytes(appDataDir)
	target, selErr := m.selectTier2Target(stackName, totalBytes)
	if selErr != nil {
		why := tier2NoTargetReason(selErr)
		m.recordTier2NoTarget(stackName, why)
		m.logger.Printf("[INFO] [backup] Tier 2 for %s: no off-drive target — %s", stackName, why)
		return nil
	}
	// Defense in depth: selection already rejects same-disk targets, re-check the resolved root.
	if system.SamePhysicalDevice(srcDrive, target.NamespaceRoot) {
		m.recordTier2NoTarget(stackName, "a kiválasztott cél ugyanazon a fizikai lemezen van")
		return nil
	}

	destBase := filepath.Join(target.NamespaceRoot, "backups", "secondary", stackName)
	began := time.Now()

	// reportFailure persists the failed status and, if a notifier is wired, tells it as well.
	reportFailure := func(cause error) {
		m.recordTier2Failure(stackName, target, cause)
		if m.tier2Notify != nil {
			m.tier2Notify(stackName, target.Label, time.Since(began), cause)
		}
	}

	if err := rsyncMirror(unitDir, filepath.Join(destBase, "recovery-unit")); err != nil {
		reportFailure(err)
		return fmt.Errorf("tier2 rsync unit for %s: %w", stackName, err)
	}
	// App data is optional: mirror it only when the directory exists.
	if _, statErr := os.Stat(appDataDir); statErr == nil {
		if err := rsyncMirror(appDataDir, filepath.Join(destBase, "appdata")); err != nil {
			reportFailure(err)
			return fmt.Errorf("tier2 rsync appdata for %s: %w", stackName, err)
		}
	}

	elapsed := time.Since(began)
	m.recordTier2Success(stackName, target, totalBytes, elapsed)
	if m.tier2Notify != nil {
		m.tier2Notify(stackName, target.Label, elapsed, nil)
	}

	ssdNote := ""
	if target.IsSystemDrive {
		ssdNote = " [SSD: DB/config only]"
	}
	m.logger.Printf("[INFO] [backup] Tier 2 copied %s → %s (%s, %s)%s",
		stackName, destBase, humanizeBytes(totalBytes), elapsed.Round(time.Second), ssdNote)
	return nil
}
// RunAllTier2 runs Tier 2 for every deployed app that has an HDD path. Apps without one are
// skipped (per the original note, their data lives on the rootfs and is covered by the PBS
// whole-guest snapshot), as are apps whose drive is marked disconnected or decommissioned.
// Per-app failures are logged as warnings and do not stop the loop; the final log line reports
// how many apps were attempted, including failed ones.
func (m *Manager) RunAllTier2() {
	if m.stackProvider == nil {
		return
	}

	processed := 0
	for _, st := range m.stackProvider.ListDeployedStacks() {
		name := st.Name
		if m.stackProvider.GetStackHDDPath(name) == "" {
			continue // not an HDD app
		}
		if m.settings != nil {
			if m.settings.IsDisconnected(m.GetAppDrivePath(name)) ||
				m.settings.IsDecommissioned(m.GetAppDrivePath(name)) {
				continue // drive is not available for backup work
			}
		}

		if err := m.RunTier2(name); err != nil {
			m.logger.Printf("[WARN] [backup] Tier 2 failed for %s: %v", name, err)
		}
		processed++
	}
	m.logger.Printf("[INFO] [backup] Tier 2 run complete: %d HDD app(s) processed", processed)
}
// --- per-app config-panel view (drives the Tier-2 "Beállítás" page) ---
// Tier2Option is a single off-drive destination the customer can choose in the config panel.
type Tier2Option struct {
	// Path is the registered storage path; this is the value persisted as PreferredTarget.
	Path string
	// Label is the human-readable name shown in the dropdown.
	Label string
}
// Tier2Info is the per-app Tier 2 view rendered by the config panel. It carries the customer's
// stored preferences, the target that would be used right now (pinned or automatic), the reason
// when no target is usable, and the drives the customer may pin.
type Tier2Info struct {
	// IsHDDApp is false when the app lives on the rootfs (already inside the PBS whole-guest
	// snapshot).
	IsHDDApp bool
	// SourceDrive is where the app's data currently lives.
	SourceDrive string
	// Disabled is true when the customer turned Tier 2 off for this app.
	Disabled bool
	// Preferred is the customer-pinned target path; "" means automatic selection.
	Preferred string
	// EffectiveLabel is the label of the target that WOULD be used right now.
	EffectiveLabel string
	// EffectiveIsSSD is true when the effective target is the internal SSD (DB/config only).
	EffectiveIsSSD bool
	// EffectiveDesc explains why the effective target was chosen (Hungarian).
	EffectiveDesc string
	// NoTarget is true when no off-drive target is usable at all.
	NoTarget bool
	// NoTargetReason is the user-facing explanation when NoTarget is set.
	NoTargetReason string
	// Alternatives lists the off-disk drives the customer may pin.
	Alternatives []Tier2Option
}
// Tier2Info assembles the config-panel view for stackName. It only reads state: the stored
// preferences, the list of drives on a different physical disk than the source, and the target
// the runner would select right now. The current on-disk size of the recovery unit plus app
// data is measured so the SSD headroom guard sees realistic input. No status is written.
func (m *Manager) Tier2Info(stackName string) Tier2Info {
	view := Tier2Info{}
	if m.stackProvider != nil {
		view.IsHDDApp = m.stackProvider.GetStackHDDPath(stackName) != ""
	}
	srcDrive := m.GetAppDrivePath(stackName)
	view.SourceDrive = srcDrive

	if m.settings != nil {
		if prefs := m.settings.GetCrossDriveConfig(stackName); prefs != nil {
			view.Disabled, view.Preferred = prefs.UserDisabled, prefs.PreferredTarget
		}

		// Offer every schedulable drive that is not the source and not on its physical disk.
		for _, drive := range m.settings.GetSchedulableStoragePaths() {
			onSourceDisk := drive.Path == srcDrive || system.SamePhysicalDevice(srcDrive, drive.Path)
			if onSourceDisk {
				continue
			}
			opt := Tier2Option{Path: drive.Path, Label: drive.Label}
			if opt.Label == "" {
				opt.Label = filepath.Base(drive.Path)
			}
			view.Alternatives = append(view.Alternatives, opt)
		}
	}

	// Ask the same selector the runner uses, fed with the real unit size.
	srcRoot := m.namespaceRoot(srcDrive)
	unitBytes := dirSizeBytes(RecoveryUnitPath(srcRoot, stackName))
	dataBytes := dirSizeBytes(AppDataDir(srcRoot, stackName))
	target, err := m.selectTier2Target(stackName, unitBytes+dataBytes)
	if err != nil {
		view.NoTarget = true
		view.NoTargetReason = tier2NoTargetReason(err)
		return view
	}

	view.EffectiveLabel, view.EffectiveIsSSD, view.EffectiveDesc = target.Label, target.IsSystemDrive, target.Reason
	return view
}
// --- status persistence (drives the "2. mentés" UI card) ---
// withTier2Prefs copies the customer-owned preference fields (UserDisabled, PreferredTarget) from
// the currently stored config for stackName into cfg and returns cfg. Runner status writes build
// a fresh struct each time; passing it through here keeps those writes from wiping the
// customer's choices. When there are no settings or no stored config, cfg is returned untouched.
func (m *Manager) withTier2Prefs(stackName string, cfg *settings.CrossDriveBackup) *settings.CrossDriveBackup {
	if m.settings == nil {
		return cfg
	}
	stored := m.settings.GetCrossDriveConfig(stackName)
	if stored == nil {
		return cfg
	}
	cfg.UserDisabled, cfg.PreferredTarget = stored.UserDisabled, stored.PreferredTarget
	return cfg
}
// recordTier2Success persists an "ok" Tier 2 status for stackName: the destination root, the
// time of the run, its rounded duration and the human-readable size. Customer preferences already
// stored for the app are carried over via withTier2Prefs. It is a no-op without settings.
//
// Persisting is best-effort, but a failed write is logged rather than dropped silently, so a
// stale UI card can be traced back to its cause.
func (m *Manager) recordTier2Success(stackName string, target *Tier2Target, sizeBytes int64, dur time.Duration) {
	if m.settings == nil {
		return
	}
	status := m.withTier2Prefs(stackName, &settings.CrossDriveBackup{
		Enabled:         true,
		Method:          "rsync",
		DestinationPath: target.NamespaceRoot,
		Schedule:        "daily",
		LastRun:         time.Now().Format(time.RFC3339),
		LastStatus:      "ok",
		LastDuration:    dur.Round(time.Second).String(),
		LastSizeHuman:   humanizeBytes(sizeBytes),
	})
	if err := m.settings.SetCrossDriveConfig(stackName, status); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 status for %s could not be saved: %v", stackName, err)
	}
}
// recordTier2Failure persists an "error" Tier 2 status for stackName, storing the attempted
// destination root, the time of the run and cause's message. Customer preferences already stored
// for the app are carried over via withTier2Prefs. It is a no-op without settings. cause must be
// non-nil.
//
// Persisting is best-effort, but a failed write is logged rather than dropped silently, so a
// stale UI card can be traced back to its cause.
func (m *Manager) recordTier2Failure(stackName string, target *Tier2Target, cause error) {
	if m.settings == nil {
		return
	}
	status := m.withTier2Prefs(stackName, &settings.CrossDriveBackup{
		Enabled:         true,
		Method:          "rsync",
		DestinationPath: target.NamespaceRoot,
		Schedule:        "daily",
		LastRun:         time.Now().Format(time.RFC3339),
		LastStatus:      "error",
		LastError:       cause.Error(),
	})
	if err := m.settings.SetCrossDriveConfig(stackName, status); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 status for %s could not be saved: %v", stackName, err)
	}
}
// recordTier2NoTarget persists a "no_target" Tier 2 status for stackName with reason as the
// user-facing explanation. The stored config is marked not enabled and carries no destination or
// run time. Customer preferences already stored for the app are carried over via withTier2Prefs.
// It is a no-op without settings.
//
// Persisting is best-effort, but a failed write is logged rather than dropped silently, so a
// stale UI card can be traced back to its cause.
func (m *Manager) recordTier2NoTarget(stackName, reason string) {
	if m.settings == nil {
		return
	}
	status := m.withTier2Prefs(stackName, &settings.CrossDriveBackup{
		Enabled:    false,
		Method:     "rsync",
		Schedule:   "daily",
		LastStatus: "no_target",
		LastError:  reason,
	})
	if err := m.settings.SetCrossDriveConfig(stackName, status); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 status for %s could not be saved: %v", stackName, err)
	}
}
// tier2NoTargetReason turns a target-selection error into the Hungarian explanation shown to the
// customer. The two sentinel errors (matched with errors.Is, so wrapped errors count too) get a
// fixed message; any other error is passed through as its own text. err must be non-nil.
func tier2NoTargetReason(err error) string {
	if errors.Is(err, errSSDNoHeadroom) {
		return "nincs elég hely a belső SSD-n — a nagy fájlok off-drive mentéséhez 2. meghajtó (vagy távoli tárhely) szükséges"
	}
	if errors.Is(err, errNoOffDiskTarget) {
		return "nincs másik fizikai meghajtó — a 2. mentéshez 2. meghajtó szükséges"
	}
	return err.Error()
}
// --- helpers ---
// rsyncMirror makes dst an exact mirror of the contents of src using `rsync -a --delete`
// (browsable on disk, no versioning). dst is created if missing. The run is limited to 60 minutes.
//
// Both paths must be non-empty. Trailing slashes are normalised before the call, and an empty
// path would normalise to "/", i.e. the filesystem root, so it is rejected up front.
//
// On failure the returned error wraps the underlying error (usable with errors.Is/As) and
// appends rsync's combined output. A run cut off by the time limit is reported as a timeout
// instead of a bare "signal: killed".
func rsyncMirror(src, dst string) error {
	const timeout = 60 * time.Minute

	if src == "" || dst == "" {
		return fmt.Errorf("rsync mirror: empty source or destination path")
	}
	if err := os.MkdirAll(dst, 0755); err != nil {
		return fmt.Errorf("mkdir %s: %w", dst, err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	// Trailing slashes: copy the CONTENTS of src into dst.
	cmd := exec.CommandContext(ctx, "rsync", "-a", "--delete",
		strings.TrimRight(src, "/")+"/", strings.TrimRight(dst, "/")+"/")
	out, err := cmd.CombinedOutput()
	if err == nil {
		return nil
	}

	detail := strings.TrimSpace(string(out))
	if ctx.Err() != nil {
		// The context expired and the process was killed; say so explicitly.
		return fmt.Errorf("rsync timed out after %s: %w: %s", timeout, err, detail)
	}
	return fmt.Errorf("%w: %s", err, detail)
}
// dirSizeBytes reports the total size of dir in bytes as measured by `du -sb`, waiting at most
// 60 seconds. It returns 0 whenever the size cannot be determined: the path cannot be stat'ed,
// du fails or times out, or its output does not start with an integer.
func dirSizeBytes(dir string) int64 {
	if _, statErr := os.Stat(dir); statErr != nil {
		return 0
	}

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, "du", "-sb", dir).Output()
	if runErr != nil {
		return 0
	}

	// du prints "<bytes>\t<path>"; only the first column matters.
	var total int64
	if cols := strings.Fields(string(raw)); len(cols) > 0 {
		if _, scanErr := fmt.Sscanf(cols[0], "%d", &total); scanErr == nil {
			return total
		}
	}
	return 0
}