v0.55.0: Phase 3 — auto off-drive Tier 2 (rootfs-headroom guard)

Tier 2 rsync-mirrors each HDD app's recovery unit + appdata to a DIFFERENT physical
disk (the only off-drive protection bind-mounted userdata can get; PBS can't reach it).
Auto-enabled, auto-target: prefer another registered drive (different physical disk via
system.SamePhysicalDevice), else the internal SSD for SMALL units only — with a
size-aware headroom guard that REFUSES rather than fill the ~8G guest rootfs, recording
an honest "needs 2nd HDD" status. Status persisted via the surviving CrossDriveBackup;
"2. mentés" UI card now populated. Daily tier2-backup job + POST /api/backup/tier2.

- backup/tier2.go (engine+selection+headroom), tier2_test.go (headroom arithmetic)
- system.SamePhysicalDevice (linux Stat_t.Dev + stub)
- handlers.go Tier2 UI population + tier2DestLabel; backups.html honest no-target reason
- fixed stale TestBackupCopiesOnPath (old felhom-data layout -> in-guest layout)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-13 13:24:49 +02:00
parent d8fe8f5ead
commit d2071430ea
12 changed files with 446 additions and 5 deletions
+3
View File
@@ -28,6 +28,9 @@ type Manager struct {
systemDataPath string // fallback drive for SSD-only apps
version string // controller version, stamped into recovery-unit manifests
// tier2Notify, if set, is called after each Tier 2 copy (success: err==nil) for notifications.
tier2Notify func(stackName, destLabel string, dur time.Duration, err error)
mu sync.Mutex
lastDBDump *DBDumpStatus
running bool
@@ -53,6 +53,11 @@ func (m *Manager) SetVersion(v string) {
m.mu.Unlock()
}
// SetTier2Notifier wires the notification callback invoked after each Tier 2 copy.
//
// fn receives the stack name, the human-readable destination label, the elapsed copy
// duration and the copy error (nil on success). Passing nil disables notifications,
// because callers of the field check it for nil before invoking it.
//
// NOTE(review): the field is assigned without taking m.mu — presumably this is only
// called once during wiring, before any Tier 2 job runs; confirm against the caller.
func (m *Manager) SetTier2Notifier(fn func(stackName, destLabel string, dur time.Duration, err error)) {
	m.tier2Notify = fn
}
// CaptureRecoveryUnit writes/refreshes an app's secret-free recovery unit: it captures the
// compose + metadata + a secret-stripped app.yaml into compose/, enumerates the DB/volume dumps
// already present, and writes manifest.json. It NEVER writes a secret value or the Docker image.
+281
View File
@@ -0,0 +1,281 @@
package backup
import (
"context"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// Tier 2 = an off-drive (different physical disk) copy of an HDD app's recovery unit + bulk userdata.
// It is the ONLY off-drive protection that browsable HDD userdata can get — PBS can't reach bind
// mounts. Auto-enabled for every HDD app; the target is auto-picked: prefer another registered
// user-data drive (can hold bulk), else the internal SSD for SMALL units only — and the SSD is the
// guest rootfs (~8 GB), so we REFUSE rather than fill it (a size-aware headroom guard). When no
// off-drive target fits, we record an honest "needs a 2nd HDD" status instead of silently doing
// nothing useful.
// gibibyte is the number of bytes in one GiB; used to convert byte counts to the GB
// figures the headroom guard compares.
const gibibyte = 1024 * 1024 * 1024

// Sentinel errors returned by selectTier2Target when no off-drive destination can be
// used. They are matched with errors.Is to pick the status text recorded for the app.
var (
	// errNoOffDiskTarget: there is no system data path, or it sits on the same physical
	// disk as the app's source drive, and no other registered drive qualified.
	errNoOffDiskTarget = errors.New("no off-drive target (single drive, app already on the system disk)")
	// errSSDNoHeadroom: the internal SSD is the only candidate but the headroom guard
	// refused the unit's size.
	errSSDNoHeadroom = errors.New("the internal SSD lacks headroom for this app's data — a 2nd drive is required for off-drive backup")
)
// Tier2Target is a resolved off-drive destination for an app's Tier 2 copy, as produced
// by selectTier2Target.
type Tier2Target struct {
	NamespaceRoot string // felhom-data namespace root on the target drive; the copy lands under backups/secondary/<stack> below it
	Label         string // human label (UI)
	IsSystemDrive bool   // target is the internal SSD/system drive (DB/config only)
	Reason        string // why this target (Hungarian, for UI/logs)
}
// tier2FitsHeadroom is the pure arithmetic behind the rootfs-headroom guard: it answers
// whether copying a unit of unitGB onto a drive with availGB free out of totalGB still
// leaves the required reserve untouched.
//
// The reserve is the larger of a fixed 2 GB floor and 20% of the drive's total size, so
// a small (~8 GB) guest rootfs is governed by the floor and a large drive by the
// percentage. A unit that lands exactly on the reserve is still accepted.
func tier2FitsHeadroom(availGB, totalGB, unitGB float64) bool {
	const (
		reserveFloorGB  = 2.0
		reserveFraction = 0.20
	)

	required := totalGB * reserveFraction
	if required < reserveFloorGB {
		required = reserveFloorGB
	}

	leftAfterCopy := availGB - unitGB
	return leftAfterCopy >= required
}
// selectTier2Target resolves where an app's Tier 2 copy should go.
//
// Candidates are tried in order:
//  1. the first schedulable storage path that is neither the app's own drive nor on the
//     same physical device as it — such a drive may hold bulk userdata;
//  2. the system data path (internal SSD), accepted only when it is on a different
//     physical device AND the headroom guard admits unitSizeBytes.
//
// It returns errNoOffDiskTarget when no other physical disk exists and errSSDNoHeadroom
// when the SSD is the only option but the unit does not fit; an app without a source
// drive yields a plain formatted error.
func (m *Manager) selectTier2Target(stackName string, unitSizeBytes int64) (*Tier2Target, error) {
	src := m.GetAppDrivePath(stackName)
	if src == "" {
		return nil, fmt.Errorf("no source drive for %s", stackName)
	}

	// Step 1: another registered user-data drive on a DIFFERENT physical disk.
	if m.settings != nil {
		for _, candidate := range m.settings.GetSchedulableStoragePaths() {
			sameDisk := candidate.Path == src || system.SamePhysicalDevice(src, candidate.Path)
			if sameDisk {
				continue
			}

			name := candidate.Label
			if name == "" {
				// Unlabelled drive: fall back to the last path element.
				name = filepath.Base(candidate.Path)
			}

			picked := &Tier2Target{
				NamespaceRoot: NamespaceRoot(candidate.Path, true), // Model A: in-guest mount IS the namespace root
				Label:         name,
				IsSystemDrive: false,
				Reason:        "másik adatmeghajtó",
			}
			return picked, nil
		}
	}

	// Step 2: the internal SSD (system data path) — SMALL units only.
	ssd := m.systemDataPath
	if ssd == "" || system.SamePhysicalDevice(src, ssd) {
		// Single drive, or the app already lives on the system disk.
		return nil, errNoOffDiskTarget
	}
	if !m.tier2FitsSystemDrive(ssd, unitSizeBytes) {
		// Copying would eat into the rootfs reserve — refuse instead of filling it.
		return nil, errSSDNoHeadroom
	}

	fallback := &Tier2Target{
		NamespaceRoot: NamespaceRoot(ssd, false), // system path is a real root → felhom-data appended
		Label:         "belső SSD (rendszer)",
		IsSystemDrive: true,
		Reason:        "nincs 2. adatmeghajtó — csak az adatbázis/konfiguráció fér a belső SSD-re; a nagy fájlokhoz 2. meghajtó kell",
	}
	return fallback, nil
}
// tier2FitsSystemDrive applies the size-aware headroom guard to the SSD candidate at
// sys: it looks up the drive's current usage and asks tier2FitsHeadroom whether a unit
// of unitSizeBytes may be copied there. When the usage cannot be read the answer is
// "no", so an unknown free-space figure never leads to a copy onto the rootfs.
func (m *Manager) tier2FitsSystemDrive(sys string, unitSizeBytes int64) bool {
	usage := system.GetDiskUsage(sys)
	if usage == nil {
		// Free space unknown → refuse (fail-closed for the rootfs).
		return false
	}

	unitGB := float64(unitSizeBytes) / gibibyte
	return tier2FitsHeadroom(usage.AvailGB, usage.TotalGB, unitGB)
}
// tier2UnknownSize is the unit size handed to target selection when the real size could
// not be measured. It is the largest int64, so the rootfs headroom guard always refuses
// it; drives that are not size-guarded remain eligible.
const tier2UnknownSize = int64(1<<63 - 1)

// RunTier2 makes/refreshes the off-drive copy of a single HDD app's recovery unit + userdata.
// Best-effort and idempotent (rsync mirror). Records status into settings for the UI.
//
// It returns an error when the app has no source drive or when an rsync copy fails. Two
// situations are deliberately NOT errors: an app without a recovery unit yet (nothing to
// copy) and an app without a valid off-drive target (recorded as a "no_target" status).
//
// The unit size feeds the rootfs headroom guard, so a size that cannot be measured (du
// failed or timed out) is treated as "too big for the internal SSD" rather than as zero:
// the guard must fail closed, otherwise an unmeasurable bulk directory would be copied
// onto the small guest rootfs.
func (m *Manager) RunTier2(stackName string) error {
	sourceDrive := m.GetAppDrivePath(stackName)
	if sourceDrive == "" {
		return fmt.Errorf("no source drive for %s", stackName)
	}
	sourceNsRoot := m.namespaceRoot(sourceDrive)
	unitDir := RecoveryUnitPath(sourceNsRoot, stackName)
	appDataDir := AppDataDir(sourceNsRoot, stackName)
	if _, err := os.Stat(unitDir); err != nil {
		return nil // no recovery unit yet — nothing to copy
	}

	// unitSize is what could be measured (used for reporting); guardSize is what target
	// selection sees, pessimistic when the measurement is incomplete.
	unitSize, sizeErr := tier2MeasureUnit(unitDir, appDataDir)
	guardSize := unitSize
	if sizeErr != nil {
		guardSize = tier2UnknownSize
		m.logger.Printf("[WARN] [backup] Tier 2 for %s: unit size unknown (%v) — internal SSD fallback disabled for this run", stackName, sizeErr)
	}

	target, err := m.selectTier2Target(stackName, guardSize)
	if err != nil {
		reason := tier2NoTargetReason(err)
		m.recordTier2NoTarget(stackName, reason)
		m.logger.Printf("[INFO] [backup] Tier 2 for %s: no off-drive target — %s", stackName, reason)
		return nil
	}

	// Defense-in-depth off-drive guard (selection already enforced it).
	if system.SamePhysicalDevice(sourceDrive, target.NamespaceRoot) {
		m.recordTier2NoTarget(stackName, "a kiválasztott cél ugyanazon a fizikai lemezen van")
		return nil
	}

	destBase := filepath.Join(target.NamespaceRoot, "backups", "secondary", stackName)
	start := time.Now()

	// fail records and notifies a copy failure, then builds the error to return.
	fail := func(what string, cause error) error {
		m.recordTier2Failure(stackName, target, cause)
		if m.tier2Notify != nil {
			m.tier2Notify(stackName, target.Label, time.Since(start), cause)
		}
		return fmt.Errorf("tier2 rsync %s for %s: %w", what, stackName, cause)
	}

	if err := rsyncMirror(unitDir, filepath.Join(destBase, "recovery-unit")); err != nil {
		return fail("unit", err)
	}
	// App data is optional: only mirror it when the directory can be stat'ed.
	if _, statErr := os.Stat(appDataDir); statErr == nil {
		if err := rsyncMirror(appDataDir, filepath.Join(destBase, "appdata")); err != nil {
			return fail("appdata", err)
		}
	}

	dur := time.Since(start)
	m.recordTier2Success(stackName, target, unitSize, dur)
	if m.tier2Notify != nil {
		m.tier2Notify(stackName, target.Label, dur, nil)
	}

	ssdNote := ""
	if target.IsSystemDrive {
		ssdNote = " [SSD: DB/config only]"
	}
	m.logger.Printf("[INFO] [backup] Tier 2 copied %s → %s (%s, %s)%s",
		stackName, destBase, humanizeBytes(unitSize), dur.Round(time.Second), ssdNote)
	return nil
}

// tier2MeasureUnit sums the on-disk size of the given directories. It returns the total
// of everything that could be measured together with the first measurement error, so the
// caller can tell a genuinely small unit from one whose size is unknown.
func tier2MeasureUnit(dirs ...string) (int64, error) {
	var (
		total    int64
		firstErr error
	)
	for _, dir := range dirs {
		size, err := tier2DirSize(dir)
		if err != nil && firstErr == nil {
			firstErr = err
		}
		total += size
	}
	return total, firstErr
}

// tier2DirSize returns the size of dir in bytes via `du -sb`. A directory that does not
// exist counts as 0 bytes with no error; any other failure (stat error, du failure or
// timeout, unparsable output) is reported instead of being folded into 0.
func tier2DirSize(dir string) (int64, error) {
	if _, err := os.Stat(dir); err != nil {
		if errors.Is(err, os.ErrNotExist) {
			return 0, nil
		}
		return 0, fmt.Errorf("stat %s: %w", dir, err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()
	out, err := exec.CommandContext(ctx, "du", "-sb", dir).Output()
	if err != nil {
		return 0, fmt.Errorf("du %s: %w", dir, err)
	}

	fields := strings.Fields(string(out))
	if len(fields) == 0 {
		return 0, fmt.Errorf("du %s: empty output", dir)
	}
	var size int64
	if _, err := fmt.Sscanf(fields[0], "%d", &size); err != nil {
		return 0, fmt.Errorf("du %s: parsing %q: %w", dir, fields[0], err)
	}
	return size, nil
}
// RunAllTier2 walks every deployed stack and runs Tier 2 for the HDD-backed ones. Stacks
// without an HDD path are skipped (their data is on the rootfs, covered by PBS), as are
// stacks whose drive is marked disconnected or decommissioned in settings. A failing
// stack is logged and still counted as processed; the run finishes with a summary line.
// Without a stack provider the call is a no-op.
func (m *Manager) RunAllTier2() {
	if m.stackProvider == nil {
		return
	}

	processed := 0
	for _, stack := range m.stackProvider.ListDeployedStacks() {
		name := stack.Name

		// Not an HDD app — its data is on the rootfs, covered by PBS.
		if m.stackProvider.GetStackHDDPath(name) == "" {
			continue
		}

		// Drives that are unplugged or retired cannot be read from — skip them.
		if m.settings != nil {
			if m.settings.IsDisconnected(m.GetAppDrivePath(name)) {
				continue
			}
			if m.settings.IsDecommissioned(m.GetAppDrivePath(name)) {
				continue
			}
		}

		err := m.RunTier2(name)
		if err != nil {
			m.logger.Printf("[WARN] [backup] Tier 2 failed for %s: %v", name, err)
		}
		processed++
	}

	m.logger.Printf("[INFO] [backup] Tier 2 run complete: %d HDD app(s) processed", processed)
}
// --- status persistence (drives the "2. mentés" UI card) ---
// recordTier2Success persists an "ok" Tier 2 status for stackName: the destination
// namespace root, the current time as last run, the rounded duration and the
// human-readable size. It is a no-op when no settings store is attached.
//
// Persisting is best-effort — a failed write does not fail the backup — but it is
// logged, because otherwise the status shown for the app silently goes stale.
func (m *Manager) recordTier2Success(stackName string, target *Tier2Target, sizeBytes int64, dur time.Duration) {
	if m.settings == nil {
		return
	}
	status := &settings.CrossDriveBackup{
		Enabled:         true,
		Method:          "rsync",
		DestinationPath: target.NamespaceRoot,
		Schedule:        "daily",
		LastRun:         time.Now().Format(time.RFC3339),
		LastStatus:      "ok",
		LastDuration:    dur.Round(time.Second).String(),
		LastSizeHuman:   humanizeBytes(sizeBytes),
	}
	if err := m.settings.SetCrossDriveConfig(stackName, status); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 status for %s not persisted: %v", stackName, err)
	}
}
// recordTier2Failure persists an "error" Tier 2 status for stackName, carrying the
// attempted destination, the current time as last run and the text of cause. It is a
// no-op when no settings store is attached.
//
// Persisting is best-effort — the copy error is what the caller returns — but a failed
// status write is logged so the stale status does not go unnoticed.
func (m *Manager) recordTier2Failure(stackName string, target *Tier2Target, cause error) {
	if m.settings == nil {
		return
	}
	status := &settings.CrossDriveBackup{
		Enabled:         true,
		Method:          "rsync",
		DestinationPath: target.NamespaceRoot,
		Schedule:        "daily",
		LastRun:         time.Now().Format(time.RFC3339),
		LastStatus:      "error",
		LastError:       cause.Error(),
	}
	if err := m.settings.SetCrossDriveConfig(stackName, status); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 status for %s not persisted: %v", stackName, err)
	}
}
// recordTier2NoTarget persists a "no_target" Tier 2 status for stackName with reason as
// the explanation. The entry is stored disabled and without a destination, since no copy
// exists. It is a no-op when no settings store is attached.
//
// Persisting is best-effort, but a failed write is logged: this status is the only place
// the "needs a 2nd drive" explanation is kept for the app.
func (m *Manager) recordTier2NoTarget(stackName, reason string) {
	if m.settings == nil {
		return
	}
	status := &settings.CrossDriveBackup{
		Enabled:    false,
		Method:     "rsync",
		Schedule:   "daily",
		LastStatus: "no_target",
		LastError:  reason,
	}
	if err := m.settings.SetCrossDriveConfig(stackName, status); err != nil {
		m.logger.Printf("[WARN] [backup] Tier 2 status for %s not persisted: %v", stackName, err)
	}
}
// tier2NoTargetReason turns a target-selection error into the text recorded as the
// app's "no target" explanation. The two sentinel errors (matched through any wrapping)
// map to fixed Hungarian messages; every other error is passed through as its own text.
func tier2NoTargetReason(err error) string {
	if errors.Is(err, errSSDNoHeadroom) {
		return "nincs elég hely a belső SSD-n — a nagy fájlok off-drive mentéséhez 2. meghajtó (vagy távoli tárhely) szükséges"
	}
	if errors.Is(err, errNoOffDiskTarget) {
		return "nincs másik fizikai meghajtó — a 2. mentéshez 2. meghajtó szükséges"
	}
	return err.Error()
}
// --- helpers ---
// rsyncMirror mirrors src→dst with rsync -a --delete (exact copy, browsable on disk, no versioning).
//
// dst is created (with parents) if missing. Both paths are passed with a single trailing
// slash so rsync copies the CONTENTS of src into dst rather than nesting src inside it.
// The run is capped at 60 minutes.
//
// On failure the returned error wraps the underlying cause, so callers can inspect it
// with errors.Is / errors.As: a timeout wraps the context error, and any other rsync
// failure wraps the process error followed by rsync's combined output.
func rsyncMirror(src, dst string) error {
	const rsyncTimeout = 60 * time.Minute

	if err := os.MkdirAll(dst, 0755); err != nil {
		return fmt.Errorf("mkdir %s: %w", dst, err)
	}

	ctx, cancel := context.WithTimeout(context.Background(), rsyncTimeout)
	defer cancel()

	// Trailing slashes: copy the CONTENTS of src into dst.
	cmd := exec.CommandContext(ctx, "rsync", "-a", "--delete", strings.TrimRight(src, "/")+"/", strings.TrimRight(dst, "/")+"/")
	out, err := cmd.CombinedOutput()
	if err == nil {
		return nil
	}

	// A killed-on-deadline rsync reports only "signal: killed"; name the real cause.
	if ctxErr := ctx.Err(); ctxErr != nil {
		return fmt.Errorf("rsync %s → %s stopped after %s: %w", src, dst, rsyncTimeout, ctxErr)
	}
	return fmt.Errorf("%w: %s", err, strings.TrimSpace(string(out)))
}
// dirSizeBytes reports how many bytes dir occupies according to `du -sb`, giving the
// command at most 60 seconds. Every failure mode collapses to 0: a path that cannot be
// stat'ed, a du run that errors or times out, empty output, or a first field that does
// not parse as an integer.
func dirSizeBytes(dir string) int64 {
	if _, statErr := os.Stat(dir); statErr != nil {
		return 0
	}

	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
	defer cancel()

	raw, runErr := exec.CommandContext(ctx, "du", "-sb", dir).Output()
	if runErr != nil {
		return 0
	}

	// du prints "<bytes>\t<path>"; only the leading byte count matters.
	columns := strings.Fields(string(raw))
	if len(columns) == 0 {
		return 0
	}

	var total int64
	_, parseErr := fmt.Sscanf(columns[0], "%d", &total)
	if parseErr != nil {
		return 0
	}
	return total
}
+30
View File
@@ -0,0 +1,30 @@
package backup
import "testing"
// TestTier2FitsHeadroom pins the arithmetic of the rootfs-headroom guard: a Tier 2 copy
// onto the SSD is allowed only while max(2 GB, 20% of total) stays free afterwards. The
// table covers the ~8 GB guest rootfs (where the 2 GB floor applies), the exact boundary,
// and a large drive (where the 20% share applies).
func TestTier2FitsHeadroom(t *testing.T) {
	type scenario struct {
		name                     string
		availGB, totalGB, unitGB float64
		want                     bool
	}

	table := []scenario{
		// 8 GB rootfs with ~2.4 GB free: reserve is 2 GB, so only sub-0.4 GB units fit.
		{"8G rootfs, tiny unit fits", 2.4, 8.0, 0.02, true},
		{"8G rootfs, 1G unit refused", 2.4, 8.0, 1.0, false},
		{"8G rootfs, 0.3G unit fits", 2.4, 8.0, 0.3, true},
		// On 8 GB, 20% is 1.6 GB, below the 2 GB floor — the floor is the reserve.
		{"8G rootfs exactly at 2G reserve", 2.0, 8.0, 0.0, true},
		{"8G rootfs just under reserve", 2.0, 8.0, 0.01, false},
		// On a 1 TB drive the 20% share (204.8 GB) is the reserve.
		{"1TB drive, 50G unit fits", 500.0, 1024.0, 50.0, true},
		{"1TB drive, 320G unit refused (under 20% reserve)", 500.0, 1024.0, 320.0, false},
	}

	for i := range table {
		sc := table[i]
		got := tier2FitsHeadroom(sc.availGB, sc.totalGB, sc.unitGB)
		if got == sc.want {
			continue
		}
		t.Errorf("%s: tier2FitsHeadroom(avail=%.2f,total=%.2f,unit=%.2f)=%v want %v",
			sc.name, sc.availGB, sc.totalGB, sc.unitGB, got, sc.want)
	}
}