v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test
- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate, monitor, notify, report, assets, setup), gated behind logging.level: "debug"
- Add /api/debug/dump endpoint returning full controller state JSON (debug only)
- Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub, restic repos, metrics DB) with pass/warn/fail summary
- New packages: internal/selftest, internal/util
- Constructor/signature changes: debug bool params, logger params on RunHealthCheck and BuildReport, smart watchdog probe logging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -190,6 +190,15 @@ func (m *Manager) groupStacksByDrive() map[string][]StackSummary {
|
||||
drive := m.GetAppDrivePath(stack.Name)
|
||||
result[drive] = append(result[drive], stack)
|
||||
}
|
||||
if m.isDebug() {
|
||||
for drive, stacks := range result {
|
||||
names := make([]string, len(stacks))
|
||||
for i, s := range stacks {
|
||||
names[i] = s.Name
|
||||
}
|
||||
m.logger.Printf("[DEBUG] groupStacksByDrive: %s → [%s]", drive, strings.Join(names, ", "))
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -197,10 +206,18 @@ func (m *Manager) groupStacksByDrive() map[string][]StackSummary {
|
||||
func (m *Manager) activeDrives() []string {
|
||||
groups := m.groupStacksByDrive()
|
||||
var drives []string
|
||||
var disconnected []string
|
||||
for d := range groups {
|
||||
if m.settings != nil && (m.settings.IsDisconnected(d) || m.settings.IsDecommissioned(d)) {
|
||||
disconnected = append(disconnected, d)
|
||||
}
|
||||
drives = append(drives, d)
|
||||
}
|
||||
sort.Strings(drives)
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] activeDrives: %d total (%s), %d disconnected/decommissioned",
|
||||
len(drives), strings.Join(drives, ", "), len(disconnected))
|
||||
}
|
||||
return drives
|
||||
}
|
||||
|
||||
@@ -218,7 +235,7 @@ func (m *Manager) runDBDumpsInternal(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
m.logger.Printf("[INFO] Starting database dump run")
|
||||
|
||||
dbs, err := DiscoverDatabases(ctx, m.logger)
|
||||
dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug())
|
||||
if err != nil {
|
||||
m.logger.Printf("[ERROR] Database discovery failed: %v", err)
|
||||
return err
|
||||
@@ -261,7 +278,7 @@ func (m *Manager) runDBDumpsInternal(ctx context.Context) error {
|
||||
|
||||
dumpDir := AppDBDumpPath(drivePath, db.StackName)
|
||||
|
||||
result := DumpOne(ctx, db, dumpDir, m.logger)
|
||||
result := DumpOne(ctx, db, dumpDir, m.logger, m.isDebug())
|
||||
results = append(results, result)
|
||||
|
||||
if result.Error != nil {
|
||||
@@ -354,6 +371,9 @@ func (m *Manager) runBackupInternal(ctx context.Context) error {
|
||||
driveCount := 0
|
||||
|
||||
for drivePath, stacks := range driveStacks {
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] runBackupInternal: processing drive %s (%d stacks)", drivePath, len(stacks))
|
||||
}
|
||||
result, err := m.backupDrive(ctx, drivePath, stacks, infraPaths)
|
||||
if err != nil {
|
||||
anyErr = err
|
||||
@@ -473,6 +493,13 @@ func (m *Manager) backupDrive(ctx context.Context, drivePath string, stacks []St
|
||||
// Deduplicate paths
|
||||
paths = dedup(paths)
|
||||
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] backupDrive %s: repo=%s, %d include paths:", drivePath, repoPath, len(paths))
|
||||
for _, p := range paths {
|
||||
m.logger.Printf("[DEBUG] %s", p)
|
||||
}
|
||||
}
|
||||
|
||||
tags := []string{"felhom", m.cfg.Customer.ID, filepath.Base(drivePath)}
|
||||
m.logger.Printf("[INFO] Backing up drive %s (%d apps, %d paths)", drivePath, len(stacks), len(paths))
|
||||
|
||||
@@ -549,15 +576,27 @@ func (m *Manager) RunIntegrityCheck(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RunIntegrityCheck: checking %d drives", len(drives))
|
||||
}
|
||||
|
||||
var checkErr error
|
||||
for _, drive := range drives {
|
||||
repoPath := PrimaryResticRepoPath(drive)
|
||||
if !m.restic.RepoExists(repoPath) {
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RunIntegrityCheck: skipping %s (repo does not exist)", repoPath)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RunIntegrityCheck: checking repo %s", repoPath)
|
||||
}
|
||||
if err := m.restic.Check(repoPath); err != nil {
|
||||
m.logger.Printf("[ERROR] Restic check failed for %s: %v", repoPath, err)
|
||||
checkErr = err
|
||||
} else if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RunIntegrityCheck: repo %s OK", repoPath)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -587,12 +626,28 @@ func (m *Manager) RunFullBackup(ctx context.Context) error {
|
||||
}
|
||||
defer m.releaseRunning()
|
||||
|
||||
if m.isDebug() {
|
||||
drives := m.activeDrives()
|
||||
driveStacks := m.groupStacksByDrive()
|
||||
totalStacks := 0
|
||||
for _, s := range driveStacks {
|
||||
totalStacks += len(s)
|
||||
}
|
||||
m.logger.Printf("[DEBUG] RunFullBackup: starting full backup — %d active drives, %d stacks", len(drives), totalStacks)
|
||||
}
|
||||
|
||||
// Step 1: DB dumps
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RunFullBackup: phase 1 — database dumps")
|
||||
}
|
||||
if err := m.runDBDumpsInternal(ctx); err != nil {
|
||||
m.logger.Printf("[WARN] DB dump had errors, continuing with backup anyway")
|
||||
}
|
||||
|
||||
// Step 2: Restic backup
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RunFullBackup: phase 2 — restic snapshots")
|
||||
}
|
||||
return m.runBackupInternal(ctx)
|
||||
}
|
||||
|
||||
@@ -737,7 +792,7 @@ func (m *Manager) GetStackHDDMounts(name string) []string {
|
||||
// DumpStackDB runs a database dump for containers belonging to a specific stack.
|
||||
// Dumps to the stack's home drive: <drive>/backups/primary/<stack>/db-dumps/.
|
||||
func (m *Manager) DumpStackDB(ctx context.Context, stackName string) error {
|
||||
dbs, err := DiscoverDatabases(ctx, m.logger)
|
||||
dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug())
|
||||
if err != nil {
|
||||
return fmt.Errorf("database discovery failed: %w", err)
|
||||
}
|
||||
@@ -762,7 +817,7 @@ func (m *Manager) DumpStackDB(ctx context.Context, stackName string) error {
|
||||
m.logger.Printf("[INFO] Running pre-backup DB dump for %s (%d database(s)) → %s", stackName, len(stackDBs), dumpDir)
|
||||
|
||||
for _, db := range stackDBs {
|
||||
result := DumpOne(ctx, db, dumpDir, m.logger)
|
||||
result := DumpOne(ctx, db, dumpDir, m.logger, m.isDebug())
|
||||
if result.Error != nil {
|
||||
return fmt.Errorf("DB dump failed for %s: %w", result.DB.ContainerName, result.Error)
|
||||
}
|
||||
@@ -1019,7 +1074,7 @@ func (m *Manager) RefreshCache(nextDBDump, nextBackup time.Time) {
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
if dbs, err := DiscoverDatabases(ctx, m.logger); err == nil {
|
||||
if dbs, err := DiscoverDatabases(ctx, m.logger, m.isDebug()); err == nil {
|
||||
status.DiscoveredDBs = dbs
|
||||
}
|
||||
|
||||
@@ -1172,6 +1227,11 @@ func (m *Manager) GetFullStatus(nextDBDump, nextBackup time.Time) *FullBackupSta
|
||||
}
|
||||
}
|
||||
|
||||
// isDebug returns true if logging level is "debug".
|
||||
func (m *Manager) isDebug() bool {
|
||||
return m.cfg.Logging.Level == "debug"
|
||||
}
|
||||
|
||||
func dbNames(dbs []DiscoveredDB) string {
|
||||
var names []string
|
||||
for _, db := range dbs {
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/util"
|
||||
)
|
||||
|
||||
// DBDumper can run a database dump for a specific stack.
|
||||
@@ -29,12 +30,13 @@ type CrossDriveRunner struct {
|
||||
stacksDir string // path to stacks dir (for infra backup)
|
||||
controllerYAMLPath string // path to controller.yaml (for infra backup)
|
||||
logger *log.Logger
|
||||
debug bool
|
||||
mu sync.Mutex
|
||||
running map[string]bool // per-app running state
|
||||
}
|
||||
|
||||
// NewCrossDriveRunner creates a new CrossDriveRunner.
|
||||
func NewCrossDriveRunner(sett *settings.Settings, provider StackDataProvider, systemDataPath, stacksDir string, logger *log.Logger) *CrossDriveRunner {
|
||||
func NewCrossDriveRunner(sett *settings.Settings, provider StackDataProvider, systemDataPath, stacksDir string, logger *log.Logger, debug bool) *CrossDriveRunner {
|
||||
return &CrossDriveRunner{
|
||||
sett: sett,
|
||||
stackProvider: provider,
|
||||
@@ -42,6 +44,7 @@ func NewCrossDriveRunner(sett *settings.Settings, provider StackDataProvider, sy
|
||||
stacksDir: stacksDir,
|
||||
controllerYAMLPath: "/opt/docker/felhom-controller/controller.yaml",
|
||||
logger: logger,
|
||||
debug: debug,
|
||||
running: make(map[string]bool),
|
||||
}
|
||||
}
|
||||
@@ -67,6 +70,11 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
return fmt.Errorf("cross-drive backup not configured or disabled for %s", stackName)
|
||||
}
|
||||
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAppBackup: starting for %s, dest=%s, schedule=%s, method=%s",
|
||||
stackName, cfg.DestinationPath, cfg.Schedule, cfg.Method)
|
||||
}
|
||||
|
||||
// Prevent concurrent runs for the same app
|
||||
r.mu.Lock()
|
||||
if r.running[stackName] {
|
||||
@@ -84,12 +92,18 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
// Check if source or destination drive is disconnected
|
||||
srcDrive := r.stackProvider.GetStackHDDPath(stackName)
|
||||
if srcDrive != "" && r.sett.IsDisconnected(srcDrive) {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAppBackup: source drive disconnected for %s: %s", stackName, srcDrive)
|
||||
}
|
||||
r.mu.Lock()
|
||||
r.running[stackName] = false
|
||||
r.mu.Unlock()
|
||||
return fmt.Errorf("source drive disconnected: %s", srcDrive)
|
||||
}
|
||||
if r.sett.IsDisconnected(cfg.DestinationPath) {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAppBackup: destination drive disconnected for %s: %s", stackName, cfg.DestinationPath)
|
||||
}
|
||||
r.mu.Lock()
|
||||
r.running[stackName] = false
|
||||
r.mu.Unlock()
|
||||
@@ -107,6 +121,9 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
|
||||
// Trigger fresh DB dump for this app before cross-drive backup
|
||||
if r.dbDumper != nil {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAppBackup: triggering pre-backup DB dump for %s", stackName)
|
||||
}
|
||||
if err := r.dbDumper.DumpStackDB(ctx, stackName); err != nil {
|
||||
r.logger.Printf("[WARN] Pre-backup DB dump failed for %s: %v — proceeding with user data backup", stackName, err)
|
||||
// Non-fatal: user data backup is still valuable without fresh dump
|
||||
@@ -120,6 +137,9 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
|
||||
// Resolve HDD mounts for this app (may be empty for config-only apps)
|
||||
mounts := r.stackProvider.GetStackHDDMounts(stackName)
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAppBackup: %s has %d HDD mount(s): %v", stackName, len(mounts), mounts)
|
||||
}
|
||||
|
||||
// Safety: destination must not overlap with any source
|
||||
for _, m := range mounts {
|
||||
@@ -145,6 +165,9 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
destDir := AppSecondaryRsyncPath(cfg.DestinationPath, stackName)
|
||||
if sz, err := dirSizeBytes(destDir); err == nil {
|
||||
sizeHuman = humanizeBytes(sz)
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAppBackup: %s backup size at destination: %s", stackName, sizeHuman)
|
||||
}
|
||||
}
|
||||
|
||||
r.logger.Printf("[INFO] Cross-drive backup completed: %s (%s)", stackName, duration.Round(time.Second))
|
||||
@@ -155,6 +178,10 @@ func (r *CrossDriveRunner) RunAppBackup(ctx context.Context, stackName string) e
|
||||
// RunAllScheduled runs cross-drive backups for all apps matching the schedule.
|
||||
// Runs sequentially (disk I/O bound).
|
||||
func (r *CrossDriveRunner) RunAllScheduled(ctx context.Context, schedule string) error {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: starting for schedule=%s", schedule)
|
||||
}
|
||||
|
||||
// Auto-enable Tier 2 for small apps (no HDD mounts) before running backups
|
||||
r.AutoEnableSmallApps()
|
||||
|
||||
@@ -163,18 +190,39 @@ func (r *CrossDriveRunner) RunAllScheduled(ctx context.Context, schedule string)
|
||||
|
||||
configs := r.sett.GetAllCrossDriveConfigs()
|
||||
if len(configs) == 0 {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: no cross-drive configs found")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: %d total cross-drive config(s) found", len(configs))
|
||||
}
|
||||
|
||||
var errs []string
|
||||
var scheduled, skippedDisabled, skippedWrongSchedule int
|
||||
for stackName, cfg := range configs {
|
||||
if !cfg.Enabled {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: skipping %s — disabled", stackName)
|
||||
}
|
||||
skippedDisabled++
|
||||
continue
|
||||
}
|
||||
if cfg.Schedule != schedule {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: skipping %s — schedule mismatch (has=%s, want=%s)", stackName, cfg.Schedule, schedule)
|
||||
}
|
||||
skippedWrongSchedule++
|
||||
continue
|
||||
}
|
||||
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: queuing %s for backup (dest=%s)", stackName, cfg.DestinationPath)
|
||||
}
|
||||
scheduled++
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
@@ -186,6 +234,11 @@ func (r *CrossDriveRunner) RunAllScheduled(ctx context.Context, schedule string)
|
||||
}
|
||||
}
|
||||
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] RunAllScheduled: done — %d scheduled, %d disabled, %d wrong schedule, %d errors",
|
||||
scheduled, skippedDisabled, skippedWrongSchedule, len(errs))
|
||||
}
|
||||
|
||||
if len(errs) > 0 {
|
||||
return fmt.Errorf("cross-drive backup errors: %s", strings.Join(errs, "; "))
|
||||
}
|
||||
@@ -216,6 +269,9 @@ func (r *CrossDriveRunner) AnyRunning() bool {
|
||||
// (≥10 GB free, <90% used) to protect OS stability; external drives just need
|
||||
// ≥100 MB. Non-mount-point destinations are allowed with a logged warning.
|
||||
func (r *CrossDriveRunner) ValidateDestination(path string) error {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] ValidateDestination: checking path=%s", path)
|
||||
}
|
||||
if path == "" {
|
||||
return fmt.Errorf("destination path is empty")
|
||||
}
|
||||
@@ -226,6 +282,9 @@ func (r *CrossDriveRunner) ValidateDestination(path string) error {
|
||||
return fmt.Errorf("destination %s does not exist", path)
|
||||
}
|
||||
onSystemDrive := !system.IsMountPoint(path)
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] ValidateDestination: path=%s, isMountPoint=%v", path, !onSystemDrive)
|
||||
}
|
||||
if onSystemDrive {
|
||||
r.logger.Printf("[WARN] Destination %s is not a separate mount point (system drive) — backup will proceed but data is not protected against drive failure", path)
|
||||
}
|
||||
@@ -237,6 +296,10 @@ func (r *CrossDriveRunner) ValidateDestination(path string) error {
|
||||
r.logger.Printf("[WARN] Cannot determine disk usage for %s — proceeding without space verification", path)
|
||||
return nil
|
||||
}
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] ValidateDestination: path=%s, availGB=%.1f, usedPct=%.0f%%, onSystemDrive=%v",
|
||||
path, di.AvailGB, di.UsedPercent, onSystemDrive)
|
||||
}
|
||||
if onSystemDrive {
|
||||
// System drive: protect OS stability — require ≥10 GB free and <90% used
|
||||
if di.AvailGB < 10 {
|
||||
@@ -251,6 +314,9 @@ func (r *CrossDriveRunner) ValidateDestination(path string) error {
|
||||
return fmt.Errorf("destination %s has insufficient free space (%.1f GB free)", path, di.AvailGB)
|
||||
}
|
||||
}
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] ValidateDestination: path=%s passed all checks", path)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -258,6 +324,9 @@ func (r *CrossDriveRunner) ValidateDestination(path string) error {
|
||||
|
||||
func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBase string, mounts []string) error {
|
||||
destDir := AppSecondaryRsyncPath(destBase, stackName)
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] runRsyncBackup: stack=%s, destBase=%s, destDir=%s, %d mount(s)", stackName, destBase, destDir, len(mounts))
|
||||
}
|
||||
if err := os.MkdirAll(destDir, 0755); err != nil {
|
||||
return fmt.Errorf("creating rsync dest dir: %w", err)
|
||||
}
|
||||
@@ -296,9 +365,16 @@ func (r *CrossDriveRunner) runRsyncBackup(ctx context.Context, stackName, destBa
|
||||
"--exclude", "backups/*.dump",
|
||||
src, dst)
|
||||
r.logger.Printf("[DEBUG] rsync: %s → %s", src, dst)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] runRsyncBackup: rsync failed for %s: %s", srcMount, util.TruncateStr(strings.TrimSpace(string(out)), 500))
|
||||
}
|
||||
return fmt.Errorf("rsync failed for %s: %v (%s)", srcMount, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] runRsyncBackup: rsync OK for mount %s → %s", src, dst)
|
||||
}
|
||||
}
|
||||
|
||||
// --- Copy DB dumps for this stack from its home drive ---
|
||||
@@ -423,20 +499,35 @@ func (r *CrossDriveRunner) syncInfraConfig(ctx context.Context) {
|
||||
func (r *CrossDriveRunner) AutoEnableSmallApps() {
|
||||
storagePaths := r.sett.GetStoragePaths()
|
||||
if len(storagePaths) < 2 {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] AutoEnableSmallApps: fewer than 2 storage paths (%d) — skipping", len(storagePaths))
|
||||
}
|
||||
return // no secondary drive available
|
||||
}
|
||||
|
||||
deployed := r.stackProvider.ListDeployedStacks()
|
||||
existingConfigs := r.sett.GetAllCrossDriveConfigs()
|
||||
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] AutoEnableSmallApps: %d deployed stacks, %d existing configs, %d storage paths",
|
||||
len(deployed), len(existingConfigs), len(storagePaths))
|
||||
}
|
||||
|
||||
var autoEnabled int
|
||||
for _, stack := range deployed {
|
||||
// Skip if already has cross-drive config (user has touched it)
|
||||
if _, exists := existingConfigs[stack.Name]; exists {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] AutoEnableSmallApps: skipping %s — already has cross-drive config", stack.Name)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip if app has HDD mounts (large user data — needs manual config)
|
||||
if mounts := r.stackProvider.GetStackHDDMounts(stack.Name); len(mounts) > 0 {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] AutoEnableSmallApps: skipping %s — has %d HDD mount(s)", stack.Name, len(mounts))
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -450,6 +541,9 @@ func (r *CrossDriveRunner) AutoEnableSmallApps() {
|
||||
}
|
||||
}
|
||||
if destPath == "" {
|
||||
if r.debug {
|
||||
r.logger.Printf("[DEBUG] AutoEnableSmallApps: skipping %s — no suitable destination found", stack.Name)
|
||||
}
|
||||
continue // no suitable destination found
|
||||
}
|
||||
|
||||
@@ -464,8 +558,13 @@ func (r *CrossDriveRunner) AutoEnableSmallApps() {
|
||||
r.logger.Printf("[WARN] Auto-enable Tier 2 failed for %s: %v", stack.Name, err)
|
||||
continue
|
||||
}
|
||||
autoEnabled++
|
||||
r.logger.Printf("[INFO] Auto-enabled Tier 2 backup for %s → %s (no HDD mounts, daily rsync)", stack.Name, destPath)
|
||||
}
|
||||
|
||||
if r.debug && autoEnabled > 0 {
|
||||
r.logger.Printf("[DEBUG] AutoEnableSmallApps: auto-enabled %d app(s)", autoEnabled)
|
||||
}
|
||||
}
|
||||
|
||||
// --- helpers ---
|
||||
|
||||
@@ -11,6 +11,8 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/util"
|
||||
)
|
||||
|
||||
// DBType represents a database engine type.
|
||||
@@ -61,14 +63,22 @@ type DumpFileInfo struct {
|
||||
}
|
||||
|
||||
// DiscoverDatabases finds running database containers via docker ps.
|
||||
func DiscoverDatabases(ctx context.Context, logger *log.Logger) ([]DiscoveredDB, error) {
|
||||
func DiscoverDatabases(ctx context.Context, logger *log.Logger, debug bool) ([]DiscoveredDB, error) {
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: running docker ps to find database containers")
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, "docker", "ps", "--format", "{{.ID}}\t{{.Names}}\t{{.Image}}", "--filter", "status=running")
|
||||
out, err := cmd.Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("docker ps failed: %w", err)
|
||||
}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: docker ps output: %s", util.TruncateStr(strings.TrimSpace(string(out)), 500))
|
||||
}
|
||||
|
||||
var dbs []DiscoveredDB
|
||||
var skipped int
|
||||
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
if line == "" {
|
||||
@@ -87,9 +97,17 @@ func DiscoverDatabases(ctx context.Context, logger *log.Logger) ([]DiscoveredDB,
|
||||
} else if strings.Contains(image, "mariadb") || strings.Contains(image, "mysql") {
|
||||
dbType = DBTypeMariaDB
|
||||
} else {
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: skipping container %s (image=%s, not a database)", name, image)
|
||||
}
|
||||
skipped++
|
||||
continue
|
||||
}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: found %s container: %s (id=%s)", dbType, name, id[:12])
|
||||
}
|
||||
|
||||
db := DiscoveredDB{
|
||||
ContainerID: id,
|
||||
ContainerName: name,
|
||||
@@ -100,33 +118,49 @@ func DiscoverDatabases(ctx context.Context, logger *log.Logger) ([]DiscoveredDB,
|
||||
// Get env vars from container
|
||||
if err := populateDBEnv(ctx, &db); err != nil {
|
||||
logger.Printf("[WARN] Could not read env vars for %s: %v", name, err)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: skipping %s — env read failed", name)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: %s → stack=%s, dbUser=%s, dbName=%s", name, db.StackName, db.DBUser, db.DBName)
|
||||
}
|
||||
|
||||
dbs = append(dbs, db)
|
||||
}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DiscoverDatabases: found %d database(s), skipped %d non-DB container(s)", len(dbs), skipped)
|
||||
}
|
||||
|
||||
return dbs, nil
|
||||
}
|
||||
|
||||
// DumpAll dumps all discovered databases.
|
||||
func DumpAll(ctx context.Context, dbs []DiscoveredDB, dumpDir string, logger *log.Logger) []DumpResult {
|
||||
func DumpAll(ctx context.Context, dbs []DiscoveredDB, dumpDir string, logger *log.Logger, debug bool) []DumpResult {
|
||||
// Clean up old .tmp files (older than 1 hour)
|
||||
cleanupTmpFiles(dumpDir, logger)
|
||||
|
||||
var results []DumpResult
|
||||
for _, db := range dbs {
|
||||
result := DumpOne(ctx, db, dumpDir, logger)
|
||||
result := DumpOne(ctx, db, dumpDir, logger, debug)
|
||||
results = append(results, result)
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
// DumpOne dumps a single database.
|
||||
func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.Logger) DumpResult {
|
||||
func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.Logger, debug bool) DumpResult {
|
||||
start := time.Now()
|
||||
result := DumpResult{DB: db}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: starting dump for container=%s, stack=%s, dbType=%s, dumpDir=%s",
|
||||
db.ContainerName, db.StackName, db.DBType, dumpDir)
|
||||
}
|
||||
|
||||
// Ensure dump directory exists
|
||||
if err := os.MkdirAll(dumpDir, 0755); err != nil {
|
||||
result.Error = fmt.Errorf("creating dump dir: %w", err)
|
||||
@@ -148,6 +182,9 @@ func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.L
|
||||
if err != nil || strings.TrimSpace(string(checkOut)) != "true" {
|
||||
result.Error = fmt.Errorf("container %s no longer running", db.ContainerName)
|
||||
result.Duration = time.Since(start)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: container %s is no longer running — skipping", db.ContainerName)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -158,14 +195,29 @@ func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.L
|
||||
cmd = exec.CommandContext(dumpCtx, "docker", "exec", db.ContainerID,
|
||||
"pg_dump", "-U", db.DBUser, "-d", db.DBName,
|
||||
"--clean", "--if-exists", "--no-owner", "--no-privileges")
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: pg_dump command: docker exec %s pg_dump -U %s -d %s --clean --if-exists --no-owner --no-privileges",
|
||||
db.ContainerID[:12], db.DBUser, db.DBName)
|
||||
}
|
||||
case DBTypeMariaDB:
|
||||
// Get root password from container env
|
||||
password := getMariaDBPassword(dumpCtx, db.ContainerID)
|
||||
if password == "" {
|
||||
result.Error = fmt.Errorf("could not determine MariaDB root password for %s", db.ContainerName)
|
||||
result.Duration = time.Since(start)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: MariaDB root password not found for %s — skipping", db.ContainerName)
|
||||
}
|
||||
return result
|
||||
}
|
||||
cmd = exec.CommandContext(dumpCtx, "docker", "exec", db.ContainerID,
|
||||
"mariadb-dump", "-u", "root", "-p***",
|
||||
"--single-transaction", "--routines", "--triggers", db.DBName)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: mariadb-dump command: docker exec %s mariadb-dump -u root -p*** --single-transaction --routines --triggers %s",
|
||||
db.ContainerID[:12], db.DBName)
|
||||
}
|
||||
// Actual command with real password (not logged)
|
||||
cmd = exec.CommandContext(dumpCtx, "docker", "exec", db.ContainerID,
|
||||
"mariadb-dump", "-u", "root", "-p"+password,
|
||||
"--single-transaction", "--routines", "--triggers", db.DBName)
|
||||
@@ -198,6 +250,9 @@ func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.L
|
||||
}
|
||||
result.Error = fmt.Errorf("dump failed: %v — %s", err, errMsg)
|
||||
result.Duration = time.Since(start)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: dump command failed for %s: %v", db.ContainerName, result.Error)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -207,6 +262,9 @@ func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.L
|
||||
os.Remove(tmpPath)
|
||||
result.Error = fmt.Errorf("dump produced empty file for %s", db.ContainerName)
|
||||
result.Duration = time.Since(start)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: dump produced empty file for %s", db.ContainerName)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -225,6 +283,12 @@ func DumpOne(ctx context.Context, db DiscoveredDB, dumpDir string, logger *log.L
|
||||
// Run validation on the dump file
|
||||
result.Validation = ValidateDump(finalPath, db.DBType)
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] DumpOne: completed %s → %s (size=%s, valid=%v, tables=%d, duration=%s)",
|
||||
db.ContainerName, filename, humanizeBytes(stat.Size()),
|
||||
result.Validation.Valid, result.Validation.TableCount, result.Duration.Round(time.Millisecond))
|
||||
}
|
||||
|
||||
logger.Printf("[INFO] DB dump: %s → %s (%s, %s, %d tables)", db.ContainerName, filename,
|
||||
humanizeBytes(stat.Size()), result.Duration.Round(time.Millisecond), result.Validation.TableCount)
|
||||
|
||||
|
||||
@@ -24,12 +24,16 @@ type InfraMetadata struct {
|
||||
|
||||
// WriteLocalInfraBackup writes the infra backup to .felhom-infra-backup/ on each drive.
|
||||
// Individual drive failures are logged but not returned — the function is best-effort.
|
||||
func WriteLocalInfraBackup(backupJSON []byte, customerID, controllerVersion, timestamp string, drives []string, logger *log.Logger) {
|
||||
func WriteLocalInfraBackup(backupJSON []byte, customerID, controllerVersion, timestamp string, drives []string, logger *log.Logger, debug bool) {
|
||||
if len(drives) == 0 {
|
||||
logger.Printf("[DEBUG] No drives configured for local infra backup")
|
||||
return
|
||||
}
|
||||
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] WriteLocalInfraBackup: payload size=%d bytes, %d target drive(s): %v", len(backupJSON), len(drives), drives)
|
||||
}
|
||||
|
||||
// Compute checksum of backup data
|
||||
hash := sha256.Sum256(backupJSON)
|
||||
checksum := hex.EncodeToString(hash[:])
|
||||
@@ -51,10 +55,16 @@ func WriteLocalInfraBackup(backupJSON []byte, customerID, controllerVersion, tim
|
||||
written := 0
|
||||
for _, drive := range drives {
|
||||
dir := InfraBackupDir(drive)
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] WriteLocalInfraBackup: writing to drive=%s, dir=%s", drive, dir)
|
||||
}
|
||||
if err := writeInfraToDir(dir, backupJSON, metaJSON); err != nil {
|
||||
logger.Printf("[WARN] Local infra backup: failed to write to %s: %v", drive, err)
|
||||
continue
|
||||
}
|
||||
if debug {
|
||||
logger.Printf("[DEBUG] WriteLocalInfraBackup: write OK to %s", drive)
|
||||
}
|
||||
written++
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ func TestWriteAndReadLocalInfraBackup(t *testing.T) {
|
||||
backupJSON := []byte(`{"customer_id":"test-123","domain":"test.hu","controller_version":"v0.21.0","timestamp":"2026-02-21T10:00:00Z"}`)
|
||||
logger := testLogger(t)
|
||||
|
||||
WriteLocalInfraBackup(backupJSON, "test-123", "v0.21.0", "2026-02-21T10:00:00Z", []string{drive}, logger)
|
||||
WriteLocalInfraBackup(backupJSON, "test-123", "v0.21.0", "2026-02-21T10:00:00Z", []string{drive}, logger, false)
|
||||
|
||||
// Verify files exist
|
||||
dir := InfraBackupDir(drive)
|
||||
@@ -124,7 +124,7 @@ func TestWriteLocalInfraBackup_MultipleDrives(t *testing.T) {
|
||||
backupJSON := []byte(`{"test":"multi"}`)
|
||||
logger := testLogger(t)
|
||||
|
||||
WriteLocalInfraBackup(backupJSON, "multi-test", "v1.0", "2026-01-01T00:00:00Z", drives, logger)
|
||||
WriteLocalInfraBackup(backupJSON, "multi-test", "v1.0", "2026-01-01T00:00:00Z", drives, logger, false)
|
||||
|
||||
// All 3 should succeed
|
||||
for _, d := range drives {
|
||||
@@ -142,7 +142,7 @@ func TestWriteLocalInfraBackup_MultipleDrives(t *testing.T) {
|
||||
func TestWriteLocalInfraBackup_NoDrives(t *testing.T) {
|
||||
logger := testLogger(t)
|
||||
// Should not panic
|
||||
WriteLocalInfraBackup([]byte(`{}`), "test", "v1.0", "2026-01-01T00:00:00Z", nil, logger)
|
||||
WriteLocalInfraBackup([]byte(`{}`), "test", "v1.0", "2026-01-01T00:00:00Z", nil, logger, false)
|
||||
}
|
||||
|
||||
func contains(s, substr string) bool {
|
||||
|
||||
@@ -21,6 +21,10 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
return fmt.Errorf("invalid snapshot ID: must be 8-64 lowercase hex characters")
|
||||
}
|
||||
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: stack=%s, snapshotID=%s", stackName, snapshotID)
|
||||
}
|
||||
|
||||
// Prevent concurrent operations
|
||||
m.mu.Lock()
|
||||
if m.running {
|
||||
@@ -39,6 +43,10 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
hddMounts := m.stackProvider.GetStackHDDMounts(stackName)
|
||||
hasHDD := len(hddMounts) > 0
|
||||
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: %s has %d HDD mount(s), hasHDD=%v", stackName, len(hddMounts), hasHDD)
|
||||
}
|
||||
|
||||
// Build list of paths to restore from the snapshot
|
||||
var restorePaths []string
|
||||
|
||||
@@ -47,16 +55,25 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
if ok {
|
||||
stackDir := filepath.Dir(composePath)
|
||||
restorePaths = append(restorePaths, stackDir)
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: will restore config dir: %s", stackDir)
|
||||
}
|
||||
}
|
||||
|
||||
// Restore DB dump files for this stack (per-drive path)
|
||||
drivePath := m.GetAppDrivePath(stackName)
|
||||
dumpDir := AppDBDumpPath(drivePath, stackName)
|
||||
restorePaths = append(restorePaths, dumpDir)
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: will restore DB dump dir: %s", dumpDir)
|
||||
}
|
||||
|
||||
// Restore HDD data (always included for apps that have it — backup is mandatory)
|
||||
if hasHDD {
|
||||
restorePaths = append(restorePaths, hddMounts...)
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: will restore HDD data: %v", hddMounts)
|
||||
}
|
||||
}
|
||||
|
||||
if len(restorePaths) == 0 {
|
||||
@@ -66,17 +83,30 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
// Use the app's primary restic repo
|
||||
repoPath := PrimaryResticRepoPath(drivePath)
|
||||
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: using repo=%s, %d restore path(s)", repoPath, len(restorePaths))
|
||||
}
|
||||
|
||||
m.logger.Printf("[WARN] RESTORE starting: stack=%s, snapshot=%s, repo=%s, paths=%v, hasHDD=%v",
|
||||
stackName, snapshotID, repoPath, restorePaths, hasHDD)
|
||||
|
||||
// Stop the app before restore
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: step 1/4 — stopping app %s", stackName)
|
||||
}
|
||||
if err := m.stackProvider.StopStack(stackName); err != nil {
|
||||
m.logger.Printf("[WARN] RESTORE could not stop %s: %v (proceeding anyway)", stackName, err)
|
||||
}
|
||||
|
||||
// Execute restore via restic
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: step 2/4 — restoring data from snapshot %s", snapshotID)
|
||||
}
|
||||
if err := m.restic.RestoreAppData(repoPath, snapshotID, restorePaths); err != nil {
|
||||
m.logger.Printf("[ERROR] RESTORE failed for %s: %v", stackName, err)
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: step 3/4 — restarting app %s after failure", stackName)
|
||||
}
|
||||
if startErr := m.stackProvider.StartStack(stackName); startErr != nil {
|
||||
m.logger.Printf("[WARN] RESTORE could not restart %s after failure: %v", stackName, startErr)
|
||||
}
|
||||
@@ -84,6 +114,9 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
}
|
||||
|
||||
// Restart the app
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: step 3/4 — restarting app %s after successful restore", stackName)
|
||||
}
|
||||
if err := m.stackProvider.StartStack(stackName); err != nil {
|
||||
m.logger.Printf("[WARN] RESTORE could not restart %s after restore: %v", stackName, err)
|
||||
}
|
||||
@@ -92,6 +125,9 @@ func (m *Manager) RestoreApp(stackName, snapshotID string) error {
|
||||
if hasHDD {
|
||||
restoreType = "full (config+DB+userdata)"
|
||||
}
|
||||
if m.isDebug() {
|
||||
m.logger.Printf("[DEBUG] RestoreApp: step 4/4 — restore completed, type=%s", restoreType)
|
||||
}
|
||||
m.logger.Printf("[INFO] RESTORE completed: stack=%s, snapshot=%s, type=%s", stackName, snapshotID, restoreType)
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user