v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test

- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate,
  monitor, notify, report, assets, setup) gated behind logging.level: "debug"
- Add /api/debug/dump endpoint returning full controller state JSON (debug only)
- Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub,
  restic repos, metrics DB) with pass/warn/fail summary
- New packages: internal/selftest, internal/util
- Constructor/signature changes: debug bool params, logger params on
  RunHealthCheck and BuildReport, smart watchdog probe logging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 18:32:26 +01:00
parent 6f02536243
commit be7803c0ac
30 changed files with 1281 additions and 67 deletions
+27 -16
View File
@@ -25,6 +25,7 @@ import (
"gitea.dooplex.hu/admin/felhom-controller/internal/recovery"
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
"gitea.dooplex.hu/admin/felhom-controller/internal/selftest"
"gitea.dooplex.hu/admin/felhom-controller/internal/selfupdate"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
"gitea.dooplex.hu/admin/felhom-controller/internal/setup"
@@ -52,6 +53,8 @@ func main() {
os.Exit(0)
}
startTime := time.Now()
// --- Load configuration ---
// Use LoadPermissive to tolerate minimal configs (e.g. only domain set by docker-setup.sh).
// If even that fails (file missing/unreadable), fall back to defaults.
@@ -164,7 +167,7 @@ func main() {
}
// --- Initialize cross-drive backup runner ---
crossDriveRunner := backup.NewCrossDriveRunner(sett, stackProv, cfg.Paths.SystemDataPath, cfg.Paths.StacksDir, logger)
crossDriveRunner := backup.NewCrossDriveRunner(sett, stackProv, cfg.Paths.SystemDataPath, cfg.Paths.StacksDir, logger, cfg.Logging.Level == "debug")
// Wire cross-drive → backup manager for pre-backup DB dumps
if backupMgr != nil {
@@ -175,13 +178,13 @@ func main() {
alertMgr := web.NewAlertManager(logger)
// --- Initialize notifier ---
notifier := notify.New(cfg.Hub.URL, cfg.Hub.APIKey, cfg.Customer.ID, sett, logger)
notifier := notify.New(cfg.Hub.URL, cfg.Hub.APIKey, cfg.Customer.ID, sett, logger, cfg.Logging.Level == "debug")
// --- Initialize self-updater ---
var updater *selfupdate.Updater
if cfg.SelfUpdate.Enabled {
composePath := filepath.Join(filepath.Dir(cfg.Paths.DataDir), "docker-compose.yml")
updater = selfupdate.NewUpdater(&cfg.SelfUpdate, &cfg.Git, Version, cfg.Paths.DataDir, composePath, logger)
updater = selfupdate.NewUpdater(&cfg.SelfUpdate, &cfg.Git, Version, cfg.Paths.DataDir, composePath, logger, cfg.Logging.Level == "debug")
updater.SetBackupRunningCheck(func() bool {
return backupMgr != nil && backupMgr.IsRunning()
})
@@ -216,7 +219,7 @@ func main() {
healthInterval = 5 * time.Minute
}
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
body := healthReport.FormatMessage()
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
if healthReport.Status == "fail" {
@@ -243,7 +246,7 @@ func main() {
// --- Central hub pusher (declared early so backup closure can reference it) ---
var hubPusher *report.Pusher
if cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
hubPusher = report.NewPusher(&cfg.Hub, logger)
hubPusher = report.NewPusher(&cfg.Hub, logger, cfg.Logging.Level == "debug")
// Wire hub verification: update settings when hub reports customer status
hubPusher.OnPushResponse = func(resp *report.PushResponse) {
if resp.CustomerBlocked {
@@ -347,7 +350,7 @@ func main() {
pushInterval = 15 * time.Minute
}
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
if err := hubPusher.Push(r); err != nil {
return err
}
@@ -399,7 +402,7 @@ func main() {
// --- Storage watchdog ---
storageWatchdog := monitor.NewStorageWatchdog(sett, &watchdogStackAdapter{mgr: stackMgr}, notifier, cfg, logger)
storageWatchdog.SetAlertRefresh(func() {
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
updateAvailable := false
latestVersion := ""
if updater != nil {
@@ -413,7 +416,7 @@ func main() {
})
if hubPusher != nil {
storageWatchdog.SetHubReportPusher(func() {
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
hubPusher.Push(r)
})
}
@@ -430,7 +433,7 @@ func main() {
var assetsSyncer *assets.Syncer
if cfg.Hub.Enabled && cfg.Assets.SyncEnabled && cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
assetsDir := filepath.Join(cfg.Paths.DataDir, "assets")
assetsSyncer = assets.New(cfg.Hub.URL, cfg.Hub.APIKey, assetsDir, "/usr/share/felhom/assets", logger)
assetsSyncer = assets.New(cfg.Hub.URL, cfg.Hub.APIKey, assetsDir, "/usr/share/felhom/assets", logger, cfg.Logging.Level == "debug")
go func() {
time.Sleep(10 * time.Second)
if err := assetsSyncer.Sync(ctx); err != nil {
@@ -443,6 +446,9 @@ func main() {
logger.Printf("[INFO] Asset sync enabled (daily at %s from Hub)", cfg.Assets.SyncSchedule)
}
// --- Startup self-test ---
selfTestResult := selftest.Run(cfg, sett, logger)
sched.Start(ctx)
defer sched.Stop()
@@ -467,14 +473,18 @@ func main() {
time.Sleep(5 * time.Second) // Let all subsystems fully initialize
// Push controller startup event to Hub
notifier.NotifyControllerStarted(Version)
notifier.NotifyControllerStarted(Version, map[string]interface{}{
"selftest_pass": selfTestResult.Pass,
"selftest_warn": selfTestResult.Warn,
"selftest_fail": selfTestResult.Fail,
})
// Heartbeat ping
pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "startup")
logger.Println("[INFO] Startup heartbeat ping sent")
// System health ping
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
body := healthReport.FormatMessage()
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
if healthReport.Status == "fail" {
@@ -487,7 +497,7 @@ func main() {
// Hub report
if hubPusher != nil {
if cfg.Hub.Enabled {
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
var pushErr error
for attempt := 1; attempt <= 3; attempt++ {
pushErr = hubPusher.Push(r)
@@ -559,7 +569,7 @@ func main() {
// Initial alert refresh (so alerts appear immediately, not after first 5min health check)
go func() {
report := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
report := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
alertMgr.Refresh(report, cfg, backupMgr, false, "")
}()
@@ -573,6 +583,7 @@ func main() {
if assetsSyncer != nil {
apiRouter.SetAssetsSyncer(assetsSyncer)
}
apiRouter.SetDebugDumpDeps(sched, hubPusher, alertMgr, Version, startTime)
// --- Initialize web server ---
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, crossDriveRunner, sched, sett, alertMgr, notifier, updater, logger, Version)
@@ -603,7 +614,7 @@ func main() {
driveMigrator.BackupTrigger = backupMgr
}
driveMigrator.AlertRefresh = func() {
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
updateAvailable := false
latestVersion := ""
if updater != nil {
@@ -617,7 +628,7 @@ func main() {
}
if hubPusher != nil {
driveMigrator.PushHubReport = func() {
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
hubPusher.Push(r)
}
driveMigrator.PushInfraBackup = func() {
@@ -1025,7 +1036,7 @@ func writeLocalInfraBackup(cfg *config.Config, sett *settings.Settings,
return
}
backup.WriteLocalInfraBackup(data, cfg.Customer.ID, Version, ib.Timestamp, drives, logger)
backup.WriteLocalInfraBackup(data, cfg.Customer.ID, Version, ib.Timestamp, drives, logger, cfg.Logging.Level == "debug")
}
// discoverHDDPaths scans deployed apps' app.yaml for HDD_PATH env values.