v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test
- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate, monitor, notify, report, assets, setup), gated behind logging.level: "debug"
- Add /api/debug/dump endpoint returning the full controller state as JSON (debug only)
- Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub, restic repos, metrics DB) with a pass/warn/fail summary
- New packages: internal/selftest, internal/util
- Constructor/signature changes: debug bool params, logger params on RunHealthCheck and BuildReport, smart watchdog probe logging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,7 @@ import (
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/recovery"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/selftest"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/selfupdate"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/setup"
|
||||
@@ -52,6 +53,8 @@ func main() {
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
startTime := time.Now()
|
||||
|
||||
// --- Load configuration ---
|
||||
// Use LoadPermissive to tolerate minimal configs (e.g. only domain set by docker-setup.sh).
|
||||
// If even that fails (file missing/unreadable), fall back to defaults.
|
||||
@@ -164,7 +167,7 @@ func main() {
|
||||
}
|
||||
|
||||
// --- Initialize cross-drive backup runner ---
|
||||
crossDriveRunner := backup.NewCrossDriveRunner(sett, stackProv, cfg.Paths.SystemDataPath, cfg.Paths.StacksDir, logger)
|
||||
crossDriveRunner := backup.NewCrossDriveRunner(sett, stackProv, cfg.Paths.SystemDataPath, cfg.Paths.StacksDir, logger, cfg.Logging.Level == "debug")
|
||||
|
||||
// Wire cross-drive → backup manager for pre-backup DB dumps
|
||||
if backupMgr != nil {
|
||||
@@ -175,13 +178,13 @@ func main() {
|
||||
alertMgr := web.NewAlertManager(logger)
|
||||
|
||||
// --- Initialize notifier ---
|
||||
notifier := notify.New(cfg.Hub.URL, cfg.Hub.APIKey, cfg.Customer.ID, sett, logger)
|
||||
notifier := notify.New(cfg.Hub.URL, cfg.Hub.APIKey, cfg.Customer.ID, sett, logger, cfg.Logging.Level == "debug")
|
||||
|
||||
// --- Initialize self-updater ---
|
||||
var updater *selfupdate.Updater
|
||||
if cfg.SelfUpdate.Enabled {
|
||||
composePath := filepath.Join(filepath.Dir(cfg.Paths.DataDir), "docker-compose.yml")
|
||||
updater = selfupdate.NewUpdater(&cfg.SelfUpdate, &cfg.Git, Version, cfg.Paths.DataDir, composePath, logger)
|
||||
updater = selfupdate.NewUpdater(&cfg.SelfUpdate, &cfg.Git, Version, cfg.Paths.DataDir, composePath, logger, cfg.Logging.Level == "debug")
|
||||
updater.SetBackupRunningCheck(func() bool {
|
||||
return backupMgr != nil && backupMgr.IsRunning()
|
||||
})
|
||||
@@ -216,7 +219,7 @@ func main() {
|
||||
healthInterval = 5 * time.Minute
|
||||
}
|
||||
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||||
body := healthReport.FormatMessage()
|
||||
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
||||
if healthReport.Status == "fail" {
|
||||
@@ -243,7 +246,7 @@ func main() {
|
||||
// --- Central hub pusher (declared early so backup closure can reference it) ---
|
||||
var hubPusher *report.Pusher
|
||||
if cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
|
||||
hubPusher = report.NewPusher(&cfg.Hub, logger)
|
||||
hubPusher = report.NewPusher(&cfg.Hub, logger, cfg.Logging.Level == "debug")
|
||||
// Wire hub verification: update settings when hub reports customer status
|
||||
hubPusher.OnPushResponse = func(resp *report.PushResponse) {
|
||||
if resp.CustomerBlocked {
|
||||
@@ -347,7 +350,7 @@ func main() {
|
||||
pushInterval = 15 * time.Minute
|
||||
}
|
||||
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
|
||||
if err := hubPusher.Push(r); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -399,7 +402,7 @@ func main() {
|
||||
// --- Storage watchdog ---
|
||||
storageWatchdog := monitor.NewStorageWatchdog(sett, &watchdogStackAdapter{mgr: stackMgr}, notifier, cfg, logger)
|
||||
storageWatchdog.SetAlertRefresh(func() {
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||||
updateAvailable := false
|
||||
latestVersion := ""
|
||||
if updater != nil {
|
||||
@@ -413,7 +416,7 @@ func main() {
|
||||
})
|
||||
if hubPusher != nil {
|
||||
storageWatchdog.SetHubReportPusher(func() {
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
|
||||
hubPusher.Push(r)
|
||||
})
|
||||
}
|
||||
@@ -430,7 +433,7 @@ func main() {
|
||||
var assetsSyncer *assets.Syncer
|
||||
if cfg.Hub.Enabled && cfg.Assets.SyncEnabled && cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
|
||||
assetsDir := filepath.Join(cfg.Paths.DataDir, "assets")
|
||||
assetsSyncer = assets.New(cfg.Hub.URL, cfg.Hub.APIKey, assetsDir, "/usr/share/felhom/assets", logger)
|
||||
assetsSyncer = assets.New(cfg.Hub.URL, cfg.Hub.APIKey, assetsDir, "/usr/share/felhom/assets", logger, cfg.Logging.Level == "debug")
|
||||
go func() {
|
||||
time.Sleep(10 * time.Second)
|
||||
if err := assetsSyncer.Sync(ctx); err != nil {
|
||||
@@ -443,6 +446,9 @@ func main() {
|
||||
logger.Printf("[INFO] Asset sync enabled (daily at %s from Hub)", cfg.Assets.SyncSchedule)
|
||||
}
|
||||
|
||||
// --- Startup self-test ---
|
||||
selfTestResult := selftest.Run(cfg, sett, logger)
|
||||
|
||||
sched.Start(ctx)
|
||||
defer sched.Stop()
|
||||
|
||||
@@ -467,14 +473,18 @@ func main() {
|
||||
time.Sleep(5 * time.Second) // Let all subsystems fully initialize
|
||||
|
||||
// Push controller startup event to Hub
|
||||
notifier.NotifyControllerStarted(Version)
|
||||
notifier.NotifyControllerStarted(Version, map[string]interface{}{
|
||||
"selftest_pass": selfTestResult.Pass,
|
||||
"selftest_warn": selfTestResult.Warn,
|
||||
"selftest_fail": selfTestResult.Fail,
|
||||
})
|
||||
|
||||
// Heartbeat ping
|
||||
pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "startup")
|
||||
logger.Println("[INFO] Startup heartbeat ping sent")
|
||||
|
||||
// System health ping
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||||
body := healthReport.FormatMessage()
|
||||
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
||||
if healthReport.Status == "fail" {
|
||||
@@ -487,7 +497,7 @@ func main() {
|
||||
// Hub report
|
||||
if hubPusher != nil {
|
||||
if cfg.Hub.Enabled {
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
|
||||
var pushErr error
|
||||
for attempt := 1; attempt <= 3; attempt++ {
|
||||
pushErr = hubPusher.Push(r)
|
||||
@@ -559,7 +569,7 @@ func main() {
|
||||
|
||||
// Initial alert refresh (so alerts appear immediately, not after first 5min health check)
|
||||
go func() {
|
||||
report := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
|
||||
report := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||||
alertMgr.Refresh(report, cfg, backupMgr, false, "")
|
||||
}()
|
||||
|
||||
@@ -573,6 +583,7 @@ func main() {
|
||||
if assetsSyncer != nil {
|
||||
apiRouter.SetAssetsSyncer(assetsSyncer)
|
||||
}
|
||||
apiRouter.SetDebugDumpDeps(sched, hubPusher, alertMgr, Version, startTime)
|
||||
|
||||
// --- Initialize web server ---
|
||||
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, crossDriveRunner, sched, sett, alertMgr, notifier, updater, logger, Version)
|
||||
@@ -603,7 +614,7 @@ func main() {
|
||||
driveMigrator.BackupTrigger = backupMgr
|
||||
}
|
||||
driveMigrator.AlertRefresh = func() {
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths())
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||||
updateAvailable := false
|
||||
latestVersion := ""
|
||||
if updater != nil {
|
||||
@@ -617,7 +628,7 @@ func main() {
|
||||
}
|
||||
if hubPusher != nil {
|
||||
driveMigrator.PushHubReport = func() {
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths())
|
||||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), logger)
|
||||
hubPusher.Push(r)
|
||||
}
|
||||
driveMigrator.PushInfraBackup = func() {
|
||||
@@ -1025,7 +1036,7 @@ func writeLocalInfraBackup(cfg *config.Config, sett *settings.Settings,
|
||||
return
|
||||
}
|
||||
|
||||
backup.WriteLocalInfraBackup(data, cfg.Customer.ID, Version, ib.Timestamp, drives, logger)
|
||||
backup.WriteLocalInfraBackup(data, cfg.Customer.ID, Version, ib.Timestamp, drives, logger, cfg.Logging.Level == "debug")
|
||||
}
|
||||
|
||||
// discoverHDDPaths scans deployed apps' app.yaml for HDD_PATH env values.
|
||||
|
||||
Reference in New Issue
Block a user