Files
felhom-controller/controller/cmd/controller/main.go
T
admin 7863e62f29 v0.54.0: Phase 2b — restore-from-recovery-unit + fail-closed data-key gate
Restore recreates an app from its on-drive unit + the guest's own secrets,
regenerating nothing. reconcileRestoreSecrets (pure, unit-tested) merges the unit's
non-secret env with secrets recovered from the live app.yaml and FAILS CLOSED if a
data-encrypting key is unrecoverable (refuse — a PBS whole-guest restore is needed —
rather than regenerate and corrupt). Resettable secrets missing → warn + proceed.

- backup: RestoreFromRecoveryUnit (manifest -> recover secrets -> gate -> restore
  volumes -> recreate definition + redeploy w/ re-pull); falls back to volume-only.
- seams: RecoverStackSecrets/RecreateStackFromUnit (adapter +encKey),
  stacks.RedeployFromEnv. Wired into /backup/restore.
- tests: gate (refuse/proceed/verbatim) + data_key parsing.

Gate + reconcile + data_key parsing unit-tested; capture live-validated (v0.53.1).
Full readable-data e2e vs AdventureLog needs the auth-gated dashboard restore — pending.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-13 11:12:43 +02:00

1339 lines
47 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package main
import (
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
"syscall"
"time"
"crypto/subtle"
"strings"
"gitea.dooplex.hu/admin/felhom-controller/internal/agentapi"
"gitea.dooplex.hu/admin/felhom-controller/internal/api"
"gitea.dooplex.hu/admin/felhom-controller/internal/appexport"
"gitea.dooplex.hu/admin/felhom-controller/internal/assets"
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
"gitea.dooplex.hu/admin/felhom-controller/internal/bootstrap"
cf "gitea.dooplex.hu/admin/felhom-controller/internal/cloudflare"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/crypto"
"gitea.dooplex.hu/admin/felhom-controller/internal/integrations"
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
"gitea.dooplex.hu/admin/felhom-controller/internal/quiesce"
"gitea.dooplex.hu/admin/felhom-controller/internal/recovery"
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
"gitea.dooplex.hu/admin/felhom-controller/internal/selftest"
"gitea.dooplex.hu/admin/felhom-controller/internal/selfupdate"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
"gitea.dooplex.hu/admin/felhom-controller/internal/setup"
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
"gitea.dooplex.hu/admin/felhom-controller/internal/web"
)
var (
// Build metadata. The defaults below are placeholders for builds that do not
// override them.
// Set at build time via ldflags
// Version is the controller version; it is printed by -version and passed to
// the subsystems that report or compare versions.
Version = "dev"
// BuildTime is the build timestamp shown by -version.
BuildTime = "unknown"
// GitCommit is the source commit shown by -version.
GitCommit = "unknown"
)
// main is the felhom-controller entry point. It parses flags, loads the
// configuration (optionally ingesting a first-boot bootstrap), and either runs
// the setup wizard or wires up the full controller: settings, encryption key,
// stack manager, catalog sync, quiesce loop, metrics, backup, notifier,
// self-updater, scheduler jobs, hub reporting, the API router and the web
// server. It then serves HTTP until SIGINT/SIGTERM and waits for the graceful
// shutdown to finish before returning.
func main() {
	configPath := flag.String("config", "/opt/docker/felhom-controller/controller.yaml", "Path to configuration file")
	showVersion := flag.Bool("version", false, "Show version and exit")
	flag.Parse()
	if *showVersion {
		fmt.Printf("felhom-controller %s (built %s, commit %s)\n", Version, BuildTime, GitCommit)
		os.Exit(0)
	}
	startTime := time.Now()

	// --- Load configuration ---
	// Use LoadPermissive to tolerate minimal configs (e.g. only domain set by docker-setup.sh).
	// If even that fails (file missing/unreadable), fall back to defaults.
	cfg, err := config.LoadPermissive(*configPath)
	if err != nil {
		cfg = config.Default()
		log.Printf("[WARN] Config load failed (%s), using defaults: %v", *configPath, err)
	}
	logger, logBuffer := setupLogger(cfg)

	// --- Bootstrap ingestion (slice 8A → v0.40.0 onboarding, doc 03 §6) ---
	// On first run, if this controller is not yet configured AND the host agent's provisioning
	// back-half attached a bootstrap.json config mount, PULL the full controller.yaml from the hub
	// (using the bootstrap's retrieval passphrase), merge in the per-guest local_api block, and come
	// up CONFIGURED — skipping setup mode. Idempotent (never clobbers an existing controller.yaml)
	// and fail-safe (a malformed/absent bootstrap, or a hub outage at first boot, leaves us in setup
	// mode). The adapter marks a transient hub-unreachable error as retryable (the rest are permanent).
	pull := func(hubURL, customerID, retrievalPassword string) (string, error) {
		y, perr := report.PullConfig(hubURL, customerID, retrievalPassword)
		if perr != nil && errors.Is(perr, report.ErrHubUnreachable) {
			return "", fmt.Errorf("%w: %w", bootstrap.ErrPullTransient, perr)
		}
		return y, perr
	}
	cfg = bootstrap.MaybeIngest(*configPath, cfg, logger, pull)

	// --- Wire system package debug logging ---
	if cfg.Logging.Level == "debug" {
		system.DebugLogger = logger
	}

	// --- Setup mode: if no customer ID configured, run setup wizard ---
	if setup.NeedsSetup(cfg) {
		logger.Printf("[INFO] felhom-controller %s — setup mode", Version)
		runSetupMode(cfg, logger)
		return
	}

	// --- Local API connectivity probe (slice 8A) ---
	// When seeded with a local-API endpoint, prove the controller↔agent channel at startup and
	// learn this guest's mounts (placement view). Non-fatal — the controller runs regardless; a
	// failure is logged for diagnosis. The full /backup/due quiesce loop lands in 8B.
	probeLocalAPI(cfg, logger)
	logger.Printf("[INFO] felhom-controller %s starting (customer: %s, domain: %s)",
		Version, cfg.Customer.ID, cfg.Customer.Domain)

	// --- Load settings ---
	// Built with filepath.Join, like encKeyPath below.
	settingsPath := filepath.Join(cfg.Paths.DataDir, "settings.json")
	sett, err := settings.Load(settingsPath, logger)
	if err != nil {
		logger.Fatalf("[FATAL] Failed to load settings from %s: %v", settingsPath, err)
	}
	sett.SetDebug(cfg.Logging.Level == "debug")

	// --- Auto-discover storage paths from deployed apps ---
	discoveredPaths := discoverHDDPaths(cfg.Paths.StacksDir, logger)
	sett.AutoDiscoverStoragePaths(discoveredPaths, cfg.Paths.HDDPath, logger)

	// --- Load or create encryption key ---
	encKeyPath := filepath.Join(cfg.Paths.DataDir, "encryption.key")
	encKey, err := crypto.LoadOrCreateKey(encKeyPath)
	if err != nil {
		logger.Fatalf("[FATAL] Failed to load encryption key: %v", err)
	}
	logger.Printf("[INFO] Encryption key loaded from %s", encKeyPath)

	// --- Initialize stack manager ---
	stackMgr, err := stacks.NewManager(cfg, logger)
	if err != nil {
		logger.Fatalf("[FATAL] Failed to initialize stack manager: %v", err)
	}
	stackMgr.SetEncryptionKey(encKey)
	// Initial stack scan
	if err := stackMgr.ScanStacks(); err != nil {
		logger.Printf("[WARN] Initial stack scan failed: %v", err)
	}
	// Inject missing deploy fields for all deployed stacks on startup
	if names := stackMgr.DeployedStackNames(); len(names) > 0 {
		stackMgr.InjectMissingFields(names)
	}
	// Migrate existing plaintext passwords to encrypted
	stackMgr.MigrateEncryption()

	// --- First-boot base-infrastructure bring-up ---
	// We are guaranteed configured here (setup.NeedsSetup returned false above), so deploy the base
	// stack (traefik-public network → traefik → cloudflared → filebrowser) the controller needs for
	// routing + external access. Runs in a goroutine so a slow first-boot image pull never delays the
	// web server; non-fatal (idempotent + single-flight, the health loop re-attempts each tick).
	go func() {
		if err := stackMgr.EnsureBaseStack(); err != nil {
			logger.Printf("[WARN] [infra] first-boot base-stack bring-up: %v", err)
		}
	}()

	// --- Initialize catalog syncer ---
	syncer := catalogsync.New(cfg, logger, stackMgr.ScanStacks, func(updated []string) {
		stackMgr.InjectMissingFields(updated)
	})
	syncer.Start()
	defer syncer.Stop()

	// --- Graceful shutdown context ---
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// --- Quiesce loop (slice 8B): app-consistent backup around the agent vzdump ---
	// Runs only when the local API is configured (a provisioned guest) and quiesce is enabled.
	// Recover FIRST (restart any stacks left stopped by a crash mid-quiesce), then start the loop.
	quiesceLoop := startQuiesceLoop(ctx, cfg, stackMgr, logger)

	// --- Start CPU collector ---
	cpuCollector := system.NewCPUCollector(5 * time.Second)
	cpuCollector.Start(ctx)
	defer cpuCollector.Stop()

	// --- Initialize metrics store + collector ---
	metricsDBPath := "/opt/docker/felhom-controller/data/metrics.db"
	metricsStore, err := metrics.NewMetricsStore(metricsDBPath, logger)
	if err != nil {
		logger.Printf("[WARN] Failed to initialize metrics store: %v — monitoring disabled", err)
	} else {
		logger.Printf("[INFO] Metrics store opened at %s", metricsDBPath)
	}
	if metricsStore != nil {
		defer metricsStore.Close()
		metricsHDDPath := cfg.Paths.HDDPath
		if p := sett.GetDefaultStoragePath(); p != "" {
			metricsHDDPath = p
		}
		metricsCollector := metrics.NewMetricsCollector(metricsStore, cpuCollector, metricsHDDPath, logger)
		metricsCollector.Start(ctx)
		defer metricsCollector.Stop()
		logger.Println("[INFO] Metrics collector started (60s interval)")
	}

	// Deprecation notice for ping UUIDs (Healthchecks pinging retired — the Hub
	// now owns monitoring; disk-tier backup moved to the host agent in slice 8C).
	uuids := cfg.Monitoring.PingUUIDs
	if uuids.Heartbeat != "" || uuids.SystemHealth != "" || uuids.DBDump != "" || uuids.Backup != "" || uuids.BackupIntegrity != "" {
		logger.Println("[INFO] Healthchecks ping UUIDs configured but no longer used — monitoring is now handled by the Hub")
	}

	// --- Initialize backup manager (app-data only: DB dumps + Docker-volume tars) ---
	var backupMgr *backup.Manager
	stackProv := &stackAdapter{
		mgr:             stackMgr,
		getStoragePaths: func() []settings.StoragePath { return sett.GetStoragePaths() },
		encKey:          encKey,
	}
	if cfg.Backup.Enabled {
		backupMgr = backup.NewManager(cfg, sett, logger)
		backupMgr.SetStackProvider(stackProv)
		backupMgr.SetVersion(Version)
	}

	// --- Initialize alert manager ---
	alertMgr := web.NewAlertManager(logger)

	// --- Initialize notifier ---
	notifier := notify.New(cfg.Hub.URL, cfg.Hub.APIKey, cfg.Customer.ID, sett, logger, cfg.Logging.Level == "debug")

	// --- Initialize self-updater ---
	var updater *selfupdate.Updater
	if cfg.SelfUpdate.Enabled {
		composePath := filepath.Join(filepath.Dir(cfg.Paths.DataDir), "docker-compose.yml")
		updater = selfupdate.NewUpdater(&cfg.SelfUpdate, &cfg.Git, Version, cfg.Paths.DataDir, composePath, logger, cfg.Logging.Level == "debug")
		updater.SetBackupRunningCheck(func() bool {
			return backupMgr != nil && backupMgr.IsRunning()
		})
		// Check for post-update state (did a previous update succeed or fail?)
		if state := updater.VerifyStartup(); state != nil {
			notifier.NotifyControllerUpdated(state.PreviousVersion, state.TargetVersion, state.Status == "success")
		}
		logger.Printf("[INFO] Self-update enabled (check every %s, auto-update: %v, auto-update time: %s)",
			cfg.SelfUpdate.CheckInterval, cfg.SelfUpdate.AutoUpdate, cfg.SelfUpdate.AutoUpdateTime)
	}

	// --- Initialize scheduler ---
	sched := scheduler.New(logger)
	sched.SetDebug(cfg.Logging.Level == "debug")
	// Existing periodic tasks (migrated from ad-hoc goroutines)
	sched.Every("status-refresh", 30*time.Second, func(ctx context.Context) error {
		return stackMgr.RefreshStatus()
	})
	sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
		return stackMgr.ScanStacks()
	})
	sched.Every("health-probes", 10*time.Second, func(ctx context.Context) error {
		return stackMgr.RunHealthProbes()
	})

	// System health check — refreshes dashboard alerts and notifies on changes.
	// Healthchecks.io pinging has been retired (the Hub now owns monitoring).
	healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
	if err != nil {
		healthInterval = 5 * time.Minute
	}
	sched.Every("system-health", healthInterval, func(ctx context.Context) error {
		healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
		// Self-heal the base stack: call unconditionally every tick. EnsureBaseStack is single-flight
		// + idempotent (skips running stacks ⇒ a cheap 3× docker-inspect no-op when healthy), so there
		// is no need to couple to the health-report issue strings. Runs in a goroutine — never blocks
		// or fails the health job.
		go func() {
			if err := stackMgr.EnsureBaseStack(); err != nil {
				logger.Printf("[WARN] [infra] self-heal base-stack bring-up: %v", err)
			}
		}()
		// Refresh dashboard alerts from health report
		updateAvailable := false
		latestVersion := ""
		if updater != nil {
			status := updater.GetStatus()
			if status.LastCheck != nil {
				updateAvailable = status.LastCheck.UpdateAvailable
				latestVersion = status.LastCheck.LatestVersion
			}
		}
		alertMgr.Refresh(healthReport, cfg, backupMgr, updateAvailable, latestVersion, sett.GetStoragePaths())
		// Notify on health status changes
		notifier.NotifyHealthChange(healthReport.Status, healthReport.Issues, healthReport.Warnings)
		return nil
	})

	// --- Central hub pusher (declared early so backup closure can reference it) ---
	var hubPusher *report.Pusher
	if cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
		hubPusher = report.NewPusher(&cfg.Hub, logger, cfg.Logging.Level == "debug")
		// Wire hub verification: update settings when hub reports customer status
		hubPusher.OnPushResponse = func(resp *report.PushResponse) {
			if resp.CustomerBlocked {
				sett.SetHubVerified(false, time.Now())
				logger.Printf("[WARN] Customer blocked on Hub — new deployments may be restricted")
			} else {
				sett.SetHubVerified(true, time.Now())
			}
		}
		// Wire hub push status into alert manager for dashboard alerts
		alertMgr.SetHubPushStatus(func() web.HubPushStatusData {
			s := hubPusher.GetStatus()
			return web.HubPushStatusData{
				LastAttempt: s.LastAttempt,
				LastSuccess: s.LastSuccess,
				LastError:   s.LastError,
				Consecutive: s.Consecutive,
			}
		})
	}

	// Backup daily jobs
	if cfg.Backup.Enabled && backupMgr != nil {
		// App-data backup: daily database dumps. Disk-tier (restic snapshots,
		// cross-drive, integrity check, infra backup) has moved to the host agent.
		sched.Daily("db-dump", cfg.Backup.DBDumpSchedule, func(ctx context.Context) error {
			err := backupMgr.RunDBDumps(ctx)
			if err != nil {
				notifier.NotifyDBDumpFailed("Adatbázis mentés sikertelen", err.Error())
			} else {
				notifier.NotifyDBDumpCompleted(notify.DBDumpDetails{})
			}
			return err
		})
		// Cache refresh: every 5 minutes
		sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
			nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
			backupMgr.RefreshCache(nextDBDump)
			return nil
		})
	}

	// Metrics prune — daily at 04:00
	if metricsStore != nil {
		sched.Daily("metrics-prune", "04:00", func(ctx context.Context) error {
			deleted, err := metricsStore.Prune(30 * 24 * time.Hour)
			if err != nil {
				return err
			}
			logger.Printf("[INFO] Pruned %d old metric rows", deleted)
			return nil
		})
	}

	// --- Central hub reporting schedule ---
	if hubPusher != nil {
		if cfg.Hub.Enabled {
			pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval)
			if err != nil {
				pushInterval = 15 * time.Minute
			}
			sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
				r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), sett.GetGeoRestriction(), logger)
				if err := hubPusher.Push(r); err != nil {
					return err
				}
				// Drain pending events (e.g., DR recovery completed) after successful push
				if events := sett.DrainPendingEvents(); len(events) > 0 {
					for _, ev := range events {
						notifier.Notify(ev.EventType, ev.Severity, ev.Message, ev.Details)
					}
					logger.Printf("[INFO] Drained %d pending events to Hub", len(events))
				}
				return nil
			})
			logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL)
		} else {
			logger.Printf("[INFO] Hub reporting disabled — will send disabled notification to %s", cfg.Hub.URL)
		}
	}

	// Self-update scheduler jobs
	if cfg.SelfUpdate.Enabled && updater != nil {
		// Periodic version check (populates UI, never triggers update)
		checkInterval, ciErr := time.ParseDuration(cfg.SelfUpdate.CheckInterval)
		if ciErr != nil {
			checkInterval = 6 * time.Hour
		}
		sched.Every("selfupdate-check", checkInterval, func(ctx context.Context) error {
			result := updater.CheckForUpdate()
			if result.UpdateAvailable {
				logger.Printf("[INFO] Update available: %s -> %s", result.CurrentVersion, result.LatestVersion)
			}
			return nil
		})
		// Auto-update (daily, fires after typical backup completion)
		if cfg.SelfUpdate.AutoUpdate {
			sched.Daily("selfupdate-auto", cfg.SelfUpdate.AutoUpdateTime, func(ctx context.Context) error {
				result := updater.CheckForUpdate()
				if !result.UpdateAvailable {
					return nil
				}
				if err := updater.TriggerUpdate("auto"); err != nil {
					logger.Printf("[WARN] Auto-update skipped: %v", err)
				}
				return nil
			})
		}
	}

	// Storage watchdog (disk disconnect/reconnect detection) has moved to the host
	// agent (slice 8C) — the controller no longer owns disk-level monitoring.

	// --- Asset syncer (download from Hub) ---
	var assetsSyncer *assets.Syncer
	if cfg.Hub.Enabled && cfg.Assets.SyncEnabled && cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
		assetsDir := filepath.Join(cfg.Paths.DataDir, "assets")
		assetsSyncer = assets.New(cfg.Hub.URL, cfg.Hub.APIKey, assetsDir, "/usr/share/felhom/assets", logger, cfg.Logging.Level == "debug")
		go func() {
			time.Sleep(10 * time.Second)
			if err := assetsSyncer.Sync(ctx); err != nil {
				logger.Printf("[WARN] Initial asset sync failed: %v", err)
			}
		}()
		sched.Daily("asset-sync", cfg.Assets.SyncSchedule, func(ctx context.Context) error {
			return assetsSyncer.Sync(ctx)
		})
		logger.Printf("[INFO] Asset sync enabled (daily at %s from Hub)", cfg.Assets.SyncSchedule)
	}

	// --- Startup self-test ---
	selfTestResult := selftest.Run(cfg, sett, logger)
	sched.Start(ctx)
	defer sched.Stop()

	// Generate recovery info file if retrieval password is set
	if rp := sett.GetRetrievalPassword(); rp != "" {
		go func() {
			info := recovery.Info{
				CustomerID:        cfg.Customer.ID,
				RetrievalPassword: rp,
				HubURL:            cfg.Hub.URL,
				SupportEmail:      "support@felhom.eu",
				SupportURL:        "https://felhom.eu/kapcsolat",
			}
			if err := recovery.GenerateRecoveryFile(info, Version, cfg.Paths.DataDir); err != nil {
				logger.Printf("[WARN] Failed to generate recovery-info.txt: %v", err)
			}
		}()
	}

	// Fire the startup event + hub report immediately (don't wait for first scheduler tick)
	go func() {
		time.Sleep(5 * time.Second) // Let all subsystems fully initialize
		// Push controller startup event to Hub
		notifier.NotifyControllerStarted(Version, map[string]interface{}{
			"selftest_pass": selfTestResult.Pass,
			"selftest_warn": selfTestResult.Warn,
			"selftest_fail": selfTestResult.Fail,
		})
		// Hub report
		if hubPusher != nil {
			if cfg.Hub.Enabled {
				r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), sett.GetGeoRestriction(), logger)
				var pushErr error
				for attempt := 1; attempt <= 3; attempt++ {
					pushErr = hubPusher.Push(r)
					if pushErr == nil {
						logger.Println("[INFO] Startup hub report sent")
						break
					}
					logger.Printf("[WARN] Startup hub report attempt %d/3 failed: %v", attempt, pushErr)
					if attempt < 3 {
						time.Sleep(15 * time.Second)
					}
				}
				if pushErr != nil {
					logger.Printf("[WARN] Startup hub report failed after 3 attempts — next scheduled push in %s", cfg.Hub.PushInterval)
				}
			} else {
				// Send a minimal "disabled" notification so hub knows reporting is intentionally off
				r := &report.Report{
					Version:           1,
					CustomerID:        cfg.Customer.ID,
					CustomerName:      cfg.Customer.Name,
					ControllerVersion: Version,
					Timestamp:         time.Now().UTC(),
					ReportingDisabled: true,
					Health:            report.HealthReport{Status: "disabled", Issues: []string{}, Warnings: []string{}},
					Stacks:            report.StacksReport{Deployed: []string{}, Available: []string{}},
					Containers:        report.ContainerReport{List: []report.ContainerDetailReport{}},
				}
				hubPusher.PushOnce(r)
			}
		}
		// Initial self-update check (so settings page shows version info quickly)
		if updater != nil {
			time.Sleep(25 * time.Second) // Additional delay after hub report
			result := updater.CheckForUpdate()
			if result.UpdateAvailable {
				logger.Printf("[INFO] Startup: update available %s -> %s", result.CurrentVersion, result.LatestVersion)
			} else if result.Error != "" {
				logger.Printf("[DEBUG] Startup version check: %s", result.Error)
			}
		}
	}()

	// Initial backup cache population (don't block startup)
	if cfg.Backup.Enabled && backupMgr != nil {
		go func() {
			nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
			backupMgr.RefreshCache(nextDBDump)
		}()
	}

	// Sync notification preferences to hub on startup (handles hub DB rebuild recovery)
	if notifier.IsEnabled() {
		go func() {
			prefs := sett.GetNotificationPrefs()
			if prefs.Email != "" {
				if err := notifier.SyncPreferences(prefs.Email, prefs.EnabledEvents, prefs.CooldownHours); err != nil {
					logger.Printf("[WARN] Failed to sync notification preferences on startup: %v", err)
				}
			}
		}()
	}

	// Initial alert refresh (so alerts appear immediately, not after first 5min health check).
	// Passes the same storage paths as the periodic "system-health" refresh, and uses a local
	// name that does not shadow the imported report package.
	go func() {
		initialReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
		alertMgr.Refresh(initialReport, cfg, backupMgr, false, "", sett.GetStoragePaths())
	}()

	// --- Initialize API router ---
	apiRouter := api.NewRouter(cfg, *configPath, sett, stackMgr, syncer, cpuCollector, backupMgr, metricsStore, updater, notifier, logger)
	if hubPusher != nil {
		apiRouter.OnConfigApplied = func() {
			// Infra backup push is now the host agent's responsibility.
			// NOTE(review): this hook is currently a no-op — no Hub report refresh is
			// triggered here after a config apply; confirm whether one is intended.
		}
	}
	if assetsSyncer != nil {
		apiRouter.SetAssetsSyncer(assetsSyncer)
	}

	// --- Initialize Cloudflare geo-restriction ---
	var geoSync *cf.GeoSyncManager
	if cfg.Infrastructure.CFAPIToken != "" {
		cfClient := cf.New(cfg.Infrastructure.CFAPIToken, logger, cfg.Logging.Level == "debug")
		geoStacks := &geoStackAdapter{mgr: stackMgr, domain: cfg.Customer.Domain}
		geoSync = cf.NewGeoSyncManager(cfClient, sett, cfg.Customer.Domain, geoStacks, logger)
		geoSync.SetDebug(cfg.Logging.Level == "debug")
		apiRouter.SetGeoSync(geoSync)
		// Re-sync geo rules when apps are deployed/removed
		apiRouter.OnGeoRelevantChange = func() {
			geo := sett.GetGeoRestriction()
			if geo != nil && geo.Enabled {
				if err := geoSync.Sync(context.Background()); err != nil {
					logger.Printf("[WARN] Geo sync after app change failed: %v", err)
				}
			}
		}
		// Periodic verification every 6 hours
		sched.Every("geo-verify", 6*time.Hour, func(ctx context.Context) error {
			geo := sett.GetGeoRestriction()
			if geo == nil || !geo.Enabled {
				return nil
			}
			return geoSync.Sync(ctx)
		})
		// Initial sync (delayed, non-blocking)
		go func() {
			time.Sleep(15 * time.Second)
			if geo := sett.GetGeoRestriction(); geo != nil && geo.Enabled {
				if err := geoSync.Sync(context.Background()); err != nil {
					logger.Printf("[WARN] Initial geo sync failed: %v", err)
				}
			}
		}()
		logger.Printf("[INFO] Geo-restriction support enabled (CF API token configured)")
	}

	// --- Initialize integration manager ---
	integrationStacks := &integrationStackAdapter{mgr: stackMgr}
	integrationMgr := integrations.NewManager(sett, integrationStacks, cfg.Customer.Domain, cfg.Paths.StacksDir, encKey, logger)
	integrationMgr.SetDebug(cfg.Logging.Level == "debug")
	apiRouter.SetIntegrationManager(integrationMgr)

	// --- Initialize app exporter ---
	exportProv := &exportAdapter{mgr: stackMgr, encKey: encKey}
	appExporter := appexport.NewExporter(exportProv, logger, Version)
	appExporter.SetDebug(cfg.Logging.Level == "debug")
	apiRouter.SetDebug(cfg.Logging.Level == "debug")

	// --- Initialize web server ---
	webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, sched, sett, alertMgr, notifier, updater, logger, Version)
	webServer.SetEncryptionKey(encKey)
	webServer.SetAppExporter(appExporter)
	webServer.SetIntegrationManager(integrationMgr)
	if quiesceLoop != nil {
		webServer.SetBackupTrigger(quiesceLoop) // "Mentés most" → app-consistent backup via the quiesce loop
	}
	if assetsSyncer != nil {
		webServer.SetAssetsSyncer(assetsSyncer)
	}
	if hubPusher != nil {
		webServer.SetHubPushStatus(func() web.HubPushStatusData {
			s := hubPusher.GetStatus()
			return web.HubPushStatusData{
				LastAttempt: s.LastAttempt,
				LastSuccess: s.LastSuccess,
				LastError:   s.LastError,
				Consecutive: s.Consecutive,
			}
		})
	}
	if logBuffer != nil {
		webServer.SetLogBuffer(logBuffer)
	}
	webServer.SetStartTime(startTime)

	// Wire debug callbacks (only in debug mode)
	if cfg.Logging.Level == "debug" {
		dc := &web.DebugCallbacks{}
		if hubPusher != nil {
			dc.TriggerHubReportPush = func() error {
				r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), sett.GetGeoRestriction(), logger)
				return hubPusher.Push(r)
			}
		}
		dc.HubConnectivityTest = func() (int, int64, error) {
			start := time.Now()
			client := &http.Client{Timeout: 10 * time.Second}
			resp, err := client.Get(cfg.Hub.URL + "/healthz")
			latency := time.Since(start).Milliseconds()
			if err != nil {
				return 0, latency, err
			}
			resp.Body.Close()
			return resp.StatusCode, latency, nil
		}
		if cfg.Git.RepoURL != "" {
			dc.GiteaConnectivityTest = func() (int, int64, error) {
				start := time.Now()
				client := &http.Client{Timeout: 5 * time.Second}
				resp, err := client.Head(cfg.Git.RepoURL)
				latency := time.Since(start).Milliseconds()
				if err != nil {
					return 0, latency, err
				}
				resp.Body.Close()
				return resp.StatusCode, latency, nil
			}
		}
		dc.GetTelemetryPreview = func() ([]report.AppTelemetry, error) {
			return report.BuildAppTelemetryForDebug(stackMgr, metricsStore, logger), nil
		}
		webServer.SetDebugCallbacks(dc)
	}

	// Drive migration (full-drive move) has moved to the host agent (slice 8C);
	// the controller no longer runs a DriveMigrator.

	// --- Build HTTP mux ---
	mux := http.NewServeMux()
	// API routes (no auth for health endpoint, auth for everything else)
	mux.HandleFunc("/api/health", apiRouter.HealthHandler)
	// Disk management API — thin proxy to the host agent (slice 8C). The agent owns
	// disk execution; the controller forwards list/assign/eject/format.
	mux.Handle("/api/disks", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeDiskAPI))))
	mux.Handle("/api/disks/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeDiskAPI))))
	// Guided storage provisioning (init/attach/eject orchestration over the agent disk API + registry).
	mux.Handle("/api/storage/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeStorageAPI))))
	// Whole-guest (appliance) backup visibility + manual trigger. Distinct prefix from apiRouter's
	// app-data /api/backup/{run,status} (DB dumps) to avoid shadowing the /api/ catch-all subtree.
	mux.Handle("/api/guest-backup/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeBackupAPI))))
	// Host metrics API — thin proxy to the host agent (slice 9). Read-only host-wide health +
	// per-storage capacity for the monitoring view; the de-privileged controller can't read the
	// host itself. GET only, so no CSRF wrapper needed.
	mux.Handle("/api/host-metrics", webServer.RequireAuth(http.HandlerFunc(webServer.ServeHostMetricsAPI)))
	// App export/import API routes handled by web server
	mux.Handle("/api/export/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeExportAPI))))
	// Debug API routes handled by web server (debug-mode gating inside handler)
	mux.Handle("/api/debug/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeDebugAPI))))
	// Self-update API — accepts session auth OR hub API key (for external triggering)
	// CsrfProtect exempts Bearer-token requests automatically.
	mux.Handle("/api/selfupdate/", selfUpdateAuthMiddleware(cfg, webServer, webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
	// Config API — accepts session auth OR hub API key (for Hub config push)
	mux.Handle("/api/config/", selfUpdateAuthMiddleware(cfg, webServer, webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
	// Geo API — accepts session auth OR hub API key (for Hub geo-disable)
	mux.Handle("/api/geo/", selfUpdateAuthMiddleware(cfg, webServer, webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
	mux.Handle("/api/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
	// Web UI routes (auth required)
	mux.Handle("/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeHTTP))))

	// --- Start HTTP server ---
	server := &http.Server{
		Addr:         cfg.Web.Listen,
		Handler:      webServer.CatchAllMiddleware(mux),
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  120 * time.Second,
	}
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	// shutdownDone is closed once server.Shutdown has returned. ListenAndServe returns
	// ErrServerClosed as soon as Shutdown is *called*, so main must wait on this channel;
	// otherwise the process would exit while in-flight requests are still being drained.
	shutdownDone := make(chan struct{})
	go func() {
		defer close(shutdownDone)
		sig := <-sigCh
		logger.Printf("[INFO] Received signal %v, shutting down...", sig)
		cancel()
		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second)
		defer shutdownCancel()
		if err := server.Shutdown(shutdownCtx); err != nil {
			logger.Printf("[ERROR] HTTP server shutdown error: %v", err)
		}
	}()
	logger.Printf("[INFO] Web UI listening on %s", cfg.Web.Listen)
	if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
		logger.Fatalf("[FATAL] HTTP server error: %v", err)
	}
	// ErrServerClosed is only returned after the signal goroutine called Shutdown, so this
	// wait is bounded by the 15s shutdown timeout above.
	<-shutdownDone
	logger.Println("[INFO] felhom-controller stopped")
}
// selfUpdateAuthMiddleware wraps next so that a request is admitted either by
// a valid "Authorization: Bearer <hub API key>" header (external callers such
// as the hub or build scripts) or, failing that, by the web server's normal
// session authentication.
//
// The bearer path is only taken when both the presented token and the
// configured hub API key are non-empty and compare equal in constant time.
// Any other request — including one with a wrong bearer token — is handed to
// webServer.RequireAuth.
func selfUpdateAuthMiddleware(cfg *config.Config, webServer *web.Server, next http.Handler) http.Handler {
	const bearerPrefix = "Bearer "
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		header := r.Header.Get("Authorization")
		if strings.HasPrefix(header, bearerPrefix) {
			presented := header[len(bearerPrefix):]
			expected := cfg.Hub.APIKey
			keyMatches := presented != "" && expected != "" &&
				subtle.ConstantTimeCompare([]byte(presented), []byte(expected)) == 1
			if keyMatches {
				next.ServeHTTP(w, r)
				return
			}
		}
		// No (valid) bearer token: require a logged-in session instead.
		webServer.RequireAuth(next).ServeHTTP(w, r)
	})
}
// setupLogger builds the process logger from the configured log level.
//
// Outside debug mode it returns a stdout logger with standard timestamp flags
// and a nil buffer. In debug mode the logger additionally records the short
// file name and line, and every line is also written to a web.LogBuffer
// (created with size argument 1000) that is returned to the caller.
func setupLogger(cfg *config.Config) (*log.Logger, *web.LogBuffer) {
	if cfg.Logging.Level != "debug" {
		return log.New(os.Stdout, "", log.LstdFlags), nil
	}
	buf := web.NewLogBuffer(1000)
	sink := io.MultiWriter(os.Stdout, buf)
	return log.New(sink, "", log.LstdFlags|log.Lshortfile), buf
}
// stackAdapter implements backup.StackDataProvider using stacks.Manager.
// It answers the backup package's questions about stacks (compose path,
// deployed list, volumes, HDD mounts) by delegating to the stack manager and
// to the per-stack app.yaml.
type stackAdapter struct {
// mgr is the stack manager all lookups and start/stop calls are delegated to.
mgr *stacks.Manager
// getStoragePaths returns the currently registered storage paths; it is called
// on each use rather than cached.
getStoragePaths func() []settings.StoragePath
encKey []byte // for decrypting live app.yaml secrets during restore-from-unit
}
// GetStackComposePath returns the compose file path of the named stack.
// The boolean is false (and the path empty) when the stack manager does not
// know the stack.
func (a *stackAdapter) GetStackComposePath(name string) (string, bool) {
	if stack, found := a.mgr.GetStack(name); found {
		return stack.ComposePath, true
	}
	return "", false
}
// ListDeployedStacks returns a backup.StackSummary for every stack the manager
// reports as deployed, in the order the manager lists them. HasVolumes is true
// when the stack's compose file declares at least one named volume. The result
// is nil when no stack is deployed.
func (a *stackAdapter) ListDeployedStacks() []backup.StackSummary {
	var deployed []backup.StackSummary
	for _, st := range a.mgr.GetStacks() {
		if st.Deployed {
			namedVolumes := backup.ParseComposeNamedVolumes(st.ComposePath)
			deployed = append(deployed, backup.StackSummary{
				Name:        st.Name,
				DisplayName: st.Meta.DisplayName,
				ComposePath: st.ComposePath,
				NeedsHDD:    st.Meta.Resources.NeedsHDD,
				HasVolumes:  len(namedVolumes) > 0,
			})
		}
	}
	return deployed
}
// StopStack stops the named stack by delegating to the stack manager and
// returns the manager's error unchanged.
func (a *stackAdapter) StopStack(name string) error {
return a.mgr.StopStack(name)
}
// StartStack starts the named stack by delegating to the stack manager and
// returns the manager's error unchanged.
func (a *stackAdapter) StartStack(name string) error {
return a.mgr.StartStack(name)
}
// GetStackHDDMounts returns the HDD mounts parsed from the named stack's
// compose file, or nil when the stack is unknown.
//
// If the stack's app.yaml sets a non-empty HDD_PATH, only that path is used to
// resolve mounts. Otherwise every registered storage path is tried in turn and
// the mounts found are returned de-duplicated, in first-seen order.
func (a *stackAdapter) GetStackHDDMounts(name string) []string {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return nil
	}
	composePath := stack.ComposePath

	// Preferred source: the app's own HDD_PATH from its app.yaml.
	if appCfg := stacks.LoadAppConfig(filepath.Dir(composePath)); appCfg != nil {
		if hddPath := appCfg.Env["HDD_PATH"]; hddPath != "" {
			return stacks.ParseComposeHDDMounts(composePath, hddPath)
		}
	}

	// Fallback: probe every registered storage path and merge the results.
	var mounts []string
	known := map[string]struct{}{}
	for _, sp := range a.getStoragePaths() {
		for _, m := range stacks.ParseComposeHDDMounts(composePath, sp.Path) {
			if _, dup := known[m]; dup {
				continue
			}
			known[m] = struct{}{}
			mounts = append(mounts, m)
		}
	}
	return mounts
}
// GetDockerVolumes returns the result of backup.ResolveDockerVolumeNames for the
// named stack's compose file, or nil when the stack is unknown.
func (a *stackAdapter) GetDockerVolumes(name string) []string {
	if stack, found := a.mgr.GetStack(name); found {
		return backup.ResolveDockerVolumeNames(stack.ComposePath)
	}
	return nil
}
// GetStackHDDPath returns the cleaned HDD_PATH from the named stack's app.yaml.
// It returns "" when the stack is unknown, has no app config, or has no HDD_PATH set.
func (a *stackAdapter) GetStackHDDPath(name string) string {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return ""
	}
	appCfg := stacks.LoadAppConfig(filepath.Dir(stack.ComposePath))
	if appCfg == nil {
		return ""
	}
	hddPath := appCfg.Env["HDD_PATH"]
	if hddPath == "" {
		return ""
	}
	return filepath.Clean(hddPath)
}
// GetStackRecoveryInfo collects the SECRET-FREE inputs for an app's recovery unit (Phase 2):
// the stack dir, the pinned image tags, the non-secret env, and the NAMES of the
// secret/data-key env vars.
//
// No secret value is decrypted or returned. Secret/password fields are stored encrypted in
// app.yaml, so dropping every named secret — plus, defensively, any value that
// crypto.IsEncrypted recognises — leaves a plaintext, secret-free env. The secret values
// themselves are recovered at restore time from the guest's own app.yaml (live, or via the
// PBS whole-guest snapshot), never from the unit.
//
// The boolean is false when the stack is unknown.
func (a *stackAdapter) GetStackRecoveryInfo(name string) (backup.RecoveryInfo, bool) {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return backup.RecoveryInfo{}, false
	}
	stackDir := filepath.Dir(stack.ComposePath)
	meta := stacks.LoadMetadata(stackDir)

	// Secret set = all secret/password fields plus any data_key fields, de-duplicated
	// and kept in deterministic metadata order.
	secretSet := make(map[string]bool)
	var secretNames []string
	remember := func(envVar string) {
		if secretSet[envVar] {
			return
		}
		secretSet[envVar] = true
		secretNames = append(secretNames, envVar)
	}
	for _, envVar := range stacks.SensitiveEnvVars(&meta) {
		remember(envVar)
	}
	dataKeys := meta.DataKeyEnvVars()
	for _, envVar := range dataKeys {
		remember(envVar)
	}

	// Non-secret env: raw app.yaml values that are neither named-secret nor (defensively) encrypted.
	nonSecret := make(map[string]string)
	if appCfg := stacks.LoadAppConfig(stackDir); appCfg != nil {
		for key, val := range appCfg.Env {
			if !secretSet[key] && !crypto.IsEncrypted(val) {
				nonSecret[key] = val
			}
		}
	}

	info := backup.RecoveryInfo{
		StackDir:       stackDir,
		DisplayName:    stack.Meta.DisplayName,
		ImagePins:      backup.ParseComposeImages(stack.ComposePath),
		NonSecretEnv:   nonSecret,
		SecretEnvVars:  secretNames,
		DataKeyEnvVars: dataKeys,
	}
	return info, true
}
// RecoverStackSecrets returns the live decrypted values of the requested secret env vars,
// read from the stack's app.yaml (the guest's own — live rootfs or PBS-restored).
// Names that are absent or empty in app.yaml are left out of the result; the caller's
// fail-closed gate decides what to do about them. Secrets always come from the guest,
// never from the recovery unit. Returns nil when the stack is unknown or its decrypted
// app config cannot be loaded.
func (a *stackAdapter) RecoverStackSecrets(name string, names []string) map[string]string {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return nil
	}
	appCfg := stacks.LoadAppConfigDecrypted(filepath.Dir(stack.ComposePath), a.encKey)
	if appCfg == nil {
		return nil
	}
	recovered := make(map[string]string)
	for _, envVar := range names {
		if val := appCfg.Env[envVar]; val != "" {
			recovered[envVar] = val
		}
	}
	return recovered
}
// RecreateStackFromUnit restores the app definition from the unit's compose dir into the
// stack dir, then redeploys with the reconstructed full env (re-pulling the pinned image).
// Secrets in fullEnv were recovered from the guest, never regenerated.
//
// Only docker-compose.yml and .felhom.yml are copied. A file that does not exist in
// composeSrcDir is skipped, because the unit captured whichever of the two existed. Any
// other read error (permissions, I/O) aborts the restore instead of silently redeploying
// with a stale definition.
//
// It returns an error when the stack is unknown, when a unit file cannot be read or
// written, or when the manager's RedeployFromEnv fails.
func (a *stackAdapter) RecreateStackFromUnit(name, composeSrcDir string, fullEnv map[string]string) error {
	s, ok := a.mgr.GetStack(name)
	if !ok {
		return fmt.Errorf("stack %q not found", name)
	}
	stackDir := filepath.Dir(s.ComposePath)
	// Recover the app definition from the unit (compose + .felhom.yml) into the stack dir.
	for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
		data, err := os.ReadFile(filepath.Join(composeSrcDir, fname))
		if errors.Is(err, os.ErrNotExist) {
			continue // not captured in the unit — nothing to restore for this file
		}
		if err != nil {
			return fmt.Errorf("reading %s from unit: %w", fname, err)
		}
		if err := os.WriteFile(filepath.Join(stackDir, fname), data, 0644); err != nil {
			return fmt.Errorf("restoring %s from unit: %w", fname, err)
		}
	}
	return a.mgr.RedeployFromEnv(name, fullEnv)
}
// RefreshAndIsRunning forces a docker ps scan and then reports whether the named
// stack is in stacks.StateRunning.
// It is called during the post-restore health check (~every 5s for up to 90s); a full
// refresh is acceptable there because restores are rare operations.
func (a *stackAdapter) RefreshAndIsRunning(name string) bool {
	// The refresh result is intentionally not inspected; the state check below
	// uses whatever the manager holds afterwards.
	_ = a.mgr.RefreshStatus()
	stack, found := a.mgr.GetStack(name)
	if !found {
		return false
	}
	return stack.State == stacks.StateRunning
}
// integrationStackAdapter implements integrations.StackProvider using stacks.Manager.
// Each of its methods forwards directly to the manager method of the same name.
type integrationStackAdapter struct {
mgr *stacks.Manager // target of every forwarded call
}
// GetStack forwards to the stack manager's GetStack and returns its result unchanged.
func (a *integrationStackAdapter) GetStack(name string) (*stacks.Stack, bool) {
return a.mgr.GetStack(name)
}
// GetStacks forwards to the stack manager's GetStacks and returns its result unchanged.
func (a *integrationStackAdapter) GetStacks() []stacks.Stack {
return a.mgr.GetStacks()
}
// RestartStack forwards to the stack manager's RestartStack for the named stack and
// returns its error unchanged.
func (a *integrationStackAdapter) RestartStack(name string) error {
return a.mgr.RestartStack(name)
}
// geoStackAdapter implements cloudflare.StackLister for geo-restriction sync.
type geoStackAdapter struct {
mgr *stacks.Manager // source of the stack list and per-stack app config
domain string // appended to each subdomain as "<subdomain>.<domain>" by GetDeployedHostnames
}
// GetDeployedHostnames maps each deployed stack's name to its hostname,
// "<subdomain>.<domain>". The subdomain comes from the stack metadata unless the
// stack's app.yaml sets a non-empty SUBDOMAIN, which takes precedence. Stacks that
// end up with no subdomain are omitted. The returned map is never nil.
func (a *geoStackAdapter) GetDeployedHostnames() map[string]string {
	hostnames := make(map[string]string)
	for _, st := range a.mgr.GetStacks() {
		if !st.Deployed {
			continue
		}
		sub := st.Meta.Subdomain
		// A custom subdomain in app.yaml overrides the metadata default.
		if appCfg := a.mgr.LoadAppConfigByName(st.Name); appCfg != nil && appCfg.Env["SUBDOMAIN"] != "" {
			sub = appCfg.Env["SUBDOMAIN"]
		}
		if sub == "" {
			continue
		}
		hostnames[st.Name] = sub + "." + a.domain
	}
	return hostnames
}
// exportAdapter implements appexport.ExportStackProvider using stacks.Manager.
type exportAdapter struct {
mgr *stacks.Manager // stack lookups, start/stop and status refresh are delegated here
encKey []byte // key handed to stacks.LoadAppConfigDecrypted and stacks.SaveAppConfig for app.yaml secrets
}
// GetStackDir returns the directory containing the named stack's compose file.
// The boolean is false (and the path empty) when the stack is unknown.
func (a *exportAdapter) GetStackDir(name string) (string, bool) {
	if stack, found := a.mgr.GetStack(name); found {
		return filepath.Dir(stack.ComposePath), true
	}
	return "", false
}
// GetStackComposePath returns the compose file path of the named stack.
// The boolean is false (and the path empty) when the stack is unknown.
func (a *exportAdapter) GetStackComposePath(name string) (string, bool) {
	if stack, found := a.mgr.GetStack(name); found {
		return stack.ComposePath, true
	}
	return "", false
}
// GetStackHDDMounts returns the HDD mounts parsed from the named stack's compose file
// using the HDD_PATH recorded in its app.yaml. It returns nil when the stack is unknown,
// has no app config, or has no HDD_PATH set (there is no storage-path fallback here).
func (a *exportAdapter) GetStackHDDMounts(name string) []string {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return nil
	}
	appCfg := stacks.LoadAppConfig(filepath.Dir(stack.ComposePath))
	if appCfg == nil {
		return nil
	}
	hddPath := appCfg.Env["HDD_PATH"]
	if hddPath == "" {
		return nil
	}
	return stacks.ParseComposeHDDMounts(stack.ComposePath, hddPath)
}
// GetStackHDDPath returns the cleaned HDD_PATH from the named stack's app.yaml.
// It returns "" when the stack is unknown, has no app config, or has no HDD_PATH set.
func (a *exportAdapter) GetStackHDDPath(name string) string {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return ""
	}
	appCfg := stacks.LoadAppConfig(filepath.Dir(stack.ComposePath))
	if appCfg == nil {
		return ""
	}
	hddPath := appCfg.Env["HDD_PATH"]
	if hddPath == "" {
		return ""
	}
	return filepath.Clean(hddPath)
}
// IsStackRunning reports whether the named stack is known to the manager and its
// state is stacks.StateRunning. No status refresh is triggered.
func (a *exportAdapter) IsStackRunning(name string) bool {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return false
	}
	return stack.State == stacks.StateRunning
}
// StopStack delegates to the stack manager's StopStack for the named stack and
// returns its error unchanged.
func (a *exportAdapter) StopStack(name string) error {
return a.mgr.StopStack(name)
}
// StartStack delegates to the stack manager's StartStack for the named stack and
// returns its error unchanged.
func (a *exportAdapter) StartStack(name string) error {
return a.mgr.StartStack(name)
}
// GetStackDisplayName returns the display name from the named stack's metadata.
// For an unknown stack it falls back to the name that was passed in.
func (a *exportAdapter) GetStackDisplayName(name string) string {
	if stack, found := a.mgr.GetStack(name); found {
		return stack.Meta.DisplayName
	}
	return name
}
// GetStackNeedsHDD reports the NeedsHDD flag from the named stack's resource metadata.
// It is false for an unknown stack.
func (a *exportAdapter) GetStackNeedsHDD(name string) bool {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return false
	}
	return stack.Meta.Resources.NeedsHDD
}
// GetDockerVolumes returns the result of backup.ResolveDockerVolumeNames for the
// named stack's compose file, or nil when the stack is unknown.
func (a *exportAdapter) GetDockerVolumes(name string) []string {
	if stack, found := a.mgr.GetStack(name); found {
		return backup.ResolveDockerVolumeNames(stack.ComposePath)
	}
	return nil
}
// IsStackDeployed reports whether the named stack is known to the manager and
// marked as deployed.
func (a *exportAdapter) IsStackDeployed(name string) bool {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return false
	}
	return stack.Deployed
}
// GetDecryptedEnv returns the Env map of the named stack's app config as loaded by
// stacks.LoadAppConfigDecrypted with the adapter's encryption key. It returns nil when
// the stack is unknown or the config cannot be loaded.
func (a *exportAdapter) GetDecryptedEnv(name string) map[string]string {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return nil
	}
	if appCfg := stacks.LoadAppConfigDecrypted(filepath.Dir(stack.ComposePath), a.encKey); appCfg != nil {
		return appCfg.Env
	}
	return nil
}
// GetStacksBaseDir forwards to the stack manager's GetStacksBaseDir and returns its result.
func (a *exportAdapter) GetStacksBaseDir() string {
return a.mgr.GetStacksBaseDir()
}
// SaveEncryptedAppConfig writes an app config for stackDir via stacks.SaveAppConfig.
// The config is marked deployed, stamped with the current time (RFC 3339) and carries
// env as its environment. The adapter's encryption key and the sensitive env var names
// derived from the stack's metadata are passed along to the save call.
func (a *exportAdapter) SaveEncryptedAppConfig(stackDir string, env map[string]string) error {
	meta := stacks.LoadMetadata(stackDir)
	sensitive := stacks.SensitiveEnvVars(&meta)
	appCfg := stacks.AppConfig{
		Deployed:   true,
		DeployedAt: time.Now().Format(time.RFC3339),
		Env:        env,
	}
	return stacks.SaveAppConfig(stackDir, &appCfg, a.encKey, sensitive)
}
// RefreshStacks forwards to the stack manager's RefreshStatus and returns its error unchanged.
func (a *exportAdapter) RefreshStacks() error {
return a.mgr.RefreshStatus()
}
// RemoveStackVolumes runs `docker compose down --volumes` in the named stack's directory.
//
// The command inherits the controller's environment, extended with the stack's decrypted
// app.yaml env (when one can be loaded), and is bounded by a two-minute timeout.
// It returns an error when the stack is unknown or the command fails; in the latter case
// the trimmed combined output is included in the message.
func (a *exportAdapter) RemoveStackVolumes(name string) error {
	stack, found := a.mgr.GetStack(name)
	if !found {
		return fmt.Errorf("stack %q not found", name)
	}
	stackDir := filepath.Dir(stack.ComposePath)

	// Process environment first, then the decrypted app config on top of it.
	env := os.Environ()
	if appCfg := stacks.LoadAppConfigDecrypted(stackDir, a.encKey); appCfg != nil {
		for key, val := range appCfg.Env {
			env = append(env, key+"="+val)
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	down := exec.CommandContext(ctx, "docker", "compose", "down", "--volumes")
	down.Dir = stackDir
	down.Env = env
	if output, err := down.CombinedOutput(); err != nil {
		return fmt.Errorf("compose down --volumes: %s — %w", strings.TrimSpace(string(output)), err)
	}
	return nil
}
// fileExists reports whether os.Stat succeeds for path (file or directory).
// Any Stat error — not only "does not exist" — yields false.
func fileExists(path string) bool {
	if _, err := os.Stat(path); err != nil {
		return false
	}
	return true
}
// quiesceBackend adapts *agentapi.Client to quiesce.Backend (bool/string, decoupled from the
// agentapi response structs). Each method calls one client endpoint and returns a single
// field of its response.
type quiesceBackend struct{ c *agentapi.Client }
// Due calls the agent's BackupDue endpoint and returns the Due field of its response.
// The response is only read after the call has succeeded, so a nil or partially filled
// response is never touched; on failure it returns false together with the error.
func (b quiesceBackend) Due(ctx context.Context) (bool, error) {
	r, err := b.c.BackupDue(ctx)
	if err != nil {
		return false, err
	}
	return r.Due, nil
}
// StartBackup calls the agent's StartBackup endpoint and returns the JobID field of its
// response. The response is only read after the call has succeeded, so a nil or partially
// filled response is never touched; on failure it returns "" together with the error.
func (b quiesceBackend) StartBackup(ctx context.Context) (string, error) {
	r, err := b.c.StartBackup(ctx)
	if err != nil {
		return "", err
	}
	return r.JobID, nil
}
// BackupStatus calls the agent's BackupStatus endpoint and returns the Phase field of its
// response. The response is only read after the call has succeeded, so a nil or partially
// filled response is never touched; on failure it returns "" together with the error.
func (b quiesceBackend) BackupStatus(ctx context.Context) (string, error) {
	r, err := b.c.BackupStatus(ctx)
	if err != nil {
		return "", err
	}
	return r.Phase, nil
}
// startQuiesceLoop wires and starts the slice-8B quiesce loop when the local API is
// configured and quiesce is enabled.
//
// It returns nil — without starting anything — when the local-API endpoint or token is
// empty, when quiesce is disabled by config, or when the agent client cannot be created
// (the latter two are logged). Otherwise it calls Recover on the new loop (restarting
// stacks left stopped by a mid-quiesce crash) before launching Run in a goroutine with
// ctx, and returns the loop. Intervals come from cfg.Quiesce with defaults of 5m (poll),
// 10s (status poll) and 30m (max quiesce). Non-fatal: any misconfig only disables the loop.
func startQuiesceLoop(ctx context.Context, cfg *config.Config, stackMgr *stacks.Manager, logger *log.Logger) *quiesce.Loop {
	switch {
	case cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "":
		return nil // not a provisioned guest — no agent to back up against
	case !cfg.Quiesce.QuiesceEnabled():
		logger.Printf("[INFO] [quiesce] disabled by config")
		return nil
	}

	agent, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint)
	if err != nil {
		logger.Printf("[WARN] [quiesce] disabled (agent client init failed): %v", err)
		return nil
	}

	opts := quiesce.Options{
		Backend:    quiesceBackend{c: agent},
		Stacks:     stackMgr,
		MarkerPath: filepath.Join(cfg.Paths.DataDir, "quiesce-state.json"),
		Poll:       parseDurationOr(cfg.Quiesce.PollInterval, 5*time.Minute),
		StatusPoll: parseDurationOr(cfg.Quiesce.StatusPoll, 10*time.Second),
		MaxQuiesce: parseDurationOr(cfg.Quiesce.MaxQuiesce, 30*time.Minute),
		Logger:     logger,
	}
	loop := quiesce.New(opts)

	// Crash-safety: restart any stacks stranded-down by a mid-quiesce crash
	// before the periodic loop begins.
	loop.Recover()
	go loop.Run(ctx)
	return loop
}
// parseDurationOr parses s with time.ParseDuration and returns the result when it is
// valid and strictly positive. An empty string, a parse error, or a zero/negative
// duration all yield def instead.
func parseDurationOr(s string, def time.Duration) time.Duration {
	if s != "" {
		if parsed, err := time.ParseDuration(s); err == nil && parsed > 0 {
			return parsed
		}
	}
	return def
}
// probeLocalAPI proves the controller↔agent local-API channel at startup and logs this
// guest's mounts (slice 8A).
//
// It does nothing unless both a local-API endpoint and token are configured. Any failure
// (client construction or the storage request, which is bounded by a 15-second timeout)
// is logged as a warning and otherwise ignored, so the controller's boot is unaffected.
// The leaf SHA-256 from the bootstrap is pinned by the client (fails closed on mismatch).
func probeLocalAPI(cfg *config.Config, logger *log.Logger) {
	if cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "" {
		return
	}

	agent, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint)
	if err != nil {
		logger.Printf("[WARN] local-api: client init failed (%v) — channel not verified", err)
		return
	}

	probeCtx, stop := context.WithTimeout(context.Background(), 15*time.Second)
	defer stop()

	storage, err := agent.Storage(probeCtx)
	if err != nil {
		logger.Printf("[WARN] local-api: GET /storage failed (%v) — channel not verified", err)
		return
	}

	logger.Printf("[INFO] local-api: channel up (agent %s) — guest %d, %d mount(s) visible",
		cfg.LocalAPI.Endpoint, storage.VMID, len(storage.Mounts))
	for _, mount := range storage.Mounts {
		logger.Printf("[INFO] local-api: mount %s → %s (storage=%s, class=%s, backup=%v)",
			mount.Key, mount.MountPoint, mount.Storage, mount.Class, mount.Backup)
	}
}
// runSetupMode starts the setup wizard on dual listeners and blocks until SIGINT or SIGTERM.
//
// Both listeners (cfg.Web.Listen and cfg.Web.SetupListen) serve the same mux: the setup
// server's handler at "/" plus an /api/health endpoint that reports setup_mode: true.
// A listener that stops with anything other than http.ErrServerClosed is logged; the
// function still waits for a signal in that case. On signal both servers are shut down
// with a shared 10-second deadline, and shutdown or health-response errors are logged
// rather than silently dropped.
func runSetupMode(cfg *config.Config, logger *log.Logger) {
	ips := setup.DetectLocalIPs()
	setup.LogSetupMode(cfg.Customer.Domain, ips, cfg.Web.SetupListen, logger)
	setupSrv := setup.NewServer(cfg, cfg.Paths.DataDir, logger, Version)
	handler := setupSrv.Handler()

	// Health endpoint wrapper (returns setup_mode: true)
	healthHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		err := json.NewEncoder(w).Encode(map[string]interface{}{
			"ok": true, "message": "felhom-controller is healthy",
			"setup_mode": true, "version": Version,
		})
		if err != nil {
			logger.Printf("[WARN] Setup health response failed: %v", err)
		}
	})

	// Mux for both listeners
	mux := http.NewServeMux()
	mux.HandleFunc("/api/health", healthHandler)
	mux.Handle("/", handler)

	// Start main listener (:8080, behind Traefik for domain access)
	mainServer := &http.Server{
		Addr:         cfg.Web.Listen,
		Handler:      mux,
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  120 * time.Second,
	}
	go func() {
		logger.Printf("[INFO] Setup wizard (main) listening on %s", cfg.Web.Listen)
		// errors.Is also matches a wrapped ErrServerClosed, which is the normal shutdown path.
		if err := mainServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			logger.Printf("[ERROR] Main HTTP server error: %v", err)
		}
	}()

	// Start setup-only listener (:8081, direct HTTP for LAN access)
	setupServer := &http.Server{
		Addr:         cfg.Web.SetupListen,
		Handler:      mux,
		ReadTimeout:  30 * time.Second,
		WriteTimeout: 60 * time.Second,
		IdleTimeout:  120 * time.Second,
	}
	go func() {
		logger.Printf("[INFO] Setup wizard (LAN) listening on %s", cfg.Web.SetupListen)
		if err := setupServer.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) {
			logger.Printf("[ERROR] Setup HTTP server error: %v", err)
		}
	}()

	// Wait for signal
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	sig := <-sigCh
	logger.Printf("[INFO] Received signal %v, shutting down setup wizard...", sig)

	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer shutdownCancel()
	// Both servers share one deadline; a failed or timed-out shutdown is reported, not hidden.
	if err := mainServer.Shutdown(shutdownCtx); err != nil {
		logger.Printf("[WARN] Main HTTP server shutdown error: %v", err)
	}
	if err := setupServer.Shutdown(shutdownCtx); err != nil {
		logger.Printf("[WARN] Setup HTTP server shutdown error: %v", err)
	}
	logger.Println("[INFO] Setup wizard stopped")
}
// discoverHDDPaths scans the sub-directories of stacksDir and collects the HDD_PATH env
// value from the app.yaml of every deployed app.
//
// Each path is cleaned with filepath.Clean and reported once, in the order the
// directories are listed. Non-directories, apps without a config, apps not marked
// deployed, and apps with an empty HDD_PATH are skipped. If stacksDir cannot be read a
// warning is logged and nil is returned.
func discoverHDDPaths(stacksDir string, logger *log.Logger) []string {
	dirEntries, err := os.ReadDir(stacksDir)
	if err != nil {
		logger.Printf("[WARN] Cannot read stacks dir for HDD path discovery: %v", err)
		return nil
	}

	var found []string
	known := make(map[string]bool)
	for _, entry := range dirEntries {
		if !entry.IsDir() {
			continue
		}
		appCfg := stacks.LoadAppConfig(filepath.Join(stacksDir, entry.Name()))
		if appCfg == nil || !appCfg.Deployed || appCfg.Env["HDD_PATH"] == "" {
			continue
		}
		cleaned := filepath.Clean(appCfg.Env["HDD_PATH"])
		if known[cleaned] {
			continue
		}
		known[cleaned] = true
		found = append(found, cleaned)
	}
	return found
}