7863e62f29
Restore recreates an app from its on-drive unit + the guest's own secrets, regenerating nothing. reconcileRestoreSecrets (pure, unit-tested) merges the unit's non-secret env with secrets recovered from the live app.yaml and FAILS CLOSED if a data-encrypting key is unrecoverable (refuse — a PBS whole-guest restore is needed — rather than regenerate and corrupt). Resettable secrets missing → warn + proceed. - backup: RestoreFromRecoveryUnit (manifest -> recover secrets -> gate -> restore volumes -> recreate definition + redeploy w/ re-pull); falls back to volume-only. - seams: RecoverStackSecrets/RecreateStackFromUnit (adapter +encKey), stacks.RedeployFromEnv. Wired into /backup/restore. - tests: gate (refuse/proceed/verbatim) + data_key parsing. Gate + reconcile + data_key parsing unit-tested; capture live-validated (v0.53.1). Full readable-data e2e vs AdventureLog needs the auth-gated dashboard restore — pending. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1339 lines
47 KiB
Go
1339 lines
47 KiB
Go
package main
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"errors"
|
||
"flag"
|
||
"fmt"
|
||
"io"
|
||
"log"
|
||
"net/http"
|
||
"os"
|
||
"os/exec"
|
||
"os/signal"
|
||
"path/filepath"
|
||
"syscall"
|
||
"time"
|
||
|
||
"crypto/subtle"
|
||
"strings"
|
||
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/agentapi"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/api"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/appexport"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/assets"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/bootstrap"
|
||
cf "gitea.dooplex.hu/admin/felhom-controller/internal/cloudflare"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/crypto"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/integrations"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/quiesce"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/recovery"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/selftest"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/selfupdate"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/setup"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
||
"gitea.dooplex.hu/admin/felhom-controller/internal/web"
|
||
)
|
||
|
||
// Build metadata. Each value below is only a placeholder default; the real
// value is set at build time via ldflags.

// Version is the controller version string.
var Version = "dev"

// BuildTime is the build timestamp string.
var BuildTime = "unknown"

// GitCommit is the source commit string.
var GitCommit = "unknown"
|
||
|
||
func main() {
|
||
configPath := flag.String("config", "/opt/docker/felhom-controller/controller.yaml", "Path to configuration file")
|
||
showVersion := flag.Bool("version", false, "Show version and exit")
|
||
flag.Parse()
|
||
|
||
if *showVersion {
|
||
fmt.Printf("felhom-controller %s (built %s, commit %s)\n", Version, BuildTime, GitCommit)
|
||
os.Exit(0)
|
||
}
|
||
|
||
startTime := time.Now()
|
||
|
||
// --- Load configuration ---
|
||
// Use LoadPermissive to tolerate minimal configs (e.g. only domain set by docker-setup.sh).
|
||
// If even that fails (file missing/unreadable), fall back to defaults.
|
||
cfg, err := config.LoadPermissive(*configPath)
|
||
if err != nil {
|
||
cfg = config.Default()
|
||
log.Printf("[WARN] Config load failed (%s), using defaults: %v", *configPath, err)
|
||
}
|
||
|
||
logger, logBuffer := setupLogger(cfg)
|
||
|
||
// --- Bootstrap ingestion (slice 8A → v0.40.0 onboarding, doc 03 §6) ---
|
||
// On first run, if this controller is not yet configured AND the host agent's provisioning
|
||
// back-half attached a bootstrap.json config mount, PULL the full controller.yaml from the hub
|
||
// (using the bootstrap's retrieval passphrase), merge in the per-guest local_api block, and come
|
||
// up CONFIGURED — skipping setup mode. Idempotent (never clobbers an existing controller.yaml)
|
||
// and fail-safe (a malformed/absent bootstrap, or a hub outage at first boot, leaves us in setup
|
||
// mode). The adapter marks a transient hub-unreachable error as retryable (the rest are permanent).
|
||
pull := func(hubURL, customerID, retrievalPassword string) (string, error) {
|
||
y, perr := report.PullConfig(hubURL, customerID, retrievalPassword)
|
||
if perr != nil && errors.Is(perr, report.ErrHubUnreachable) {
|
||
return "", fmt.Errorf("%w: %w", bootstrap.ErrPullTransient, perr)
|
||
}
|
||
return y, perr
|
||
}
|
||
cfg = bootstrap.MaybeIngest(*configPath, cfg, logger, pull)
|
||
|
||
// --- Wire system package debug logging ---
|
||
if cfg.Logging.Level == "debug" {
|
||
system.DebugLogger = logger
|
||
}
|
||
|
||
// --- Setup mode: if no customer ID configured, run setup wizard ---
|
||
if setup.NeedsSetup(cfg) {
|
||
logger.Printf("[INFO] felhom-controller %s — setup mode", Version)
|
||
runSetupMode(cfg, logger)
|
||
return
|
||
}
|
||
|
||
// --- Local API connectivity probe (slice 8A) ---
|
||
// When seeded with a local-API endpoint, prove the controller↔agent channel at startup and
|
||
// learn this guest's mounts (placement view). Non-fatal — the controller runs regardless; a
|
||
// failure is logged for diagnosis. The full /backup/due quiesce loop lands in 8B.
|
||
probeLocalAPI(cfg, logger)
|
||
|
||
logger.Printf("[INFO] felhom-controller %s starting (customer: %s, domain: %s)",
|
||
Version, cfg.Customer.ID, cfg.Customer.Domain)
|
||
|
||
// --- Load settings ---
|
||
settingsPath := cfg.Paths.DataDir + "/settings.json"
|
||
sett, err := settings.Load(settingsPath, logger)
|
||
if err != nil {
|
||
logger.Fatalf("[FATAL] Failed to load settings from %s: %v", settingsPath, err)
|
||
}
|
||
sett.SetDebug(cfg.Logging.Level == "debug")
|
||
|
||
// --- Auto-discover storage paths from deployed apps ---
|
||
discoveredPaths := discoverHDDPaths(cfg.Paths.StacksDir, logger)
|
||
sett.AutoDiscoverStoragePaths(discoveredPaths, cfg.Paths.HDDPath, logger)
|
||
|
||
// --- Load or create encryption key ---
|
||
encKeyPath := filepath.Join(cfg.Paths.DataDir, "encryption.key")
|
||
encKey, err := crypto.LoadOrCreateKey(encKeyPath)
|
||
if err != nil {
|
||
logger.Fatalf("[FATAL] Failed to load encryption key: %v", err)
|
||
}
|
||
logger.Printf("[INFO] Encryption key loaded from %s", encKeyPath)
|
||
|
||
// --- Initialize stack manager ---
|
||
stackMgr, err := stacks.NewManager(cfg, logger)
|
||
if err != nil {
|
||
logger.Fatalf("[FATAL] Failed to initialize stack manager: %v", err)
|
||
}
|
||
stackMgr.SetEncryptionKey(encKey)
|
||
|
||
// Initial stack scan
|
||
if err := stackMgr.ScanStacks(); err != nil {
|
||
logger.Printf("[WARN] Initial stack scan failed: %v", err)
|
||
}
|
||
|
||
// Inject missing deploy fields for all deployed stacks on startup
|
||
if names := stackMgr.DeployedStackNames(); len(names) > 0 {
|
||
stackMgr.InjectMissingFields(names)
|
||
}
|
||
|
||
// Migrate existing plaintext passwords to encrypted
|
||
stackMgr.MigrateEncryption()
|
||
|
||
// --- First-boot base-infrastructure bring-up ---
|
||
// We are guaranteed configured here (setup.NeedsSetup returned false above), so deploy the base
|
||
// stack (traefik-public network → traefik → cloudflared → filebrowser) the controller needs for
|
||
// routing + external access. Runs in a goroutine so a slow first-boot image pull never delays the
|
||
// web server; non-fatal (idempotent + single-flight, the health loop re-attempts each tick).
|
||
go func() {
|
||
if err := stackMgr.EnsureBaseStack(); err != nil {
|
||
logger.Printf("[WARN] [infra] first-boot base-stack bring-up: %v", err)
|
||
}
|
||
}()
|
||
|
||
// --- Initialize catalog syncer ---
|
||
syncer := catalogsync.New(cfg, logger, stackMgr.ScanStacks, func(updated []string) {
|
||
stackMgr.InjectMissingFields(updated)
|
||
})
|
||
syncer.Start()
|
||
defer syncer.Stop()
|
||
|
||
// --- Graceful shutdown context ---
|
||
ctx, cancel := context.WithCancel(context.Background())
|
||
defer cancel()
|
||
|
||
// --- Quiesce loop (slice 8B): app-consistent backup around the agent vzdump ---
|
||
// Runs only when the local API is configured (a provisioned guest) and quiesce is enabled.
|
||
// Recover FIRST (restart any stacks left stopped by a crash mid-quiesce), then start the loop.
|
||
quiesceLoop := startQuiesceLoop(ctx, cfg, stackMgr, logger)
|
||
|
||
// --- Start CPU collector ---
|
||
cpuCollector := system.NewCPUCollector(5 * time.Second)
|
||
cpuCollector.Start(ctx)
|
||
defer cpuCollector.Stop()
|
||
|
||
// --- Initialize metrics store + collector ---
|
||
metricsDBPath := "/opt/docker/felhom-controller/data/metrics.db"
|
||
metricsStore, err := metrics.NewMetricsStore(metricsDBPath, logger)
|
||
if err != nil {
|
||
logger.Printf("[WARN] Failed to initialize metrics store: %v — monitoring disabled", err)
|
||
} else {
|
||
logger.Printf("[INFO] Metrics store opened at %s", metricsDBPath)
|
||
}
|
||
|
||
if metricsStore != nil {
|
||
defer metricsStore.Close()
|
||
metricsHDDPath := cfg.Paths.HDDPath
|
||
if p := sett.GetDefaultStoragePath(); p != "" {
|
||
metricsHDDPath = p
|
||
}
|
||
metricsCollector := metrics.NewMetricsCollector(metricsStore, cpuCollector, metricsHDDPath, logger)
|
||
metricsCollector.Start(ctx)
|
||
defer metricsCollector.Stop()
|
||
logger.Println("[INFO] Metrics collector started (60s interval)")
|
||
}
|
||
|
||
// Deprecation notice for ping UUIDs (Healthchecks pinging retired — the Hub
|
||
// now owns monitoring; disk-tier backup moved to the host agent in slice 8C).
|
||
uuids := cfg.Monitoring.PingUUIDs
|
||
if uuids.Heartbeat != "" || uuids.SystemHealth != "" || uuids.DBDump != "" || uuids.Backup != "" || uuids.BackupIntegrity != "" {
|
||
logger.Println("[INFO] Healthchecks ping UUIDs configured but no longer used — monitoring is now handled by the Hub")
|
||
}
|
||
|
||
// --- Initialize backup manager (app-data only: DB dumps + Docker-volume tars) ---
|
||
var backupMgr *backup.Manager
|
||
stackProv := &stackAdapter{
|
||
mgr: stackMgr,
|
||
getStoragePaths: func() []settings.StoragePath { return sett.GetStoragePaths() },
|
||
encKey: encKey,
|
||
}
|
||
if cfg.Backup.Enabled {
|
||
backupMgr = backup.NewManager(cfg, sett, logger)
|
||
backupMgr.SetStackProvider(stackProv)
|
||
backupMgr.SetVersion(Version)
|
||
}
|
||
|
||
// --- Initialize alert manager ---
|
||
alertMgr := web.NewAlertManager(logger)
|
||
|
||
// --- Initialize notifier ---
|
||
notifier := notify.New(cfg.Hub.URL, cfg.Hub.APIKey, cfg.Customer.ID, sett, logger, cfg.Logging.Level == "debug")
|
||
|
||
// --- Initialize self-updater ---
|
||
var updater *selfupdate.Updater
|
||
if cfg.SelfUpdate.Enabled {
|
||
composePath := filepath.Join(filepath.Dir(cfg.Paths.DataDir), "docker-compose.yml")
|
||
updater = selfupdate.NewUpdater(&cfg.SelfUpdate, &cfg.Git, Version, cfg.Paths.DataDir, composePath, logger, cfg.Logging.Level == "debug")
|
||
updater.SetBackupRunningCheck(func() bool {
|
||
return backupMgr != nil && backupMgr.IsRunning()
|
||
})
|
||
// Check for post-update state (did a previous update succeed or fail?)
|
||
if state := updater.VerifyStartup(); state != nil {
|
||
notifier.NotifyControllerUpdated(state.PreviousVersion, state.TargetVersion, state.Status == "success")
|
||
}
|
||
logger.Printf("[INFO] Self-update enabled (check every %s, auto-update: %v, auto-update time: %s)",
|
||
cfg.SelfUpdate.CheckInterval, cfg.SelfUpdate.AutoUpdate, cfg.SelfUpdate.AutoUpdateTime)
|
||
}
|
||
|
||
// --- Initialize scheduler ---
|
||
sched := scheduler.New(logger)
|
||
sched.SetDebug(cfg.Logging.Level == "debug")
|
||
|
||
// Existing periodic tasks (migrated from ad-hoc goroutines)
|
||
sched.Every("status-refresh", 30*time.Second, func(ctx context.Context) error {
|
||
return stackMgr.RefreshStatus()
|
||
})
|
||
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
|
||
return stackMgr.ScanStacks()
|
||
})
|
||
sched.Every("health-probes", 10*time.Second, func(ctx context.Context) error {
|
||
return stackMgr.RunHealthProbes()
|
||
})
|
||
|
||
// System health check — refreshes dashboard alerts and notifies on changes.
|
||
// Healthchecks.io pinging has been retired (the Hub now owns monitoring).
|
||
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
|
||
if err != nil {
|
||
healthInterval = 5 * time.Minute
|
||
}
|
||
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||
// Self-heal the base stack: call unconditionally every tick. EnsureBaseStack is single-flight
|
||
// + idempotent (skips running stacks ⇒ a cheap 3× docker-inspect no-op when healthy), so there
|
||
// is no need to couple to the health-report issue strings. Runs in a goroutine — never blocks
|
||
// or fails the health job.
|
||
go func() {
|
||
if err := stackMgr.EnsureBaseStack(); err != nil {
|
||
logger.Printf("[WARN] [infra] self-heal base-stack bring-up: %v", err)
|
||
}
|
||
}()
|
||
// Refresh dashboard alerts from health report
|
||
updateAvailable := false
|
||
latestVersion := ""
|
||
if updater != nil {
|
||
status := updater.GetStatus()
|
||
if status.LastCheck != nil {
|
||
updateAvailable = status.LastCheck.UpdateAvailable
|
||
latestVersion = status.LastCheck.LatestVersion
|
||
}
|
||
}
|
||
alertMgr.Refresh(healthReport, cfg, backupMgr, updateAvailable, latestVersion, sett.GetStoragePaths())
|
||
// Notify on health status changes
|
||
notifier.NotifyHealthChange(healthReport.Status, healthReport.Issues, healthReport.Warnings)
|
||
return nil
|
||
})
|
||
|
||
// --- Central hub pusher (declared early so backup closure can reference it) ---
|
||
var hubPusher *report.Pusher
|
||
if cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
|
||
hubPusher = report.NewPusher(&cfg.Hub, logger, cfg.Logging.Level == "debug")
|
||
// Wire hub verification: update settings when hub reports customer status
|
||
hubPusher.OnPushResponse = func(resp *report.PushResponse) {
|
||
if resp.CustomerBlocked {
|
||
sett.SetHubVerified(false, time.Now())
|
||
logger.Printf("[WARN] Customer blocked on Hub — new deployments may be restricted")
|
||
} else {
|
||
sett.SetHubVerified(true, time.Now())
|
||
}
|
||
}
|
||
// Wire hub push status into alert manager for dashboard alerts
|
||
alertMgr.SetHubPushStatus(func() web.HubPushStatusData {
|
||
s := hubPusher.GetStatus()
|
||
return web.HubPushStatusData{
|
||
LastAttempt: s.LastAttempt,
|
||
LastSuccess: s.LastSuccess,
|
||
LastError: s.LastError,
|
||
Consecutive: s.Consecutive,
|
||
}
|
||
})
|
||
}
|
||
|
||
// Backup daily jobs
|
||
if cfg.Backup.Enabled && backupMgr != nil {
|
||
// App-data backup: daily database dumps. Disk-tier (restic snapshots,
|
||
// cross-drive, integrity check, infra backup) has moved to the host agent.
|
||
sched.Daily("db-dump", cfg.Backup.DBDumpSchedule, func(ctx context.Context) error {
|
||
err := backupMgr.RunDBDumps(ctx)
|
||
if err != nil {
|
||
notifier.NotifyDBDumpFailed("Adatbázis mentés sikertelen", err.Error())
|
||
} else {
|
||
notifier.NotifyDBDumpCompleted(notify.DBDumpDetails{})
|
||
}
|
||
return err
|
||
})
|
||
|
||
// Cache refresh: every 5 minutes
|
||
sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
|
||
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
||
backupMgr.RefreshCache(nextDBDump)
|
||
return nil
|
||
})
|
||
}
|
||
|
||
// Metrics prune — daily at 04:00
|
||
if metricsStore != nil {
|
||
sched.Daily("metrics-prune", "04:00", func(ctx context.Context) error {
|
||
deleted, err := metricsStore.Prune(30 * 24 * time.Hour)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
logger.Printf("[INFO] Pruned %d old metric rows", deleted)
|
||
return nil
|
||
})
|
||
}
|
||
|
||
// --- Central hub reporting schedule ---
|
||
if hubPusher != nil {
|
||
if cfg.Hub.Enabled {
|
||
pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval)
|
||
if err != nil {
|
||
pushInterval = 15 * time.Minute
|
||
}
|
||
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
|
||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), sett.GetGeoRestriction(), logger)
|
||
if err := hubPusher.Push(r); err != nil {
|
||
return err
|
||
}
|
||
// Drain pending events (e.g., DR recovery completed) after successful push
|
||
if events := sett.DrainPendingEvents(); len(events) > 0 {
|
||
for _, ev := range events {
|
||
notifier.Notify(ev.EventType, ev.Severity, ev.Message, ev.Details)
|
||
}
|
||
logger.Printf("[INFO] Drained %d pending events to Hub", len(events))
|
||
}
|
||
return nil
|
||
})
|
||
logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL)
|
||
} else {
|
||
logger.Printf("[INFO] Hub reporting disabled — will send disabled notification to %s", cfg.Hub.URL)
|
||
}
|
||
}
|
||
|
||
// Self-update scheduler jobs
|
||
if cfg.SelfUpdate.Enabled && updater != nil {
|
||
// Periodic version check (populates UI, never triggers update)
|
||
checkInterval, ciErr := time.ParseDuration(cfg.SelfUpdate.CheckInterval)
|
||
if ciErr != nil {
|
||
checkInterval = 6 * time.Hour
|
||
}
|
||
sched.Every("selfupdate-check", checkInterval, func(ctx context.Context) error {
|
||
result := updater.CheckForUpdate()
|
||
if result.UpdateAvailable {
|
||
logger.Printf("[INFO] Update available: %s -> %s", result.CurrentVersion, result.LatestVersion)
|
||
}
|
||
return nil
|
||
})
|
||
|
||
// Auto-update (daily, fires after typical backup completion)
|
||
if cfg.SelfUpdate.AutoUpdate {
|
||
sched.Daily("selfupdate-auto", cfg.SelfUpdate.AutoUpdateTime, func(ctx context.Context) error {
|
||
result := updater.CheckForUpdate()
|
||
if !result.UpdateAvailable {
|
||
return nil
|
||
}
|
||
if err := updater.TriggerUpdate("auto"); err != nil {
|
||
logger.Printf("[WARN] Auto-update skipped: %v", err)
|
||
}
|
||
return nil
|
||
})
|
||
}
|
||
}
|
||
|
||
// Storage watchdog (disk disconnect/reconnect detection) has moved to the host
|
||
// agent (slice 8C) — the controller no longer owns disk-level monitoring.
|
||
|
||
// --- Asset syncer (download from Hub) ---
|
||
var assetsSyncer *assets.Syncer
|
||
if cfg.Hub.Enabled && cfg.Assets.SyncEnabled && cfg.Hub.URL != "" && cfg.Hub.APIKey != "" {
|
||
assetsDir := filepath.Join(cfg.Paths.DataDir, "assets")
|
||
assetsSyncer = assets.New(cfg.Hub.URL, cfg.Hub.APIKey, assetsDir, "/usr/share/felhom/assets", logger, cfg.Logging.Level == "debug")
|
||
go func() {
|
||
time.Sleep(10 * time.Second)
|
||
if err := assetsSyncer.Sync(ctx); err != nil {
|
||
logger.Printf("[WARN] Initial asset sync failed: %v", err)
|
||
}
|
||
}()
|
||
sched.Daily("asset-sync", cfg.Assets.SyncSchedule, func(ctx context.Context) error {
|
||
return assetsSyncer.Sync(ctx)
|
||
})
|
||
logger.Printf("[INFO] Asset sync enabled (daily at %s from Hub)", cfg.Assets.SyncSchedule)
|
||
}
|
||
|
||
// --- Startup self-test ---
|
||
selfTestResult := selftest.Run(cfg, sett, logger)
|
||
|
||
sched.Start(ctx)
|
||
defer sched.Stop()
|
||
|
||
// Generate recovery info file if retrieval password is set
|
||
if rp := sett.GetRetrievalPassword(); rp != "" {
|
||
go func() {
|
||
info := recovery.Info{
|
||
CustomerID: cfg.Customer.ID,
|
||
RetrievalPassword: rp,
|
||
HubURL: cfg.Hub.URL,
|
||
SupportEmail: "support@felhom.eu",
|
||
SupportURL: "https://felhom.eu/kapcsolat",
|
||
}
|
||
if err := recovery.GenerateRecoveryFile(info, Version, cfg.Paths.DataDir); err != nil {
|
||
logger.Printf("[WARN] Failed to generate recovery-info.txt: %v", err)
|
||
}
|
||
}()
|
||
}
|
||
|
||
// Fire startup pings + hub report immediately (don't wait for first scheduler tick)
|
||
go func() {
|
||
time.Sleep(5 * time.Second) // Let all subsystems fully initialize
|
||
|
||
// Push controller startup event to Hub
|
||
notifier.NotifyControllerStarted(Version, map[string]interface{}{
|
||
"selftest_pass": selfTestResult.Pass,
|
||
"selftest_warn": selfTestResult.Warn,
|
||
"selftest_fail": selfTestResult.Fail,
|
||
})
|
||
|
||
// Hub report
|
||
if hubPusher != nil {
|
||
if cfg.Hub.Enabled {
|
||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), sett.GetGeoRestriction(), logger)
|
||
var pushErr error
|
||
for attempt := 1; attempt <= 3; attempt++ {
|
||
pushErr = hubPusher.Push(r)
|
||
if pushErr == nil {
|
||
logger.Println("[INFO] Startup hub report sent")
|
||
break
|
||
}
|
||
logger.Printf("[WARN] Startup hub report attempt %d/3 failed: %v", attempt, pushErr)
|
||
if attempt < 3 {
|
||
time.Sleep(15 * time.Second)
|
||
}
|
||
}
|
||
if pushErr != nil {
|
||
logger.Printf("[WARN] Startup hub report failed after 3 attempts — next scheduled push in %s", cfg.Hub.PushInterval)
|
||
}
|
||
} else {
|
||
// Send a minimal "disabled" notification so hub knows reporting is intentionally off
|
||
r := &report.Report{
|
||
Version: 1,
|
||
CustomerID: cfg.Customer.ID,
|
||
CustomerName: cfg.Customer.Name,
|
||
ControllerVersion: Version,
|
||
Timestamp: time.Now().UTC(),
|
||
ReportingDisabled: true,
|
||
Health: report.HealthReport{Status: "disabled", Issues: []string{}, Warnings: []string{}},
|
||
Stacks: report.StacksReport{Deployed: []string{}, Available: []string{}},
|
||
Containers: report.ContainerReport{List: []report.ContainerDetailReport{}},
|
||
}
|
||
hubPusher.PushOnce(r)
|
||
}
|
||
}
|
||
|
||
// Initial self-update check (so settings page shows version info quickly)
|
||
if updater != nil {
|
||
time.Sleep(25 * time.Second) // Additional delay after hub report
|
||
result := updater.CheckForUpdate()
|
||
if result.UpdateAvailable {
|
||
logger.Printf("[INFO] Startup: update available %s -> %s", result.CurrentVersion, result.LatestVersion)
|
||
} else if result.Error != "" {
|
||
logger.Printf("[DEBUG] Startup version check: %s", result.Error)
|
||
}
|
||
}
|
||
}()
|
||
|
||
// Initial backup cache population (don't block startup)
|
||
if cfg.Backup.Enabled && backupMgr != nil {
|
||
go func() {
|
||
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
||
backupMgr.RefreshCache(nextDBDump)
|
||
}()
|
||
}
|
||
|
||
// Sync notification preferences to hub on startup (handles hub DB rebuild recovery)
|
||
if notifier.IsEnabled() {
|
||
go func() {
|
||
prefs := sett.GetNotificationPrefs()
|
||
if prefs.Email != "" {
|
||
if err := notifier.SyncPreferences(prefs.Email, prefs.EnabledEvents, prefs.CooldownHours); err != nil {
|
||
logger.Printf("[WARN] Failed to sync notification preferences on startup: %v", err)
|
||
}
|
||
}
|
||
}()
|
||
}
|
||
|
||
// Initial alert refresh (so alerts appear immediately, not after first 5min health check)
|
||
go func() {
|
||
report := monitor.RunHealthCheck(cfg, cpuCollector, sett.GetStoragePaths(), logger)
|
||
alertMgr.Refresh(report, cfg, backupMgr, false, "")
|
||
}()
|
||
|
||
// --- Initialize API router ---
|
||
apiRouter := api.NewRouter(cfg, *configPath, sett, stackMgr, syncer, cpuCollector, backupMgr, metricsStore, updater, notifier, logger)
|
||
if hubPusher != nil {
|
||
apiRouter.OnConfigApplied = func() {
|
||
// Infra backup push is now the host agent's responsibility; the controller
|
||
// only refreshes the Hub report after a config apply.
|
||
}
|
||
}
|
||
if assetsSyncer != nil {
|
||
apiRouter.SetAssetsSyncer(assetsSyncer)
|
||
}
|
||
|
||
// --- Initialize Cloudflare geo-restriction ---
|
||
var geoSync *cf.GeoSyncManager
|
||
if cfg.Infrastructure.CFAPIToken != "" {
|
||
cfClient := cf.New(cfg.Infrastructure.CFAPIToken, logger, cfg.Logging.Level == "debug")
|
||
geoStacks := &geoStackAdapter{mgr: stackMgr, domain: cfg.Customer.Domain}
|
||
geoSync = cf.NewGeoSyncManager(cfClient, sett, cfg.Customer.Domain, geoStacks, logger)
|
||
geoSync.SetDebug(cfg.Logging.Level == "debug")
|
||
apiRouter.SetGeoSync(geoSync)
|
||
|
||
// Re-sync geo rules when apps are deployed/removed
|
||
apiRouter.OnGeoRelevantChange = func() {
|
||
geo := sett.GetGeoRestriction()
|
||
if geo != nil && geo.Enabled {
|
||
if err := geoSync.Sync(context.Background()); err != nil {
|
||
logger.Printf("[WARN] Geo sync after app change failed: %v", err)
|
||
}
|
||
}
|
||
}
|
||
|
||
// Periodic verification every 6 hours
|
||
sched.Every("geo-verify", 6*time.Hour, func(ctx context.Context) error {
|
||
geo := sett.GetGeoRestriction()
|
||
if geo == nil || !geo.Enabled {
|
||
return nil
|
||
}
|
||
return geoSync.Sync(ctx)
|
||
})
|
||
|
||
// Initial sync (delayed, non-blocking)
|
||
go func() {
|
||
time.Sleep(15 * time.Second)
|
||
if geo := sett.GetGeoRestriction(); geo != nil && geo.Enabled {
|
||
if err := geoSync.Sync(context.Background()); err != nil {
|
||
logger.Printf("[WARN] Initial geo sync failed: %v", err)
|
||
}
|
||
}
|
||
}()
|
||
|
||
logger.Printf("[INFO] Geo-restriction support enabled (CF API token configured)")
|
||
}
|
||
|
||
// --- Initialize integration manager ---
|
||
integrationStacks := &integrationStackAdapter{mgr: stackMgr}
|
||
integrationMgr := integrations.NewManager(sett, integrationStacks, cfg.Customer.Domain, cfg.Paths.StacksDir, encKey, logger)
|
||
integrationMgr.SetDebug(cfg.Logging.Level == "debug")
|
||
apiRouter.SetIntegrationManager(integrationMgr)
|
||
|
||
// --- Initialize app exporter ---
|
||
exportProv := &exportAdapter{mgr: stackMgr, encKey: encKey}
|
||
appExporter := appexport.NewExporter(exportProv, logger, Version)
|
||
appExporter.SetDebug(cfg.Logging.Level == "debug")
|
||
apiRouter.SetDebug(cfg.Logging.Level == "debug")
|
||
|
||
// --- Initialize web server ---
|
||
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, sched, sett, alertMgr, notifier, updater, logger, Version)
|
||
webServer.SetEncryptionKey(encKey)
|
||
webServer.SetAppExporter(appExporter)
|
||
webServer.SetIntegrationManager(integrationMgr)
|
||
if quiesceLoop != nil {
|
||
webServer.SetBackupTrigger(quiesceLoop) // "Mentés most" → app-consistent backup via the quiesce loop
|
||
}
|
||
if assetsSyncer != nil {
|
||
webServer.SetAssetsSyncer(assetsSyncer)
|
||
}
|
||
if hubPusher != nil {
|
||
webServer.SetHubPushStatus(func() web.HubPushStatusData {
|
||
s := hubPusher.GetStatus()
|
||
return web.HubPushStatusData{
|
||
LastAttempt: s.LastAttempt,
|
||
LastSuccess: s.LastSuccess,
|
||
LastError: s.LastError,
|
||
Consecutive: s.Consecutive,
|
||
}
|
||
})
|
||
}
|
||
if logBuffer != nil {
|
||
webServer.SetLogBuffer(logBuffer)
|
||
}
|
||
webServer.SetStartTime(startTime)
|
||
|
||
// Wire debug callbacks (only in debug mode)
|
||
if cfg.Logging.Level == "debug" {
|
||
dc := &web.DebugCallbacks{}
|
||
if hubPusher != nil {
|
||
dc.TriggerHubReportPush = func() error {
|
||
r := report.BuildReport(cfg, *configPath, stackMgr, backupMgr, cpuCollector, metricsStore, Version, sett.GetStoragePaths(), sett.GetGeoRestriction(), logger)
|
||
return hubPusher.Push(r)
|
||
}
|
||
}
|
||
dc.HubConnectivityTest = func() (int, int64, error) {
|
||
start := time.Now()
|
||
client := &http.Client{Timeout: 10 * time.Second}
|
||
resp, err := client.Get(cfg.Hub.URL + "/healthz")
|
||
latency := time.Since(start).Milliseconds()
|
||
if err != nil {
|
||
return 0, latency, err
|
||
}
|
||
resp.Body.Close()
|
||
return resp.StatusCode, latency, nil
|
||
}
|
||
if cfg.Git.RepoURL != "" {
|
||
dc.GiteaConnectivityTest = func() (int, int64, error) {
|
||
start := time.Now()
|
||
client := &http.Client{Timeout: 5 * time.Second}
|
||
resp, err := client.Head(cfg.Git.RepoURL)
|
||
latency := time.Since(start).Milliseconds()
|
||
if err != nil {
|
||
return 0, latency, err
|
||
}
|
||
resp.Body.Close()
|
||
return resp.StatusCode, latency, nil
|
||
}
|
||
}
|
||
dc.GetTelemetryPreview = func() ([]report.AppTelemetry, error) {
|
||
return report.BuildAppTelemetryForDebug(stackMgr, metricsStore, logger), nil
|
||
}
|
||
webServer.SetDebugCallbacks(dc)
|
||
}
|
||
|
||
// Drive migration (full-drive move) has moved to the host agent (slice 8C);
|
||
// the controller no longer runs a DriveMigrator.
|
||
|
||
// --- Build HTTP mux ---
|
||
mux := http.NewServeMux()
|
||
|
||
// API routes (no auth for health endpoint, auth for everything else)
|
||
mux.HandleFunc("/api/health", apiRouter.HealthHandler)
|
||
// Disk management API — thin proxy to the host agent (slice 8C). The agent owns
|
||
// disk execution; the controller forwards list/assign/eject/format.
|
||
mux.Handle("/api/disks", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeDiskAPI))))
|
||
mux.Handle("/api/disks/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeDiskAPI))))
|
||
// Guided storage provisioning (init/attach/eject orchestration over the agent disk API + registry).
|
||
mux.Handle("/api/storage/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeStorageAPI))))
|
||
// Whole-guest (appliance) backup visibility + manual trigger. Distinct prefix from apiRouter's
|
||
// app-data /api/backup/{run,status} (DB dumps) to avoid shadowing the /api/ catch-all subtree.
|
||
mux.Handle("/api/guest-backup/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeBackupAPI))))
|
||
// Host metrics API — thin proxy to the host agent (slice 9). Read-only host-wide health +
|
||
// per-storage capacity for the monitoring view; the de-privileged controller can't read the
|
||
// host itself. GET only, so no CSRF wrapper needed.
|
||
mux.Handle("/api/host-metrics", webServer.RequireAuth(http.HandlerFunc(webServer.ServeHostMetricsAPI)))
|
||
// App export/import API routes handled by web server
|
||
mux.Handle("/api/export/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeExportAPI))))
|
||
// Debug API routes handled by web server (debug-mode gating inside handler)
|
||
mux.Handle("/api/debug/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeDebugAPI))))
|
||
// Self-update API — accepts session auth OR hub API key (for external triggering)
|
||
// CsrfProtect exempts Bearer-token requests automatically.
|
||
mux.Handle("/api/selfupdate/", selfUpdateAuthMiddleware(cfg, webServer, webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
|
||
// Config API — accepts session auth OR hub API key (for Hub config push)
|
||
mux.Handle("/api/config/", selfUpdateAuthMiddleware(cfg, webServer, webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
|
||
// Geo API — accepts session auth OR hub API key (for Hub geo-disable)
|
||
mux.Handle("/api/geo/", selfUpdateAuthMiddleware(cfg, webServer, webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
|
||
mux.Handle("/api/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(apiRouter.ServeHTTP))))
|
||
|
||
// Web UI routes (auth required)
|
||
mux.Handle("/", webServer.RequireAuth(webServer.CsrfProtect(http.HandlerFunc(webServer.ServeHTTP))))
|
||
|
||
// --- Start HTTP server ---
|
||
server := &http.Server{
|
||
Addr: cfg.Web.Listen,
|
||
Handler: webServer.CatchAllMiddleware(mux),
|
||
ReadTimeout: 30 * time.Second,
|
||
WriteTimeout: 60 * time.Second,
|
||
IdleTimeout: 120 * time.Second,
|
||
}
|
||
|
||
sigCh := make(chan os.Signal, 1)
|
||
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||
|
||
go func() {
|
||
sig := <-sigCh
|
||
logger.Printf("[INFO] Received signal %v, shutting down...", sig)
|
||
cancel()
|
||
|
||
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||
defer shutdownCancel()
|
||
|
||
if err := server.Shutdown(shutdownCtx); err != nil {
|
||
logger.Printf("[ERROR] HTTP server shutdown error: %v", err)
|
||
}
|
||
}()
|
||
|
||
logger.Printf("[INFO] Web UI listening on %s", cfg.Web.Listen)
|
||
if err := server.ListenAndServe(); err != http.ErrServerClosed {
|
||
logger.Fatalf("[FATAL] HTTP server error: %v", err)
|
||
}
|
||
|
||
logger.Println("[INFO] felhom-controller stopped")
|
||
}
|
||
|
||
// selfUpdateAuthMiddleware allows access via session auth (normal UI) OR hub API key bearer token (external).
|
||
func selfUpdateAuthMiddleware(cfg *config.Config, webServer *web.Server, next http.Handler) http.Handler {
|
||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
// Check bearer token first (for external API calls: hub, build scripts)
|
||
if auth := r.Header.Get("Authorization"); strings.HasPrefix(auth, "Bearer ") {
|
||
token := strings.TrimPrefix(auth, "Bearer ")
|
||
if token != "" && cfg.Hub.APIKey != "" && subtle.ConstantTimeCompare([]byte(token), []byte(cfg.Hub.APIKey)) == 1 {
|
||
next.ServeHTTP(w, r)
|
||
return
|
||
}
|
||
}
|
||
// Fall back to session auth
|
||
webServer.RequireAuth(next).ServeHTTP(w, r)
|
||
})
|
||
}
|
||
|
||
func setupLogger(cfg *config.Config) (*log.Logger, *web.LogBuffer) {
|
||
if cfg.Logging.Level == "debug" {
|
||
logBuffer := web.NewLogBuffer(1000)
|
||
logger := log.New(io.MultiWriter(os.Stdout, logBuffer), "", log.LstdFlags|log.Lshortfile)
|
||
return logger, logBuffer
|
||
}
|
||
return log.New(os.Stdout, "", log.LstdFlags), nil
|
||
}
|
||
|
||
// stackAdapter implements backup.StackDataProvider using stacks.Manager.
|
||
type stackAdapter struct {
|
||
mgr *stacks.Manager
|
||
getStoragePaths func() []settings.StoragePath
|
||
encKey []byte // for decrypting live app.yaml secrets during restore-from-unit
|
||
}
|
||
|
||
func (a *stackAdapter) GetStackComposePath(name string) (string, bool) {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return "", false
|
||
}
|
||
return s.ComposePath, true
|
||
}
|
||
|
||
func (a *stackAdapter) ListDeployedStacks() []backup.StackSummary {
|
||
var result []backup.StackSummary
|
||
for _, s := range a.mgr.GetStacks() {
|
||
if !s.Deployed {
|
||
continue
|
||
}
|
||
result = append(result, backup.StackSummary{
|
||
Name: s.Name,
|
||
DisplayName: s.Meta.DisplayName,
|
||
ComposePath: s.ComposePath,
|
||
NeedsHDD: s.Meta.Resources.NeedsHDD,
|
||
HasVolumes: len(backup.ParseComposeNamedVolumes(s.ComposePath)) > 0,
|
||
})
|
||
}
|
||
return result
|
||
}
|
||
|
||
func (a *stackAdapter) StopStack(name string) error {
|
||
return a.mgr.StopStack(name)
|
||
}
|
||
|
||
func (a *stackAdapter) StartStack(name string) error {
|
||
return a.mgr.StartStack(name)
|
||
}
|
||
|
||
func (a *stackAdapter) GetStackHDDMounts(name string) []string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return nil
|
||
}
|
||
|
||
// Priority 1: Read the app's own HDD_PATH from its app.yaml
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
appCfg := stacks.LoadAppConfig(stackDir)
|
||
if appCfg != nil && appCfg.Env["HDD_PATH"] != "" {
|
||
return stacks.ParseComposeHDDMounts(s.ComposePath, appCfg.Env["HDD_PATH"])
|
||
}
|
||
|
||
// Priority 2: Try all registered storage paths (fallback)
|
||
var allMounts []string
|
||
seen := make(map[string]bool)
|
||
for _, sp := range a.getStoragePaths() {
|
||
mounts := stacks.ParseComposeHDDMounts(s.ComposePath, sp.Path)
|
||
for _, m := range mounts {
|
||
if !seen[m] {
|
||
seen[m] = true
|
||
allMounts = append(allMounts, m)
|
||
}
|
||
}
|
||
}
|
||
return allMounts
|
||
}
|
||
|
||
func (a *stackAdapter) GetDockerVolumes(name string) []string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return nil
|
||
}
|
||
return backup.ResolveDockerVolumeNames(s.ComposePath)
|
||
}
|
||
|
||
func (a *stackAdapter) GetStackHDDPath(name string) string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return ""
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
appCfg := stacks.LoadAppConfig(stackDir)
|
||
if appCfg != nil && appCfg.Env["HDD_PATH"] != "" {
|
||
return filepath.Clean(appCfg.Env["HDD_PATH"])
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// GetStackRecoveryInfo gathers the SECRET-FREE inputs for an app's recovery unit (Phase 2): the
|
||
// stack dir, pinned image tags, the non-secret env, and the NAMES of secret/data-key env vars.
|
||
// It deliberately does NOT decrypt or return any secret value — secret/password fields are stored
|
||
// encrypted in app.yaml, so excluding them (plus a defensive crypto.IsEncrypted guard) yields a
|
||
// plaintext, secret-free env. The actual secret values are recovered at restore time from the
|
||
// guest's own app.yaml (live, or via the PBS whole-guest snapshot), never from the unit.
|
||
func (a *stackAdapter) GetStackRecoveryInfo(name string) (backup.RecoveryInfo, bool) {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return backup.RecoveryInfo{}, false
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
meta := stacks.LoadMetadata(stackDir)
|
||
|
||
// Secret set = all secret/password fields ∪ any data_key fields (in deterministic metadata order).
|
||
secretSet := make(map[string]bool)
|
||
var secretNames []string
|
||
add := func(v string) {
|
||
if !secretSet[v] {
|
||
secretSet[v] = true
|
||
secretNames = append(secretNames, v)
|
||
}
|
||
}
|
||
for _, v := range stacks.SensitiveEnvVars(&meta) {
|
||
add(v)
|
||
}
|
||
dataKeys := meta.DataKeyEnvVars()
|
||
for _, v := range dataKeys {
|
||
add(v)
|
||
}
|
||
|
||
// Non-secret env: raw app.yaml values that are neither named-secret nor (defensively) encrypted.
|
||
nonSecret := make(map[string]string)
|
||
if appCfg := stacks.LoadAppConfig(stackDir); appCfg != nil {
|
||
for k, v := range appCfg.Env {
|
||
if secretSet[k] || crypto.IsEncrypted(v) {
|
||
continue
|
||
}
|
||
nonSecret[k] = v
|
||
}
|
||
}
|
||
|
||
return backup.RecoveryInfo{
|
||
StackDir: stackDir,
|
||
DisplayName: s.Meta.DisplayName,
|
||
ImagePins: backup.ParseComposeImages(s.ComposePath),
|
||
NonSecretEnv: nonSecret,
|
||
SecretEnvVars: secretNames,
|
||
DataKeyEnvVars: dataKeys,
|
||
}, true
|
||
}
|
||
|
||
// RecoverStackSecrets returns the live decrypted values for the named secret env vars present in the
|
||
// stack's app.yaml (the guest's own — live rootfs or PBS-restored). Absent/empty names are omitted;
|
||
// the caller's fail-closed gate decides. Secrets come from the guest, never from the recovery unit.
|
||
func (a *stackAdapter) RecoverStackSecrets(name string, names []string) map[string]string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return nil
|
||
}
|
||
cfg := stacks.LoadAppConfigDecrypted(filepath.Dir(s.ComposePath), a.encKey)
|
||
if cfg == nil {
|
||
return nil
|
||
}
|
||
out := make(map[string]string)
|
||
for _, n := range names {
|
||
if v, ok := cfg.Env[n]; ok && v != "" {
|
||
out[n] = v
|
||
}
|
||
}
|
||
return out
|
||
}
|
||
|
||
// RecreateStackFromUnit restores the app definition from the unit's compose dir into the stack dir,
|
||
// then redeploys with the reconstructed full env (re-pulling the pinned image). Secrets in fullEnv were
|
||
// recovered from the guest, never regenerated.
|
||
func (a *stackAdapter) RecreateStackFromUnit(name, composeSrcDir string, fullEnv map[string]string) error {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return fmt.Errorf("stack %q not found", name)
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
// Recover the app definition from the unit (compose + .felhom.yml) into the stack dir.
|
||
for _, fname := range []string{"docker-compose.yml", ".felhom.yml"} {
|
||
data, err := os.ReadFile(filepath.Join(composeSrcDir, fname))
|
||
if err != nil {
|
||
continue // capture whichever existed
|
||
}
|
||
if err := os.WriteFile(filepath.Join(stackDir, fname), data, 0644); err != nil {
|
||
return fmt.Errorf("restoring %s from unit: %w", fname, err)
|
||
}
|
||
}
|
||
return a.mgr.RedeployFromEnv(name, fullEnv)
|
||
}
|
||
|
||
// RefreshAndIsRunning forces a docker ps scan before checking state.
|
||
// Called during post-restore health check (~every 5s for up to 90s).
|
||
// Full refresh is acceptable here since restores are rare operations.
|
||
func (a *stackAdapter) RefreshAndIsRunning(name string) bool {
|
||
a.mgr.RefreshStatus()
|
||
s, ok := a.mgr.GetStack(name)
|
||
return ok && s.State == stacks.StateRunning
|
||
}
|
||
|
||
// integrationStackAdapter implements integrations.StackProvider using stacks.Manager.
|
||
type integrationStackAdapter struct {
|
||
mgr *stacks.Manager
|
||
}
|
||
|
||
func (a *integrationStackAdapter) GetStack(name string) (*stacks.Stack, bool) {
|
||
return a.mgr.GetStack(name)
|
||
}
|
||
|
||
func (a *integrationStackAdapter) GetStacks() []stacks.Stack {
|
||
return a.mgr.GetStacks()
|
||
}
|
||
|
||
func (a *integrationStackAdapter) RestartStack(name string) error {
|
||
return a.mgr.RestartStack(name)
|
||
}
|
||
|
||
// geoStackAdapter implements cloudflare.StackLister for geo-restriction sync.
|
||
type geoStackAdapter struct {
|
||
mgr *stacks.Manager
|
||
domain string
|
||
}
|
||
|
||
func (a *geoStackAdapter) GetDeployedHostnames() map[string]string {
|
||
result := make(map[string]string)
|
||
for _, stack := range a.mgr.GetStacks() {
|
||
if !stack.Deployed {
|
||
continue
|
||
}
|
||
subdomain := stack.Meta.Subdomain
|
||
// Check for custom subdomain in app.yaml
|
||
if appCfg := a.mgr.LoadAppConfigByName(stack.Name); appCfg != nil {
|
||
if sd, ok := appCfg.Env["SUBDOMAIN"]; ok && sd != "" {
|
||
subdomain = sd
|
||
}
|
||
}
|
||
if subdomain != "" {
|
||
result[stack.Name] = subdomain + "." + a.domain
|
||
}
|
||
}
|
||
return result
|
||
}
|
||
|
||
// exportAdapter implements appexport.ExportStackProvider using stacks.Manager.
|
||
type exportAdapter struct {
|
||
mgr *stacks.Manager
|
||
encKey []byte
|
||
}
|
||
|
||
func (a *exportAdapter) GetStackDir(name string) (string, bool) {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return "", false
|
||
}
|
||
return filepath.Dir(s.ComposePath), true
|
||
}
|
||
|
||
func (a *exportAdapter) GetStackComposePath(name string) (string, bool) {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return "", false
|
||
}
|
||
return s.ComposePath, true
|
||
}
|
||
|
||
func (a *exportAdapter) GetStackHDDMounts(name string) []string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return nil
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
appCfg := stacks.LoadAppConfig(stackDir)
|
||
if appCfg != nil && appCfg.Env["HDD_PATH"] != "" {
|
||
return stacks.ParseComposeHDDMounts(s.ComposePath, appCfg.Env["HDD_PATH"])
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (a *exportAdapter) GetStackHDDPath(name string) string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return ""
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
appCfg := stacks.LoadAppConfig(stackDir)
|
||
if appCfg != nil && appCfg.Env["HDD_PATH"] != "" {
|
||
return filepath.Clean(appCfg.Env["HDD_PATH"])
|
||
}
|
||
return ""
|
||
}
|
||
|
||
func (a *exportAdapter) IsStackRunning(name string) bool {
|
||
s, ok := a.mgr.GetStack(name)
|
||
return ok && s.State == stacks.StateRunning
|
||
}
|
||
|
||
func (a *exportAdapter) StopStack(name string) error {
|
||
return a.mgr.StopStack(name)
|
||
}
|
||
|
||
func (a *exportAdapter) StartStack(name string) error {
|
||
return a.mgr.StartStack(name)
|
||
}
|
||
|
||
func (a *exportAdapter) GetStackDisplayName(name string) string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return name
|
||
}
|
||
return s.Meta.DisplayName
|
||
}
|
||
|
||
func (a *exportAdapter) GetStackNeedsHDD(name string) bool {
|
||
s, ok := a.mgr.GetStack(name)
|
||
return ok && s.Meta.Resources.NeedsHDD
|
||
}
|
||
|
||
func (a *exportAdapter) GetDockerVolumes(name string) []string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return nil
|
||
}
|
||
return backup.ResolveDockerVolumeNames(s.ComposePath)
|
||
}
|
||
|
||
func (a *exportAdapter) IsStackDeployed(name string) bool {
|
||
s, ok := a.mgr.GetStack(name)
|
||
return ok && s.Deployed
|
||
}
|
||
|
||
func (a *exportAdapter) GetDecryptedEnv(name string) map[string]string {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return nil
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
cfg := stacks.LoadAppConfigDecrypted(stackDir, a.encKey)
|
||
if cfg == nil {
|
||
return nil
|
||
}
|
||
return cfg.Env
|
||
}
|
||
|
||
func (a *exportAdapter) GetStacksBaseDir() string {
|
||
return a.mgr.GetStacksBaseDir()
|
||
}
|
||
|
||
func (a *exportAdapter) SaveEncryptedAppConfig(stackDir string, env map[string]string) error {
|
||
meta := stacks.LoadMetadata(stackDir)
|
||
sensitiveVars := stacks.SensitiveEnvVars(&meta)
|
||
cfg := &stacks.AppConfig{
|
||
Deployed: true,
|
||
DeployedAt: time.Now().Format(time.RFC3339),
|
||
Env: env,
|
||
}
|
||
return stacks.SaveAppConfig(stackDir, cfg, a.encKey, sensitiveVars)
|
||
}
|
||
|
||
func (a *exportAdapter) RefreshStacks() error {
|
||
return a.mgr.RefreshStatus()
|
||
}
|
||
|
||
func (a *exportAdapter) RemoveStackVolumes(name string) error {
|
||
s, ok := a.mgr.GetStack(name)
|
||
if !ok {
|
||
return fmt.Errorf("stack %q not found", name)
|
||
}
|
||
stackDir := filepath.Dir(s.ComposePath)
|
||
|
||
// Build env from decrypted app config
|
||
cmdEnv := os.Environ()
|
||
appCfg := stacks.LoadAppConfigDecrypted(stackDir, a.encKey)
|
||
if appCfg != nil {
|
||
for k, v := range appCfg.Env {
|
||
cmdEnv = append(cmdEnv, k+"="+v)
|
||
}
|
||
}
|
||
|
||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||
defer cancel()
|
||
cmd := exec.CommandContext(ctx, "docker", "compose", "down", "--volumes")
|
||
cmd.Dir = stackDir
|
||
cmd.Env = cmdEnv
|
||
out, err := cmd.CombinedOutput()
|
||
if err != nil {
|
||
return fmt.Errorf("compose down --volumes: %s — %w", strings.TrimSpace(string(out)), err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// fileExists reports whether os.Stat succeeds for path (file or directory).
// Any Stat error, not only "does not exist", yields false.
func fileExists(path string) bool {
	if _, statErr := os.Stat(path); statErr != nil {
		return false
	}
	return true
}
|
||
|
||
// quiesceBackend adapts *agentapi.Client to quiesce.Backend (bool/string, decoupled from the
|
||
// agentapi response structs).
|
||
type quiesceBackend struct{ c *agentapi.Client }
|
||
|
||
func (b quiesceBackend) Due(ctx context.Context) (bool, error) {
|
||
r, err := b.c.BackupDue(ctx)
|
||
return r.Due, err
|
||
}
|
||
func (b quiesceBackend) StartBackup(ctx context.Context) (string, error) {
|
||
r, err := b.c.StartBackup(ctx)
|
||
return r.JobID, err
|
||
}
|
||
func (b quiesceBackend) BackupStatus(ctx context.Context) (string, error) {
|
||
r, err := b.c.BackupStatus(ctx)
|
||
return r.Phase, err
|
||
}
|
||
|
||
// startQuiesceLoop wires + starts the slice-8B quiesce loop when the local API is configured and
|
||
// quiesce is enabled. It Recovers (restarts stacks left stopped by a mid-quiesce crash) before
|
||
// starting the loop goroutine. Non-fatal: any misconfig disables the loop with a log line.
|
||
func startQuiesceLoop(ctx context.Context, cfg *config.Config, stackMgr *stacks.Manager, logger *log.Logger) *quiesce.Loop {
|
||
if cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "" {
|
||
return nil // not a provisioned guest — no agent to back up against
|
||
}
|
||
if !cfg.Quiesce.QuiesceEnabled() {
|
||
logger.Printf("[INFO] [quiesce] disabled by config")
|
||
return nil
|
||
}
|
||
client, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint)
|
||
if err != nil {
|
||
logger.Printf("[WARN] [quiesce] disabled (agent client init failed): %v", err)
|
||
return nil
|
||
}
|
||
poll := parseDurationOr(cfg.Quiesce.PollInterval, 5*time.Minute)
|
||
statusPoll := parseDurationOr(cfg.Quiesce.StatusPoll, 10*time.Second)
|
||
maxQuiesce := parseDurationOr(cfg.Quiesce.MaxQuiesce, 30*time.Minute)
|
||
loop := quiesce.New(quiesce.Options{
|
||
Backend: quiesceBackend{c: client},
|
||
Stacks: stackMgr,
|
||
MarkerPath: filepath.Join(cfg.Paths.DataDir, "quiesce-state.json"),
|
||
Poll: poll,
|
||
StatusPoll: statusPoll,
|
||
MaxQuiesce: maxQuiesce,
|
||
Logger: logger,
|
||
})
|
||
loop.Recover() // crash-safety: restart any stacks stranded-down by a mid-quiesce crash
|
||
go loop.Run(ctx)
|
||
return loop
|
||
}
|
||
|
||
// parseDurationOr parses s with time.ParseDuration and returns the result when
// it is strictly positive. An empty string, a parse error, or a zero/negative
// duration all yield def instead.
func parseDurationOr(s string, def time.Duration) time.Duration {
	if s != "" {
		if parsed, err := time.ParseDuration(s); err == nil && parsed > 0 {
			return parsed
		}
	}
	return def
}
|
||
|
||
// probeLocalAPI proves the controller↔agent local-API channel at startup and logs this guest's
|
||
// mounts (slice 8A). Non-fatal: it only runs when a local-API endpoint is configured, and any
|
||
// error is logged for diagnosis without affecting the controller's boot. The leaf SHA-256 from
|
||
// the bootstrap is pinned by the client (fails closed on mismatch).
|
||
func probeLocalAPI(cfg *config.Config, logger *log.Logger) {
|
||
if cfg.LocalAPI.Endpoint == "" || cfg.LocalAPI.Token == "" {
|
||
return
|
||
}
|
||
client, err := agentapi.New(cfg.LocalAPI.Endpoint, cfg.LocalAPI.Token, cfg.LocalAPI.Fingerprint)
|
||
if err != nil {
|
||
logger.Printf("[WARN] local-api: client init failed (%v) — channel not verified", err)
|
||
return
|
||
}
|
||
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
||
defer cancel()
|
||
resp, err := client.Storage(ctx)
|
||
if err != nil {
|
||
logger.Printf("[WARN] local-api: GET /storage failed (%v) — channel not verified", err)
|
||
return
|
||
}
|
||
logger.Printf("[INFO] local-api: channel up (agent %s) — guest %d, %d mount(s) visible",
|
||
cfg.LocalAPI.Endpoint, resp.VMID, len(resp.Mounts))
|
||
for _, m := range resp.Mounts {
|
||
logger.Printf("[INFO] local-api: mount %s → %s (storage=%s, class=%s, backup=%v)",
|
||
m.Key, m.MountPoint, m.Storage, m.Class, m.Backup)
|
||
}
|
||
}
|
||
|
||
// runSetupMode starts the setup wizard on dual listeners and blocks until signal.
|
||
func runSetupMode(cfg *config.Config, logger *log.Logger) {
|
||
ips := setup.DetectLocalIPs()
|
||
setup.LogSetupMode(cfg.Customer.Domain, ips, cfg.Web.SetupListen, logger)
|
||
|
||
setupSrv := setup.NewServer(cfg, cfg.Paths.DataDir, logger, Version)
|
||
handler := setupSrv.Handler()
|
||
|
||
// Health endpoint wrapper (returns setup_mode: true)
|
||
healthHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||
w.Header().Set("Content-Type", "application/json")
|
||
json.NewEncoder(w).Encode(map[string]interface{}{
|
||
"ok": true, "message": "felhom-controller is healthy",
|
||
"setup_mode": true, "version": Version,
|
||
})
|
||
})
|
||
|
||
// Mux for both listeners
|
||
mux := http.NewServeMux()
|
||
mux.HandleFunc("/api/health", healthHandler)
|
||
mux.Handle("/", handler)
|
||
|
||
// Start main listener (:8080, behind Traefik for domain access)
|
||
mainServer := &http.Server{
|
||
Addr: cfg.Web.Listen,
|
||
Handler: mux,
|
||
ReadTimeout: 30 * time.Second,
|
||
WriteTimeout: 60 * time.Second,
|
||
IdleTimeout: 120 * time.Second,
|
||
}
|
||
go func() {
|
||
logger.Printf("[INFO] Setup wizard (main) listening on %s", cfg.Web.Listen)
|
||
if err := mainServer.ListenAndServe(); err != http.ErrServerClosed {
|
||
logger.Printf("[ERROR] Main HTTP server error: %v", err)
|
||
}
|
||
}()
|
||
|
||
// Start setup-only listener (:8081, direct HTTP for LAN access)
|
||
setupServer := &http.Server{
|
||
Addr: cfg.Web.SetupListen,
|
||
Handler: mux,
|
||
ReadTimeout: 30 * time.Second,
|
||
WriteTimeout: 60 * time.Second,
|
||
IdleTimeout: 120 * time.Second,
|
||
}
|
||
go func() {
|
||
logger.Printf("[INFO] Setup wizard (LAN) listening on %s", cfg.Web.SetupListen)
|
||
if err := setupServer.ListenAndServe(); err != http.ErrServerClosed {
|
||
logger.Printf("[ERROR] Setup HTTP server error: %v", err)
|
||
}
|
||
}()
|
||
|
||
// Wait for signal
|
||
sigCh := make(chan os.Signal, 1)
|
||
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||
sig := <-sigCh
|
||
logger.Printf("[INFO] Received signal %v, shutting down setup wizard...", sig)
|
||
|
||
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||
defer shutdownCancel()
|
||
mainServer.Shutdown(shutdownCtx)
|
||
setupServer.Shutdown(shutdownCtx)
|
||
logger.Println("[INFO] Setup wizard stopped")
|
||
}
|
||
|
||
// discoverHDDPaths scans deployed apps' app.yaml for HDD_PATH env values.
|
||
func discoverHDDPaths(stacksDir string, logger *log.Logger) []string {
|
||
entries, err := os.ReadDir(stacksDir)
|
||
if err != nil {
|
||
logger.Printf("[WARN] Cannot read stacks dir for HDD path discovery: %v", err)
|
||
return nil
|
||
}
|
||
seen := make(map[string]bool)
|
||
var paths []string
|
||
for _, e := range entries {
|
||
if !e.IsDir() {
|
||
continue
|
||
}
|
||
appCfg := stacks.LoadAppConfig(filepath.Join(stacksDir, e.Name()))
|
||
if appCfg == nil || !appCfg.Deployed {
|
||
continue
|
||
}
|
||
if hddPath, ok := appCfg.Env["HDD_PATH"]; ok && hddPath != "" {
|
||
cleaned := filepath.Clean(hddPath)
|
||
if !seen[cleaned] {
|
||
seen[cleaned] = true
|
||
paths = append(paths, cleaned)
|
||
}
|
||
}
|
||
}
|
||
return paths
|
||
}
|