Files
deploy-felhom-compose/controller/cmd/controller/main.go
T
admin 97074e7a0c v0.6.0: healthcheck + hub reporting implementation
- Add heartbeat ping (every 5 min, controller alive signal)
- Add backup integrity check (weekly restic check, Sunday 04:00)
- Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig
- Add HubConfig for central hub reporting
- Add report package (types, builder, pusher) for hub push
- Wire hub reporting into scheduler (configurable interval)
- Update controller.yaml.example with new monitoring + hub sections
- Add monitoring/DEPRECATED.md for legacy bash scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 13:19:08 +01:00

268 lines
8.2 KiB
Go

package main
import (
"context"
"flag"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"syscall"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/api"
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
"gitea.dooplex.hu/admin/felhom-controller/internal/web"
)
var (
// Set at build time via ldflags
Version = "dev"
BuildTime = "unknown"
GitCommit = "unknown"
)
func main() {
configPath := flag.String("config", "/opt/docker/felhom-controller/controller.yaml", "Path to configuration file")
showVersion := flag.Bool("version", false, "Show version and exit")
flag.Parse()
if *showVersion {
fmt.Printf("felhom-controller %s (built %s, commit %s)\n", Version, BuildTime, GitCommit)
os.Exit(0)
}
// --- Load configuration ---
cfg, err := config.Load(*configPath)
if err != nil {
log.Fatalf("[FATAL] Failed to load config from %s: %v", *configPath, err)
}
logger := setupLogger(cfg)
logger.Printf("[INFO] felhom-controller %s starting (customer: %s, domain: %s)",
Version, cfg.Customer.ID, cfg.Customer.Domain)
// --- Initialize stack manager ---
stackMgr, err := stacks.NewManager(cfg, logger)
if err != nil {
logger.Fatalf("[FATAL] Failed to initialize stack manager: %v", err)
}
// Initial stack scan
if err := stackMgr.ScanStacks(); err != nil {
logger.Printf("[WARN] Initial stack scan failed: %v", err)
}
// --- Initialize catalog syncer ---
syncer := catalogsync.New(cfg, logger, stackMgr.ScanStacks)
syncer.Start()
defer syncer.Stop()
// --- Graceful shutdown context ---
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// --- Start CPU collector ---
cpuCollector := system.NewCPUCollector(5 * time.Second)
cpuCollector.Start(ctx)
defer cpuCollector.Stop()
// --- Initialize metrics store + collector ---
metricsDBPath := "/opt/docker/felhom-controller/data/metrics.db"
metricsStore, err := metrics.NewMetricsStore(metricsDBPath, logger)
if err != nil {
logger.Printf("[WARN] Failed to initialize metrics store: %v — monitoring disabled", err)
} else {
logger.Printf("[INFO] Metrics store opened at %s", metricsDBPath)
}
if metricsStore != nil {
defer metricsStore.Close()
metricsCollector := metrics.NewMetricsCollector(metricsStore, cpuCollector, cfg.Paths.HDDPath, logger)
metricsCollector.Start(ctx)
defer metricsCollector.Stop()
logger.Println("[INFO] Metrics collector started (60s interval)")
}
// --- Initialize health pinger ---
pinger := monitor.NewPinger(&cfg.Monitoring, logger)
// --- Initialize backup manager ---
var backupMgr *backup.Manager
if cfg.Backup.Enabled {
backupMgr = backup.NewManager(cfg, pinger, logger)
backupMgr.AfterBackup = func() {
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
backupMgr.RefreshCache(nextDBDump, nextBackup)
}
go backupMgr.LoadSnapshotHistory()
}
// --- Initialize scheduler ---
sched := scheduler.New(logger)
// Existing periodic tasks (migrated from ad-hoc goroutines)
sched.Every("status-refresh", 30*time.Second, func(ctx context.Context) error {
return stackMgr.RefreshStatus()
})
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
return stackMgr.ScanStacks()
})
// Heartbeat — lightweight "I'm alive" signal
sched.Every("heartbeat", 5*time.Minute, func(ctx context.Context) error {
pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "")
return nil
})
// System health ping
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
if err != nil {
healthInterval = 5 * time.Minute
}
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
healthReport := monitor.RunHealthCheck(cfg, cpuCollector)
body := healthReport.FormatMessage()
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
if healthReport.Status == "fail" {
pinger.Fail(healthUUID, body)
} else {
pinger.Ping(healthUUID, body)
}
return nil
})
// Backup daily jobs
if cfg.Backup.Enabled && backupMgr != nil {
sched.Daily("db-dump", cfg.Backup.DBDumpSchedule, func(ctx context.Context) error {
return backupMgr.RunDBDumps(ctx)
})
sched.Daily("backup", cfg.Backup.ResticSchedule, func(ctx context.Context) error {
return backupMgr.RunBackup(ctx)
})
// Weekly integrity check — Sunday 04:00
sched.Daily("backup-integrity", "04:00", func(ctx context.Context) error {
if time.Now().Weekday() != time.Sunday {
return nil
}
return backupMgr.RunIntegrityCheck(ctx)
})
// Cache refresh: every 5 minutes
sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
backupMgr.RefreshCache(nextDBDump, nextBackup)
return nil
})
}
// Metrics prune — daily at 04:00
if metricsStore != nil {
sched.Daily("metrics-prune", "04:00", func(ctx context.Context) error {
deleted, err := metricsStore.Prune(30 * 24 * time.Hour)
if err != nil {
return err
}
logger.Printf("[INFO] Pruned %d old metric rows", deleted)
return nil
})
}
// --- Central hub reporting ---
if cfg.Hub.Enabled && cfg.Hub.URL != "" {
pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval)
if err != nil {
pushInterval = 15 * time.Minute
}
pusher := report.NewPusher(&cfg.Hub, logger)
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
r := report.BuildReport(cfg, stackMgr, backupMgr, cpuCollector, metricsStore, Version)
return pusher.Push(r)
})
logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL)
}
sched.Start(ctx)
defer sched.Stop()
// Initial backup cache population (don't block startup)
if cfg.Backup.Enabled && backupMgr != nil {
go func() {
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
backupMgr.RefreshCache(nextDBDump, nextBackup)
}()
}
// --- Initialize API router ---
apiRouter := api.NewRouter(cfg, stackMgr, syncer, cpuCollector, backupMgr, metricsStore, logger)
// --- Initialize web server ---
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, sched, logger, Version)
// --- Build HTTP mux ---
mux := http.NewServeMux()
// API routes (no auth for health endpoint, auth for everything else)
mux.HandleFunc("/api/health", apiRouter.HealthHandler)
mux.Handle("/api/", webServer.RequireAuth(http.HandlerFunc(apiRouter.ServeHTTP)))
// Web UI routes (auth required)
mux.Handle("/", webServer.RequireAuth(http.HandlerFunc(webServer.ServeHTTP)))
// --- Start HTTP server ---
server := &http.Server{
Addr: cfg.Web.Listen,
Handler: mux,
ReadTimeout: 30 * time.Second,
WriteTimeout: 60 * time.Second,
IdleTimeout: 120 * time.Second,
}
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
go func() {
sig := <-sigCh
logger.Printf("[INFO] Received signal %v, shutting down...", sig)
cancel()
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second)
defer shutdownCancel()
if err := server.Shutdown(shutdownCtx); err != nil {
logger.Printf("[ERROR] HTTP server shutdown error: %v", err)
}
}()
logger.Printf("[INFO] Web UI listening on %s", cfg.Web.Listen)
if err := server.ListenAndServe(); err != http.ErrServerClosed {
logger.Fatalf("[FATAL] HTTP server error: %v", err)
}
logger.Println("[INFO] felhom-controller stopped")
}
func setupLogger(cfg *config.Config) *log.Logger {
// For now, log to stdout. File logging will be added later.
logger := log.New(os.Stdout, "", log.LstdFlags)
if cfg.Logging.Level == "debug" {
logger.SetFlags(log.LstdFlags | log.Lshortfile)
}
return logger
}