97074e7a0c
- Add heartbeat ping (every 5 min, controller alive signal) - Add backup integrity check (weekly restic check, Sunday 04:00) - Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig - Add HubConfig for central hub reporting - Add report package (types, builder, pusher) for hub push - Wire hub reporting into scheduler (configurable interval) - Update controller.yaml.example with new monitoring + hub sections - Add monitoring/DEPRECATED.md for legacy bash scripts Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
268 lines
8.2 KiB
Go
268 lines
8.2 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"flag"
|
|
"fmt"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/api"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
|
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/web"
|
|
)
|
|
|
|
// Build metadata. Each value is set at build time via ldflags; the literals
// below are the defaults that remain when nothing is injected.

// Version is the controller version string.
var Version = "dev"

// BuildTime is the timestamp of the build.
var BuildTime = "unknown"

// GitCommit is the commit the binary was built from.
var GitCommit = "unknown"
|
|
|
|
func main() {
|
|
configPath := flag.String("config", "/opt/docker/felhom-controller/controller.yaml", "Path to configuration file")
|
|
showVersion := flag.Bool("version", false, "Show version and exit")
|
|
flag.Parse()
|
|
|
|
if *showVersion {
|
|
fmt.Printf("felhom-controller %s (built %s, commit %s)\n", Version, BuildTime, GitCommit)
|
|
os.Exit(0)
|
|
}
|
|
|
|
// --- Load configuration ---
|
|
cfg, err := config.Load(*configPath)
|
|
if err != nil {
|
|
log.Fatalf("[FATAL] Failed to load config from %s: %v", *configPath, err)
|
|
}
|
|
|
|
logger := setupLogger(cfg)
|
|
logger.Printf("[INFO] felhom-controller %s starting (customer: %s, domain: %s)",
|
|
Version, cfg.Customer.ID, cfg.Customer.Domain)
|
|
|
|
// --- Initialize stack manager ---
|
|
stackMgr, err := stacks.NewManager(cfg, logger)
|
|
if err != nil {
|
|
logger.Fatalf("[FATAL] Failed to initialize stack manager: %v", err)
|
|
}
|
|
|
|
// Initial stack scan
|
|
if err := stackMgr.ScanStacks(); err != nil {
|
|
logger.Printf("[WARN] Initial stack scan failed: %v", err)
|
|
}
|
|
|
|
// --- Initialize catalog syncer ---
|
|
syncer := catalogsync.New(cfg, logger, stackMgr.ScanStacks)
|
|
syncer.Start()
|
|
defer syncer.Stop()
|
|
|
|
// --- Graceful shutdown context ---
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
defer cancel()
|
|
|
|
// --- Start CPU collector ---
|
|
cpuCollector := system.NewCPUCollector(5 * time.Second)
|
|
cpuCollector.Start(ctx)
|
|
defer cpuCollector.Stop()
|
|
|
|
// --- Initialize metrics store + collector ---
|
|
metricsDBPath := "/opt/docker/felhom-controller/data/metrics.db"
|
|
metricsStore, err := metrics.NewMetricsStore(metricsDBPath, logger)
|
|
if err != nil {
|
|
logger.Printf("[WARN] Failed to initialize metrics store: %v — monitoring disabled", err)
|
|
} else {
|
|
logger.Printf("[INFO] Metrics store opened at %s", metricsDBPath)
|
|
}
|
|
|
|
if metricsStore != nil {
|
|
defer metricsStore.Close()
|
|
metricsCollector := metrics.NewMetricsCollector(metricsStore, cpuCollector, cfg.Paths.HDDPath, logger)
|
|
metricsCollector.Start(ctx)
|
|
defer metricsCollector.Stop()
|
|
logger.Println("[INFO] Metrics collector started (60s interval)")
|
|
}
|
|
|
|
// --- Initialize health pinger ---
|
|
pinger := monitor.NewPinger(&cfg.Monitoring, logger)
|
|
|
|
// --- Initialize backup manager ---
|
|
var backupMgr *backup.Manager
|
|
if cfg.Backup.Enabled {
|
|
backupMgr = backup.NewManager(cfg, pinger, logger)
|
|
backupMgr.AfterBackup = func() {
|
|
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
|
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
|
|
backupMgr.RefreshCache(nextDBDump, nextBackup)
|
|
}
|
|
go backupMgr.LoadSnapshotHistory()
|
|
}
|
|
|
|
// --- Initialize scheduler ---
|
|
sched := scheduler.New(logger)
|
|
|
|
// Existing periodic tasks (migrated from ad-hoc goroutines)
|
|
sched.Every("status-refresh", 30*time.Second, func(ctx context.Context) error {
|
|
return stackMgr.RefreshStatus()
|
|
})
|
|
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
|
|
return stackMgr.ScanStacks()
|
|
})
|
|
|
|
// Heartbeat — lightweight "I'm alive" signal
|
|
sched.Every("heartbeat", 5*time.Minute, func(ctx context.Context) error {
|
|
pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "")
|
|
return nil
|
|
})
|
|
|
|
// System health ping
|
|
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
|
|
if err != nil {
|
|
healthInterval = 5 * time.Minute
|
|
}
|
|
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
|
healthReport := monitor.RunHealthCheck(cfg, cpuCollector)
|
|
body := healthReport.FormatMessage()
|
|
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
|
if healthReport.Status == "fail" {
|
|
pinger.Fail(healthUUID, body)
|
|
} else {
|
|
pinger.Ping(healthUUID, body)
|
|
}
|
|
return nil
|
|
})
|
|
|
|
// Backup daily jobs
|
|
if cfg.Backup.Enabled && backupMgr != nil {
|
|
sched.Daily("db-dump", cfg.Backup.DBDumpSchedule, func(ctx context.Context) error {
|
|
return backupMgr.RunDBDumps(ctx)
|
|
})
|
|
sched.Daily("backup", cfg.Backup.ResticSchedule, func(ctx context.Context) error {
|
|
return backupMgr.RunBackup(ctx)
|
|
})
|
|
|
|
// Weekly integrity check — Sunday 04:00
|
|
sched.Daily("backup-integrity", "04:00", func(ctx context.Context) error {
|
|
if time.Now().Weekday() != time.Sunday {
|
|
return nil
|
|
}
|
|
return backupMgr.RunIntegrityCheck(ctx)
|
|
})
|
|
|
|
// Cache refresh: every 5 minutes
|
|
sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
|
|
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
|
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
|
|
backupMgr.RefreshCache(nextDBDump, nextBackup)
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// Metrics prune — daily at 04:00
|
|
if metricsStore != nil {
|
|
sched.Daily("metrics-prune", "04:00", func(ctx context.Context) error {
|
|
deleted, err := metricsStore.Prune(30 * 24 * time.Hour)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
logger.Printf("[INFO] Pruned %d old metric rows", deleted)
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// --- Central hub reporting ---
|
|
if cfg.Hub.Enabled && cfg.Hub.URL != "" {
|
|
pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval)
|
|
if err != nil {
|
|
pushInterval = 15 * time.Minute
|
|
}
|
|
pusher := report.NewPusher(&cfg.Hub, logger)
|
|
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
|
|
r := report.BuildReport(cfg, stackMgr, backupMgr, cpuCollector, metricsStore, Version)
|
|
return pusher.Push(r)
|
|
})
|
|
logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL)
|
|
}
|
|
|
|
sched.Start(ctx)
|
|
defer sched.Stop()
|
|
|
|
// Initial backup cache population (don't block startup)
|
|
if cfg.Backup.Enabled && backupMgr != nil {
|
|
go func() {
|
|
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
|
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
|
|
backupMgr.RefreshCache(nextDBDump, nextBackup)
|
|
}()
|
|
}
|
|
|
|
// --- Initialize API router ---
|
|
apiRouter := api.NewRouter(cfg, stackMgr, syncer, cpuCollector, backupMgr, metricsStore, logger)
|
|
|
|
// --- Initialize web server ---
|
|
webServer := web.NewServer(cfg, stackMgr, cpuCollector, backupMgr, sched, logger, Version)
|
|
|
|
// --- Build HTTP mux ---
|
|
mux := http.NewServeMux()
|
|
|
|
// API routes (no auth for health endpoint, auth for everything else)
|
|
mux.HandleFunc("/api/health", apiRouter.HealthHandler)
|
|
mux.Handle("/api/", webServer.RequireAuth(http.HandlerFunc(apiRouter.ServeHTTP)))
|
|
|
|
// Web UI routes (auth required)
|
|
mux.Handle("/", webServer.RequireAuth(http.HandlerFunc(webServer.ServeHTTP)))
|
|
|
|
// --- Start HTTP server ---
|
|
server := &http.Server{
|
|
Addr: cfg.Web.Listen,
|
|
Handler: mux,
|
|
ReadTimeout: 30 * time.Second,
|
|
WriteTimeout: 60 * time.Second,
|
|
IdleTimeout: 120 * time.Second,
|
|
}
|
|
|
|
sigCh := make(chan os.Signal, 1)
|
|
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
go func() {
|
|
sig := <-sigCh
|
|
logger.Printf("[INFO] Received signal %v, shutting down...", sig)
|
|
cancel()
|
|
|
|
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second)
|
|
defer shutdownCancel()
|
|
|
|
if err := server.Shutdown(shutdownCtx); err != nil {
|
|
logger.Printf("[ERROR] HTTP server shutdown error: %v", err)
|
|
}
|
|
}()
|
|
|
|
logger.Printf("[INFO] Web UI listening on %s", cfg.Web.Listen)
|
|
if err := server.ListenAndServe(); err != http.ErrServerClosed {
|
|
logger.Fatalf("[FATAL] HTTP server error: %v", err)
|
|
}
|
|
|
|
logger.Println("[INFO] felhom-controller stopped")
|
|
}
|
|
|
|
func setupLogger(cfg *config.Config) *log.Logger {
|
|
// For now, log to stdout. File logging will be added later.
|
|
logger := log.New(os.Stdout, "", log.LstdFlags)
|
|
|
|
if cfg.Logging.Level == "debug" {
|
|
logger.SetFlags(log.LstdFlags | log.Lshortfile)
|
|
}
|
|
|
|
return logger
|
|
}
|