diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go index 59cbfca..dbc91f1 100644 --- a/controller/cmd/controller/main.go +++ b/controller/cmd/controller/main.go @@ -16,6 +16,7 @@ import ( "gitea.dooplex.hu/admin/felhom-controller/internal/config" "gitea.dooplex.hu/admin/felhom-controller/internal/metrics" "gitea.dooplex.hu/admin/felhom-controller/internal/monitor" + "gitea.dooplex.hu/admin/felhom-controller/internal/report" "gitea.dooplex.hu/admin/felhom-controller/internal/scheduler" "gitea.dooplex.hu/admin/felhom-controller/internal/stacks" catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync" @@ -118,16 +119,22 @@ func main() { return stackMgr.ScanStacks() }) + // Heartbeat — lightweight "I'm alive" signal + sched.Every("heartbeat", 5*time.Minute, func(ctx context.Context) error { + pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "") + return nil + }) + // System health ping healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval) if err != nil { healthInterval = 5 * time.Minute } sched.Every("system-health", healthInterval, func(ctx context.Context) error { - report := monitor.RunHealthCheck(cfg, cpuCollector) - body := report.FormatMessage() + healthReport := monitor.RunHealthCheck(cfg, cpuCollector) + body := healthReport.FormatMessage() healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth - if report.Status == "fail" { + if healthReport.Status == "fail" { pinger.Fail(healthUUID, body) } else { pinger.Ping(healthUUID, body) @@ -144,6 +151,14 @@ func main() { return backupMgr.RunBackup(ctx) }) + // Weekly integrity check — Sunday 04:00 + sched.Daily("backup-integrity", "04:00", func(ctx context.Context) error { + if time.Now().Weekday() != time.Sunday { + return nil + } + return backupMgr.RunIntegrityCheck(ctx) + }) + // Cache refresh: every 5 minutes sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error { nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule) @@ 
-165,6 +180,20 @@ func main() { }) } + // --- Central hub reporting --- + if cfg.Hub.Enabled && cfg.Hub.URL != "" { + pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval) + if err != nil { + pushInterval = 15 * time.Minute + } + pusher := report.NewPusher(&cfg.Hub, logger) + sched.Every("hub-report", pushInterval, func(ctx context.Context) error { + r := report.BuildReport(cfg, stackMgr, backupMgr, cpuCollector, metricsStore, Version) + return pusher.Push(r) + }) + logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL) + } + sched.Start(ctx) defer sched.Stop() diff --git a/controller/configs/controller.yaml.example b/controller/configs/controller.yaml.example index 7f006f5..4b85833 100644 --- a/controller/configs/controller.yaml.example +++ b/controller/configs/controller.yaml.example @@ -80,9 +80,11 @@ monitoring: enabled: true healthchecks_base: "https://status.felhom.eu" ping_uuids: - db_dump: "CHANGEME-uuid-for-db-dump" - backup: "CHANGEME-uuid-for-backup" - system_health: "CHANGEME-uuid-for-system-health" + heartbeat: "" # Every 5 min — controller process alive + system_health: "CHANGEME-uuid-for-system-health" # Every 5 min — comprehensive system check + db_dump: "CHANGEME-uuid-for-db-dump" # Daily — after database dumps + backup: "CHANGEME-uuid-for-backup" # Daily — after restic snapshot + backup_integrity: "" # Weekly (Sunday) — restic check system_health_interval: "5m" health_check_schedule: "06:00" thresholds: @@ -93,6 +95,13 @@ monitoring: memory_warn_percent: 85 temperature_warn_celsius: 75 +# --- Central hub (operator dashboard) --- +hub: + enabled: false # Enable central reporting + url: "https://hub.felhom.eu" # Hub API endpoint + api_key: "" # Shared secret for authentication + push_interval: "15m" # How often to push reports + # --- Self-update --- self_update: enabled: true diff --git a/controller/internal/backup/backup.go b/controller/internal/backup/backup.go index 98eec88..c8632c9 100644 --- 
a/controller/internal/backup/backup.go +++ b/controller/internal/backup/backup.go @@ -270,6 +270,37 @@ func (m *Manager) RunBackup(ctx context.Context) error { return nil } +// RunIntegrityCheck runs restic check and pings healthchecks with the result. +func (m *Manager) RunIntegrityCheck(ctx context.Context) error { + m.logger.Printf("[INFO] Starting restic integrity check") + start := time.Now() + + if err := m.restic.EnsureInitialized(); err != nil { + m.logger.Printf("[ERROR] Restic init failed for integrity check: %v", err) + return err + } + + err := m.restic.Check() + duration := time.Since(start) + + uuid := m.cfg.Monitoring.PingUUIDs.BackupIntegrity + + m.mu.Lock() + m.lastCheckTime = time.Now() + m.lastCheckOK = err == nil + m.mu.Unlock() + + if err != nil { + m.logger.Printf("[ERROR] Restic integrity check failed (%s): %v", duration.Round(time.Second), err) + m.pinger.Fail(uuid, fmt.Sprintf("restic check failed: %v", err)) + return err + } + + m.logger.Printf("[INFO] Restic integrity check passed (%s)", duration.Round(time.Second)) + m.pinger.Ping(uuid, fmt.Sprintf("restic check passed (%s)", duration.Round(time.Second))) + return nil +} + // RunFullBackup runs DB dumps followed by restic backup. 
// HubConfig holds the settings for pushing status reports to the central hub
// (the `hub` section of controller.yaml).
//
//   - Enabled: reporting is scheduled only when this is true and URL is
//     non-empty.
//   - URL: base address of the hub; a trailing "/" is trimmed and reports are
//     POSTed to <URL>/api/v1/report.
//   - APIKey: when non-empty, sent as "Authorization: Bearer <APIKey>".
//   - PushInterval: a time.ParseDuration string such as "15m"; defaults to
//     "15m", and an unparsable value falls back to 15 minutes at startup.
type HubConfig struct {
	Enabled      bool   `yaml:"enabled"`
	URL          string `yaml:"url"`
	APIKey       string `yaml:"api_key"`
	PushInterval string `yaml:"push_interval"`
}
func Load(path string) (*Config, error) { data, err := os.ReadFile(path) @@ -198,6 +208,7 @@ func applyDefaults(cfg *Config) { di(&cfg.Monitoring.Thresholds.CPUWarnPercent, 90) di(&cfg.Monitoring.Thresholds.MemoryWarnPercent, 85) di(&cfg.Monitoring.Thresholds.TemperatureWarnCelsius, 75) + d(&cfg.Hub.PushInterval, "15m") d(&cfg.SelfUpdate.CheckInterval, "6h") di(&cfg.SelfUpdate.HealthTimeoutSeconds, 60) d(&cfg.Logging.Level, "info") diff --git a/controller/internal/report/builder.go b/controller/internal/report/builder.go new file mode 100644 index 0000000..367a787 --- /dev/null +++ b/controller/internal/report/builder.go @@ -0,0 +1,230 @@ +package report + +import ( + "strconv" + "strings" + "time" + + "gitea.dooplex.hu/admin/felhom-controller/internal/backup" + "gitea.dooplex.hu/admin/felhom-controller/internal/config" + "gitea.dooplex.hu/admin/felhom-controller/internal/metrics" + "gitea.dooplex.hu/admin/felhom-controller/internal/monitor" + "gitea.dooplex.hu/admin/felhom-controller/internal/scheduler" + "gitea.dooplex.hu/admin/felhom-controller/internal/stacks" + "gitea.dooplex.hu/admin/felhom-controller/internal/system" +) + +// BuildReport collects current state from all subsystems and returns a Report. 
+func BuildReport( + cfg *config.Config, + stackMgr *stacks.Manager, + backupMgr *backup.Manager, + cpuCollector *system.CPUCollector, + metricsStore *metrics.MetricsStore, + version string, +) *Report { + r := &Report{ + Version: 1, + CustomerID: cfg.Customer.ID, + CustomerName: cfg.Customer.Name, + ControllerVersion: version, + Timestamp: time.Now().UTC(), + } + + // System info + staticInfo := metrics.GetStaticInfo() + sysInfo := system.GetInfo(cfg.Paths.HDDPath, cpuCollector) + + r.System = SystemReport{ + Hostname: staticInfo.Hostname, + OS: staticInfo.OS, + Kernel: staticInfo.Kernel, + CPUModel: staticInfo.CPUModel, + CPUCores: staticInfo.CPUCores, + UptimeSeconds: staticInfo.UptimeSeconds, + CPUPercent: sysInfo.CPUPercent, + MemoryTotalMB: sysInfo.TotalMemMB, + MemoryUsedMB: sysInfo.UsedMemMB, + MemoryPercent: sysInfo.MemPercent, + TemperatureCelsius: sysInfo.TemperatureCelsius, + LoadAvg1: sysInfo.LoadAvg1, + LoadAvg5: sysInfo.LoadAvg5, + LoadAvg15: sysInfo.LoadAvg15, + } + + // Storage + r.Storage = []StorageReport{ + {Mount: "/", TotalGB: sysInfo.DiskTotalGB, UsedGB: sysInfo.DiskUsedGB, Percent: sysInfo.DiskPercent}, + } + if sysInfo.HDDConfigured { + r.Storage = append(r.Storage, StorageReport{ + Mount: cfg.Paths.HDDPath, + TotalGB: sysInfo.HDDTotalGB, + UsedGB: sysInfo.HDDUsedGB, + Percent: sysInfo.HDDPercent, + }) + } + + // Containers + r.Containers = buildContainerReport(stackMgr, metricsStore) + + // Backup + r.Backup = buildBackupReport(cfg, backupMgr) + + // Health + healthReport := monitor.RunHealthCheck(cfg, cpuCollector) + r.Health = HealthReport{ + Status: healthReport.Status, + Issues: healthReport.Issues, + Warnings: healthReport.Warnings, + } + if r.Health.Issues == nil { + r.Health.Issues = []string{} + } + if r.Health.Warnings == nil { + r.Health.Warnings = []string{} + } + + // Stacks + r.Stacks = buildStacksReport(stackMgr) + + return r +} + +func buildContainerReport(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore) 
ContainerReport { + cr := ContainerReport{} + + allStacks := stackMgr.GetStacks() + + // Build a map of container stats from metrics store + statsMap := make(map[string]metrics.ContainerCurrentStats) + if metricsStore != nil { + if stats, err := metricsStore.QueryContainerSummary(); err == nil { + for _, s := range stats { + statsMap[s.ContainerName] = s + } + } + } + + for _, s := range allStacks { + if !s.Deployed { + continue + } + for _, c := range s.Containers { + cr.Total++ + switch c.State { + case stacks.StateRunning, stacks.StateStarting: + cr.Running++ + case stacks.StateUnhealthy: + cr.Unhealthy++ + cr.Running++ // unhealthy containers are still running + default: + cr.Stopped++ + } + + detail := ContainerDetailReport{ + Name: c.Name, + State: string(c.State), + } + if cs, ok := statsMap[c.Name]; ok { + detail.CPUPercent = cs.CPUPercent + detail.MemoryMB = cs.MemUsageMB + } + cr.List = append(cr.List, detail) + } + } + + if cr.List == nil { + cr.List = []ContainerDetailReport{} + } + + return cr +} + +func buildBackupReport(cfg *config.Config, backupMgr *backup.Manager) BackupReport { + br := BackupReport{ + Enabled: cfg.Backup.Enabled, + } + + if backupMgr == nil { + return br + } + + nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule) + nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule) + status := backupMgr.GetFullStatus(nextDBDump, nextBackup) + + if status.LastDBDump != nil { + t := status.LastDBDump.LastRun + br.LastDBDump = &t + } + if status.LastBackup != nil { + t := status.LastBackup.LastRun + br.LastSnapshot = &t + } + if status.RepoStats != nil { + br.SnapshotCount = status.RepoStats.SnapshotCount + br.RepoSizeMB = parseSizeToMB(status.RepoStats.TotalSize) + } + if !status.LastCheckTime.IsZero() { + t := status.LastCheckTime + br.LastIntegrityCheck = &t + } + br.IntegrityOK = status.LastCheckOK + + return br +} + +func buildStacksReport(stackMgr *stacks.Manager) StacksReport { + sr := StacksReport{} + allStacks := 
// parseSizeToMB converts a formatted size such as "1.5 GB" or "512.0 MB" into
// whole megabytes, truncating any fraction.
//
// The input must be "<number> <unit>" (surrounding whitespace is ignored).
// Units are case-insensitive and binary-scaled (1 GB = 1024 MB); both the
// "KB/MB/GB/TB/PB" and "KiB/MiB/GiB/TiB/PiB" spellings are accepted, as is a
// plain "B" for bytes.
//
// It returns 0 when the input is empty or malformed, when the unit is not
// recognized, or when the value is negative, NaN, or too large for int64.
// An unknown unit is deliberately not guessed: reading "500 B" or "1.5 TB" as
// megabytes would report a size that is wrong by orders of magnitude.
func parseSizeToMB(s string) int64 {
	fields := strings.Fields(s)
	if len(fields) != 2 {
		return 0
	}

	val, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0
	}

	// mbPerUnit is how many megabytes one unit of the given suffix represents.
	var mbPerUnit float64
	switch strings.ToUpper(fields[1]) {
	case "B":
		mbPerUnit = 1.0 / (1024 * 1024)
	case "KB", "KIB":
		mbPerUnit = 1.0 / 1024
	case "MB", "MIB":
		mbPerUnit = 1
	case "GB", "GIB":
		mbPerUnit = 1024
	case "TB", "TIB":
		mbPerUnit = 1024 * 1024
	case "PB", "PIB":
		mbPerUnit = 1024 * 1024 * 1024
	default:
		return 0
	}

	mb := val * mbPerUnit
	// Written as a negated range check so NaN (which fails every comparison)
	// is rejected together with negative, infinite, and overflowing values.
	if !(mb >= 0 && mb < 1<<63) {
		return 0
	}
	return int64(mb)
}
+func (p *Pusher) Push(report *Report) error { + if !p.enabled { + return nil + } + + data, err := json.Marshal(report) + if err != nil { + p.logger.Printf("[WARN] Hub report marshal failed: %v", err) + return nil + } + + url := p.hubURL + "/api/v1/report" + + var lastErr error + for attempt := 0; attempt < 3; attempt++ { + if attempt > 0 { + time.Sleep(5 * time.Second) + } + + req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data)) + if err != nil { + lastErr = err + continue + } + req.Header.Set("Content-Type", "application/json") + if p.apiKey != "" { + req.Header.Set("Authorization", "Bearer "+p.apiKey) + } + + resp, err := p.httpClient.Do(req) + if err != nil { + lastErr = err + continue + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + + if resp.StatusCode >= 200 && resp.StatusCode < 300 { + p.logger.Printf("[INFO] Hub report pushed successfully (%d bytes)", len(data)) + return nil + } + lastErr = fmt.Errorf("HTTP %d", resp.StatusCode) + } + + p.logger.Printf("[WARN] Hub report push failed after 3 attempts: %v", lastErr) + return nil +} diff --git a/controller/internal/report/types.go b/controller/internal/report/types.go new file mode 100644 index 0000000..c22ad9c --- /dev/null +++ b/controller/internal/report/types.go @@ -0,0 +1,85 @@ +package report + +import "time" + +// Report is the JSON payload pushed to the central hub. +type Report struct { + Version int `json:"version"` + CustomerID string `json:"customer_id"` + CustomerName string `json:"customer_name"` + ControllerVersion string `json:"controller_version"` + Timestamp time.Time `json:"timestamp"` + System SystemReport `json:"system"` + Storage []StorageReport `json:"storage"` + Containers ContainerReport `json:"containers"` + Backup BackupReport `json:"backup"` + Health HealthReport `json:"health"` + Stacks StacksReport `json:"stacks"` +} + +// SystemReport holds host-level system info. 
// SystemReport holds host-level system info. BuildReport fills the identity
// fields (Hostname through UptimeSeconds) from metrics.GetStaticInfo and the
// remaining load, memory and temperature readings from system.GetInfo.
type SystemReport struct {
	Hostname           string  `json:"hostname"`
	OS                 string  `json:"os"`
	Kernel             string  `json:"kernel"`
	CPUModel           string  `json:"cpu_model"`
	CPUCores           int     `json:"cpu_cores"`
	UptimeSeconds      int64   `json:"uptime_seconds"`
	CPUPercent         float64 `json:"cpu_percent"`
	MemoryTotalMB      uint64  `json:"memory_total_mb"`
	MemoryUsedMB       uint64  `json:"memory_used_mb"`
	MemoryPercent      float64 `json:"memory_percent"`
	TemperatureCelsius float64 `json:"temperature_celsius"`
	LoadAvg1           float64 `json:"load_avg_1"`
	LoadAvg5           float64 `json:"load_avg_5"`
	LoadAvg15          float64 `json:"load_avg_15"`
}

// StorageReport holds disk usage for a mount point. BuildReport always emits
// an entry for "/" and adds one for the configured HDD path when the system
// info marks the HDD as configured.
type StorageReport struct {
	Mount   string  `json:"mount"`
	TotalGB float64 `json:"total_gb"`
	UsedGB  float64 `json:"used_gb"`
	Percent float64 `json:"percent"`
}

// ContainerReport holds aggregate and per-container status for the containers
// of deployed stacks. As filled by buildContainerReport, Running includes
// containers in the starting and unhealthy states (so Unhealthy is a subset of
// Running), Stopped counts every other state, and List is never nil.
type ContainerReport struct {
	Total     int                     `json:"total"`
	Running   int                     `json:"running"`
	Stopped   int                     `json:"stopped"`
	Unhealthy int                     `json:"unhealthy"`
	List      []ContainerDetailReport `json:"list"`
}

// ContainerDetailReport holds per-container info. CPUPercent and MemoryMB are
// taken from the metrics store when it has an entry for the container name and
// are left at zero otherwise.
type ContainerDetailReport struct {
	Name       string  `json:"name"`
	State      string  `json:"state"`
	CPUPercent float64 `json:"cpu_percent"`
	MemoryMB   float64 `json:"memory_mb"`
}

// BackupReport holds backup subsystem status. The three timestamps are
// pointers so that a missing value is omitted from the JSON instead of being
// sent as a zero time. RepoSizeMB is derived from the repository's formatted
// total size. IntegrityOK mirrors the backup manager's last recorded check
// result, so it only carries meaning when LastIntegrityCheck is set.
type BackupReport struct {
	Enabled            bool       `json:"enabled"`
	LastDBDump         *time.Time `json:"last_db_dump,omitempty"`
	LastSnapshot       *time.Time `json:"last_snapshot,omitempty"`
	SnapshotCount      int        `json:"snapshot_count"`
	RepoSizeMB         int64      `json:"repo_size_mb"`
	LastIntegrityCheck *time.Time `json:"last_integrity_check,omitempty"`
	IntegrityOK        bool       `json:"integrity_ok"`
}

// HealthReport holds the aggregated health status. BuildReport replaces nil
// Issues and Warnings with empty slices, so both serialize as JSON arrays.
type HealthReport struct {
	Status   string   `json:"status"`
	Issues   []string `json:"issues"`
	Warnings []string `json:"warnings"`
}
+type StacksReport struct { + Deployed []string `json:"deployed"` + Available []string `json:"available"` +} diff --git a/monitoring/DEPRECATED.md b/monitoring/DEPRECATED.md new file mode 100644 index 0000000..5f606f2 --- /dev/null +++ b/monitoring/DEPRECATED.md @@ -0,0 +1,35 @@ +# DEPRECATED — Legacy Monitoring Scripts + +> **Superseded by:** felhom-controller v0.4.0+ built-in monitoring +> **Date:** 2026-02-16 + +The following scripts in this directory were used for monitoring before the controller +had built-in health check support. They are **no longer needed** on nodes running +felhom-controller v0.4.0 or later. + +## Replaced scripts + +| Legacy script | Replaced by | Controller component | +|---------------|-------------|---------------------| +| `backup-healthcheck.sh` | System health scheduler job (every 5 min) | `internal/monitor/healthcheck.go` | +| `monitoring-setup.sh` | Controller reads `controller.yaml` directly | `internal/config/config.go` | +| `monitoring.conf.template` | `monitoring` section in `controller.yaml` | `controller.yaml` | +| `backup-healthcheck.service` | Controller's built-in scheduler | `internal/scheduler/scheduler.go` | +| `backup-healthcheck.timer` | Controller's built-in scheduler | `internal/scheduler/scheduler.go` | + +## What the controller handles natively (v0.6.0+) + +- **Heartbeat** ping every 5 minutes (controller process alive) +- **System health** check every 5 minutes (disk, memory, CPU, temp, Docker, protected containers) +- **DB dump** ping after daily database dumps +- **Backup** ping after daily restic snapshots +- **Backup integrity** check weekly (restic check) + +All pings are sent to a Healthchecks.io-compatible server configured in `controller.yaml`. +Empty or `CHANGEME` UUIDs are silently skipped. + +## Do NOT delete these files yet + +These scripts are kept for reference in case a customer is still running a pre-controller +setup. 
Once all customers are migrated to felhom-controller v0.4.0+, this directory +can be safely removed.