v0.6.0: healthcheck + hub reporting implementation
- Add heartbeat ping (every 5 min, controller alive signal)
- Add backup integrity check (weekly restic check, Sunday 04:00)
- Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig
- Add HubConfig for central hub reporting
- Add report package (types, builder, pusher) for hub push
- Wire hub reporting into scheduler (configurable interval)
- Update controller.yaml.example with new monitoring + hub sections
- Add monitoring/DEPRECATED.md for legacy bash scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import (
|
|||||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||||
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||||||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/report"
|
||||||
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||||||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||||
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
|
catalogsync "gitea.dooplex.hu/admin/felhom-controller/internal/sync"
|
||||||
@@ -118,16 +119,22 @@ func main() {
|
|||||||
return stackMgr.ScanStacks()
|
return stackMgr.ScanStacks()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Heartbeat — lightweight "I'm alive" signal
|
||||||
|
sched.Every("heartbeat", 5*time.Minute, func(ctx context.Context) error {
|
||||||
|
pinger.Ping(cfg.Monitoring.PingUUIDs.Heartbeat, "")
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
// System health ping
|
// System health ping
|
||||||
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
|
healthInterval, err := time.ParseDuration(cfg.Monitoring.SystemHealthInterval)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
healthInterval = 5 * time.Minute
|
healthInterval = 5 * time.Minute
|
||||||
}
|
}
|
||||||
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
sched.Every("system-health", healthInterval, func(ctx context.Context) error {
|
||||||
report := monitor.RunHealthCheck(cfg, cpuCollector)
|
healthReport := monitor.RunHealthCheck(cfg, cpuCollector)
|
||||||
body := report.FormatMessage()
|
body := healthReport.FormatMessage()
|
||||||
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
healthUUID := cfg.Monitoring.PingUUIDs.SystemHealth
|
||||||
if report.Status == "fail" {
|
if healthReport.Status == "fail" {
|
||||||
pinger.Fail(healthUUID, body)
|
pinger.Fail(healthUUID, body)
|
||||||
} else {
|
} else {
|
||||||
pinger.Ping(healthUUID, body)
|
pinger.Ping(healthUUID, body)
|
||||||
@@ -144,6 +151,14 @@ func main() {
|
|||||||
return backupMgr.RunBackup(ctx)
|
return backupMgr.RunBackup(ctx)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Weekly integrity check — Sunday 04:00
|
||||||
|
sched.Daily("backup-integrity", "04:00", func(ctx context.Context) error {
|
||||||
|
if time.Now().Weekday() != time.Sunday {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return backupMgr.RunIntegrityCheck(ctx)
|
||||||
|
})
|
||||||
|
|
||||||
// Cache refresh: every 5 minutes
|
// Cache refresh: every 5 minutes
|
||||||
sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
|
sched.Every("backup-cache", 5*time.Minute, func(ctx context.Context) error {
|
||||||
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
||||||
@@ -165,6 +180,20 @@ func main() {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Central hub reporting ---
|
||||||
|
if cfg.Hub.Enabled && cfg.Hub.URL != "" {
|
||||||
|
pushInterval, err := time.ParseDuration(cfg.Hub.PushInterval)
|
||||||
|
if err != nil {
|
||||||
|
pushInterval = 15 * time.Minute
|
||||||
|
}
|
||||||
|
pusher := report.NewPusher(&cfg.Hub, logger)
|
||||||
|
sched.Every("hub-report", pushInterval, func(ctx context.Context) error {
|
||||||
|
r := report.BuildReport(cfg, stackMgr, backupMgr, cpuCollector, metricsStore, Version)
|
||||||
|
return pusher.Push(r)
|
||||||
|
})
|
||||||
|
logger.Printf("[INFO] Hub reporting enabled (every %s to %s)", pushInterval, cfg.Hub.URL)
|
||||||
|
}
|
||||||
|
|
||||||
sched.Start(ctx)
|
sched.Start(ctx)
|
||||||
defer sched.Stop()
|
defer sched.Stop()
|
||||||
|
|
||||||
|
|||||||
@@ -80,9 +80,11 @@ monitoring:
|
|||||||
enabled: true
|
enabled: true
|
||||||
healthchecks_base: "https://status.felhom.eu"
|
healthchecks_base: "https://status.felhom.eu"
|
||||||
ping_uuids:
|
ping_uuids:
|
||||||
db_dump: "CHANGEME-uuid-for-db-dump"
|
heartbeat: "" # Every 5 min — controller process alive
|
||||||
backup: "CHANGEME-uuid-for-backup"
|
system_health: "CHANGEME-uuid-for-system-health" # Every 5 min — comprehensive system check
|
||||||
system_health: "CHANGEME-uuid-for-system-health"
|
db_dump: "CHANGEME-uuid-for-db-dump" # Daily — after database dumps
|
||||||
|
backup: "CHANGEME-uuid-for-backup" # Daily — after restic snapshot
|
||||||
|
backup_integrity: "" # Weekly (Sunday) — restic check
|
||||||
system_health_interval: "5m"
|
system_health_interval: "5m"
|
||||||
health_check_schedule: "06:00"
|
health_check_schedule: "06:00"
|
||||||
thresholds:
|
thresholds:
|
||||||
@@ -93,6 +95,13 @@ monitoring:
|
|||||||
memory_warn_percent: 85
|
memory_warn_percent: 85
|
||||||
temperature_warn_celsius: 75
|
temperature_warn_celsius: 75
|
||||||
|
|
||||||
|
# --- Central hub (operator dashboard) ---
|
||||||
|
hub:
|
||||||
|
enabled: false # Enable central reporting
|
||||||
|
url: "https://hub.felhom.eu" # Hub API endpoint
|
||||||
|
api_key: "" # Shared secret for authentication
|
||||||
|
push_interval: "15m" # How often to push reports
|
||||||
|
|
||||||
# --- Self-update ---
|
# --- Self-update ---
|
||||||
self_update:
|
self_update:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|||||||
@@ -270,6 +270,37 @@ func (m *Manager) RunBackup(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RunIntegrityCheck runs restic check and pings healthchecks with the result.
|
||||||
|
func (m *Manager) RunIntegrityCheck(ctx context.Context) error {
|
||||||
|
m.logger.Printf("[INFO] Starting restic integrity check")
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
if err := m.restic.EnsureInitialized(); err != nil {
|
||||||
|
m.logger.Printf("[ERROR] Restic init failed for integrity check: %v", err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err := m.restic.Check()
|
||||||
|
duration := time.Since(start)
|
||||||
|
|
||||||
|
uuid := m.cfg.Monitoring.PingUUIDs.BackupIntegrity
|
||||||
|
|
||||||
|
m.mu.Lock()
|
||||||
|
m.lastCheckTime = time.Now()
|
||||||
|
m.lastCheckOK = err == nil
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
m.logger.Printf("[ERROR] Restic integrity check failed (%s): %v", duration.Round(time.Second), err)
|
||||||
|
m.pinger.Fail(uuid, fmt.Sprintf("restic check failed: %v", err))
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
m.logger.Printf("[INFO] Restic integrity check passed (%s)", duration.Round(time.Second))
|
||||||
|
m.pinger.Ping(uuid, fmt.Sprintf("restic check passed (%s)", duration.Round(time.Second)))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// RunFullBackup runs DB dumps followed by restic backup.
|
// RunFullBackup runs DB dumps followed by restic backup.
|
||||||
func (m *Manager) RunFullBackup(ctx context.Context) error {
|
func (m *Manager) RunFullBackup(ctx context.Context) error {
|
||||||
m.mu.Lock()
|
m.mu.Lock()
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ type Config struct {
|
|||||||
Stacks StacksConfig `yaml:"stacks"`
|
Stacks StacksConfig `yaml:"stacks"`
|
||||||
Backup BackupConfig `yaml:"backup"`
|
Backup BackupConfig `yaml:"backup"`
|
||||||
Monitoring MonitoringConfig `yaml:"monitoring"`
|
Monitoring MonitoringConfig `yaml:"monitoring"`
|
||||||
|
Hub HubConfig `yaml:"hub"`
|
||||||
SelfUpdate SelfUpdateConfig `yaml:"self_update"`
|
SelfUpdate SelfUpdateConfig `yaml:"self_update"`
|
||||||
Notifications NotificationsConfig `yaml:"notifications"`
|
Notifications NotificationsConfig `yaml:"notifications"`
|
||||||
Logging LoggingConfig `yaml:"logging"`
|
Logging LoggingConfig `yaml:"logging"`
|
||||||
@@ -98,9 +99,11 @@ type MonitoringConfig struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type PingUUIDsConfig struct {
|
type PingUUIDsConfig struct {
|
||||||
DBDump string `yaml:"db_dump"`
|
Heartbeat string `yaml:"heartbeat"`
|
||||||
Backup string `yaml:"backup"`
|
DBDump string `yaml:"db_dump"`
|
||||||
SystemHealth string `yaml:"system_health"`
|
Backup string `yaml:"backup"`
|
||||||
|
SystemHealth string `yaml:"system_health"`
|
||||||
|
BackupIntegrity string `yaml:"backup_integrity"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ThresholdsConfig struct {
|
type ThresholdsConfig struct {
|
||||||
@@ -136,6 +139,13 @@ type AssetsConfig struct {
|
|||||||
SourceURL string `yaml:"source_url"` // Only used during build, not runtime
|
SourceURL string `yaml:"source_url"` // Only used during build, not runtime
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HubConfig holds the settings of the `hub` section of the controller
// configuration, which controls report pushes to the central hub.
type HubConfig struct {
	// Enabled switches hub reporting on or off.
	Enabled bool `yaml:"enabled"`

	// URL is the base address of the hub API.
	URL string `yaml:"url"`

	// APIKey is the shared secret used to authenticate against the hub.
	APIKey string `yaml:"api_key"`

	// PushInterval is the time between report pushes, written as a
	// duration string such as "15m".
	PushInterval string `yaml:"push_interval"`
}
|
||||||
|
|
||||||
// Load reads and parses the config file, applies defaults, and validates.
|
// Load reads and parses the config file, applies defaults, and validates.
|
||||||
func Load(path string) (*Config, error) {
|
func Load(path string) (*Config, error) {
|
||||||
data, err := os.ReadFile(path)
|
data, err := os.ReadFile(path)
|
||||||
@@ -198,6 +208,7 @@ func applyDefaults(cfg *Config) {
|
|||||||
di(&cfg.Monitoring.Thresholds.CPUWarnPercent, 90)
|
di(&cfg.Monitoring.Thresholds.CPUWarnPercent, 90)
|
||||||
di(&cfg.Monitoring.Thresholds.MemoryWarnPercent, 85)
|
di(&cfg.Monitoring.Thresholds.MemoryWarnPercent, 85)
|
||||||
di(&cfg.Monitoring.Thresholds.TemperatureWarnCelsius, 75)
|
di(&cfg.Monitoring.Thresholds.TemperatureWarnCelsius, 75)
|
||||||
|
d(&cfg.Hub.PushInterval, "15m")
|
||||||
d(&cfg.SelfUpdate.CheckInterval, "6h")
|
d(&cfg.SelfUpdate.CheckInterval, "6h")
|
||||||
di(&cfg.SelfUpdate.HealthTimeoutSeconds, 60)
|
di(&cfg.SelfUpdate.HealthTimeoutSeconds, 60)
|
||||||
d(&cfg.Logging.Level, "info")
|
d(&cfg.Logging.Level, "info")
|
||||||
|
|||||||
@@ -0,0 +1,230 @@
|
|||||||
|
package report
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BuildReport collects current state from all subsystems and returns a Report.
|
||||||
|
func BuildReport(
|
||||||
|
cfg *config.Config,
|
||||||
|
stackMgr *stacks.Manager,
|
||||||
|
backupMgr *backup.Manager,
|
||||||
|
cpuCollector *system.CPUCollector,
|
||||||
|
metricsStore *metrics.MetricsStore,
|
||||||
|
version string,
|
||||||
|
) *Report {
|
||||||
|
r := &Report{
|
||||||
|
Version: 1,
|
||||||
|
CustomerID: cfg.Customer.ID,
|
||||||
|
CustomerName: cfg.Customer.Name,
|
||||||
|
ControllerVersion: version,
|
||||||
|
Timestamp: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// System info
|
||||||
|
staticInfo := metrics.GetStaticInfo()
|
||||||
|
sysInfo := system.GetInfo(cfg.Paths.HDDPath, cpuCollector)
|
||||||
|
|
||||||
|
r.System = SystemReport{
|
||||||
|
Hostname: staticInfo.Hostname,
|
||||||
|
OS: staticInfo.OS,
|
||||||
|
Kernel: staticInfo.Kernel,
|
||||||
|
CPUModel: staticInfo.CPUModel,
|
||||||
|
CPUCores: staticInfo.CPUCores,
|
||||||
|
UptimeSeconds: staticInfo.UptimeSeconds,
|
||||||
|
CPUPercent: sysInfo.CPUPercent,
|
||||||
|
MemoryTotalMB: sysInfo.TotalMemMB,
|
||||||
|
MemoryUsedMB: sysInfo.UsedMemMB,
|
||||||
|
MemoryPercent: sysInfo.MemPercent,
|
||||||
|
TemperatureCelsius: sysInfo.TemperatureCelsius,
|
||||||
|
LoadAvg1: sysInfo.LoadAvg1,
|
||||||
|
LoadAvg5: sysInfo.LoadAvg5,
|
||||||
|
LoadAvg15: sysInfo.LoadAvg15,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Storage
|
||||||
|
r.Storage = []StorageReport{
|
||||||
|
{Mount: "/", TotalGB: sysInfo.DiskTotalGB, UsedGB: sysInfo.DiskUsedGB, Percent: sysInfo.DiskPercent},
|
||||||
|
}
|
||||||
|
if sysInfo.HDDConfigured {
|
||||||
|
r.Storage = append(r.Storage, StorageReport{
|
||||||
|
Mount: cfg.Paths.HDDPath,
|
||||||
|
TotalGB: sysInfo.HDDTotalGB,
|
||||||
|
UsedGB: sysInfo.HDDUsedGB,
|
||||||
|
Percent: sysInfo.HDDPercent,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Containers
|
||||||
|
r.Containers = buildContainerReport(stackMgr, metricsStore)
|
||||||
|
|
||||||
|
// Backup
|
||||||
|
r.Backup = buildBackupReport(cfg, backupMgr)
|
||||||
|
|
||||||
|
// Health
|
||||||
|
healthReport := monitor.RunHealthCheck(cfg, cpuCollector)
|
||||||
|
r.Health = HealthReport{
|
||||||
|
Status: healthReport.Status,
|
||||||
|
Issues: healthReport.Issues,
|
||||||
|
Warnings: healthReport.Warnings,
|
||||||
|
}
|
||||||
|
if r.Health.Issues == nil {
|
||||||
|
r.Health.Issues = []string{}
|
||||||
|
}
|
||||||
|
if r.Health.Warnings == nil {
|
||||||
|
r.Health.Warnings = []string{}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stacks
|
||||||
|
r.Stacks = buildStacksReport(stackMgr)
|
||||||
|
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildContainerReport(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore) ContainerReport {
|
||||||
|
cr := ContainerReport{}
|
||||||
|
|
||||||
|
allStacks := stackMgr.GetStacks()
|
||||||
|
|
||||||
|
// Build a map of container stats from metrics store
|
||||||
|
statsMap := make(map[string]metrics.ContainerCurrentStats)
|
||||||
|
if metricsStore != nil {
|
||||||
|
if stats, err := metricsStore.QueryContainerSummary(); err == nil {
|
||||||
|
for _, s := range stats {
|
||||||
|
statsMap[s.ContainerName] = s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, s := range allStacks {
|
||||||
|
if !s.Deployed {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, c := range s.Containers {
|
||||||
|
cr.Total++
|
||||||
|
switch c.State {
|
||||||
|
case stacks.StateRunning, stacks.StateStarting:
|
||||||
|
cr.Running++
|
||||||
|
case stacks.StateUnhealthy:
|
||||||
|
cr.Unhealthy++
|
||||||
|
cr.Running++ // unhealthy containers are still running
|
||||||
|
default:
|
||||||
|
cr.Stopped++
|
||||||
|
}
|
||||||
|
|
||||||
|
detail := ContainerDetailReport{
|
||||||
|
Name: c.Name,
|
||||||
|
State: string(c.State),
|
||||||
|
}
|
||||||
|
if cs, ok := statsMap[c.Name]; ok {
|
||||||
|
detail.CPUPercent = cs.CPUPercent
|
||||||
|
detail.MemoryMB = cs.MemUsageMB
|
||||||
|
}
|
||||||
|
cr.List = append(cr.List, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if cr.List == nil {
|
||||||
|
cr.List = []ContainerDetailReport{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cr
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildBackupReport(cfg *config.Config, backupMgr *backup.Manager) BackupReport {
|
||||||
|
br := BackupReport{
|
||||||
|
Enabled: cfg.Backup.Enabled,
|
||||||
|
}
|
||||||
|
|
||||||
|
if backupMgr == nil {
|
||||||
|
return br
|
||||||
|
}
|
||||||
|
|
||||||
|
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
||||||
|
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
|
||||||
|
status := backupMgr.GetFullStatus(nextDBDump, nextBackup)
|
||||||
|
|
||||||
|
if status.LastDBDump != nil {
|
||||||
|
t := status.LastDBDump.LastRun
|
||||||
|
br.LastDBDump = &t
|
||||||
|
}
|
||||||
|
if status.LastBackup != nil {
|
||||||
|
t := status.LastBackup.LastRun
|
||||||
|
br.LastSnapshot = &t
|
||||||
|
}
|
||||||
|
if status.RepoStats != nil {
|
||||||
|
br.SnapshotCount = status.RepoStats.SnapshotCount
|
||||||
|
br.RepoSizeMB = parseSizeToMB(status.RepoStats.TotalSize)
|
||||||
|
}
|
||||||
|
if !status.LastCheckTime.IsZero() {
|
||||||
|
t := status.LastCheckTime
|
||||||
|
br.LastIntegrityCheck = &t
|
||||||
|
}
|
||||||
|
br.IntegrityOK = status.LastCheckOK
|
||||||
|
|
||||||
|
return br
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildStacksReport(stackMgr *stacks.Manager) StacksReport {
|
||||||
|
sr := StacksReport{}
|
||||||
|
allStacks := stackMgr.GetStacks()
|
||||||
|
|
||||||
|
for _, s := range allStacks {
|
||||||
|
if s.Protected {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s.Deployed {
|
||||||
|
sr.Deployed = append(sr.Deployed, s.Name)
|
||||||
|
} else {
|
||||||
|
sr.Available = append(sr.Available, s.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if sr.Deployed == nil {
|
||||||
|
sr.Deployed = []string{}
|
||||||
|
}
|
||||||
|
if sr.Available == nil {
|
||||||
|
sr.Available = []string{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sr
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSizeToMB converts a formatted size string such as "1.5 GB" or
// "512.0 MB" into whole megabytes, truncating any fraction.
//
// The input must be a number and a unit separated by whitespace. Units are
// matched case-insensitively and use a factor of 1024 between steps:
// B, KB/KiB, MB/MiB, GB/GiB and TB/TiB. Anything else — an empty string, a
// missing or unknown unit, or a non-numeric value — yields 0, so an
// unrecognized unit is never misreported as megabytes.
func parseSizeToMB(s string) int64 {
	// Fields also trims surrounding whitespace; an empty input gives no fields.
	fields := strings.Fields(s)
	if len(fields) != 2 {
		return 0
	}

	val, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0
	}

	switch strings.ToUpper(fields[1]) {
	case "B":
		return int64(val / (1024 * 1024))
	case "KB", "KIB":
		return int64(val / 1024)
	case "MB", "MIB":
		return int64(val)
	case "GB", "GIB":
		return int64(val * 1024)
	case "TB", "TIB":
		return int64(val * 1024 * 1024)
	default:
		return 0
	}
}
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
package report
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"net/http"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Pusher delivers reports to the central hub over HTTP.
type Pusher struct {
	// hubURL is the hub base address; the report path is appended to it.
	hubURL string
	// apiKey, when non-empty, is sent as a bearer token with each request.
	apiKey string

	// httpClient performs the report requests.
	httpClient *http.Client
	// logger receives push results and failures.
	logger *log.Logger

	// enabled gates Push; when false, Push does nothing.
	enabled bool
}
|
||||||
|
|
||||||
|
// NewPusher creates a new report pusher from hub configuration.
|
||||||
|
func NewPusher(cfg *config.HubConfig, logger *log.Logger) *Pusher {
|
||||||
|
return &Pusher{
|
||||||
|
hubURL: strings.TrimRight(cfg.URL, "/"),
|
||||||
|
apiKey: cfg.APIKey,
|
||||||
|
httpClient: &http.Client{
|
||||||
|
Timeout: 30 * time.Second,
|
||||||
|
},
|
||||||
|
logger: logger,
|
||||||
|
enabled: cfg.Enabled,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Push sends a report to the hub. Retries 3 times with 5s backoff.
|
||||||
|
// Never returns error to caller — push failures should not affect controller operation.
|
||||||
|
func (p *Pusher) Push(report *Report) error {
|
||||||
|
if !p.enabled {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := json.Marshal(report)
|
||||||
|
if err != nil {
|
||||||
|
p.logger.Printf("[WARN] Hub report marshal failed: %v", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
url := p.hubURL + "/api/v1/report"
|
||||||
|
|
||||||
|
var lastErr error
|
||||||
|
for attempt := 0; attempt < 3; attempt++ {
|
||||||
|
if attempt > 0 {
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
}
|
||||||
|
|
||||||
|
req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data))
|
||||||
|
if err != nil {
|
||||||
|
lastErr = err
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
if p.apiKey != "" {
|
||||||
|
req.Header.Set("Authorization", "Bearer "+p.apiKey)
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := p.httpClient.Do(req)
|
||||||
|
if err != nil {
|
||||||
|
lastErr = err
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
io.Copy(io.Discard, resp.Body)
|
||||||
|
resp.Body.Close()
|
||||||
|
|
||||||
|
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
|
||||||
|
p.logger.Printf("[INFO] Hub report pushed successfully (%d bytes)", len(data))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
p.logger.Printf("[WARN] Hub report push failed after 3 attempts: %v", lastErr)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
package report
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Report is the JSON payload pushed to the central hub.
|
||||||
|
type Report struct {
|
||||||
|
Version int `json:"version"`
|
||||||
|
CustomerID string `json:"customer_id"`
|
||||||
|
CustomerName string `json:"customer_name"`
|
||||||
|
ControllerVersion string `json:"controller_version"`
|
||||||
|
Timestamp time.Time `json:"timestamp"`
|
||||||
|
System SystemReport `json:"system"`
|
||||||
|
Storage []StorageReport `json:"storage"`
|
||||||
|
Containers ContainerReport `json:"containers"`
|
||||||
|
Backup BackupReport `json:"backup"`
|
||||||
|
Health HealthReport `json:"health"`
|
||||||
|
Stacks StacksReport `json:"stacks"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SystemReport describes the host the controller runs on: its identity and
// hardware, plus current CPU, memory, temperature and load readings.
type SystemReport struct {
	// Host identity and hardware.
	Hostname      string `json:"hostname"`
	OS            string `json:"os"`
	Kernel        string `json:"kernel"`
	CPUModel      string `json:"cpu_model"`
	CPUCores      int    `json:"cpu_cores"`
	UptimeSeconds int64  `json:"uptime_seconds"`

	// Current CPU and memory usage.
	CPUPercent    float64 `json:"cpu_percent"`
	MemoryTotalMB uint64  `json:"memory_total_mb"`
	MemoryUsedMB  uint64  `json:"memory_used_mb"`
	MemoryPercent float64 `json:"memory_percent"`

	// Temperature reading, in degrees Celsius.
	TemperatureCelsius float64 `json:"temperature_celsius"`

	// Load averages; the suffix is presumably the window in minutes.
	LoadAvg1  float64 `json:"load_avg_1"`
	LoadAvg5  float64 `json:"load_avg_5"`
	LoadAvg15 float64 `json:"load_avg_15"`
}
|
||||||
|
|
||||||
|
// StorageReport describes disk usage at one mount point.
type StorageReport struct {
	// Mount is the path of the mount point.
	Mount string `json:"mount"`

	// TotalGB and UsedGB are the capacity and the used space in gigabytes.
	TotalGB float64 `json:"total_gb"`
	UsedGB  float64 `json:"used_gb"`

	// Percent is the usage percentage.
	Percent float64 `json:"percent"`
}
|
||||||
|
|
||||||
|
// ContainerReport holds aggregate and per-container status.
|
||||||
|
type ContainerReport struct {
|
||||||
|
Total int `json:"total"`
|
||||||
|
Running int `json:"running"`
|
||||||
|
Stopped int `json:"stopped"`
|
||||||
|
Unhealthy int `json:"unhealthy"`
|
||||||
|
List []ContainerDetailReport `json:"list"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ContainerDetailReport describes a single container: its name, state and
// resource usage.
type ContainerDetailReport struct {
	// Name is the container name.
	Name string `json:"name"`
	// State is the container state as a string.
	State string `json:"state"`

	// CPUPercent and MemoryMB are the container's CPU and memory usage.
	CPUPercent float64 `json:"cpu_percent"`
	MemoryMB   float64 `json:"memory_mb"`
}
|
||||||
|
|
||||||
|
// BackupReport describes the state of the backup subsystem. The three
// timestamp fields are pointers and are omitted from the JSON when nil.
type BackupReport struct {
	// Enabled tells whether backups are enabled in the configuration.
	Enabled bool `json:"enabled"`

	// LastDBDump and LastSnapshot are the times of the most recent database
	// dump and backup snapshot.
	LastDBDump   *time.Time `json:"last_db_dump,omitempty"`
	LastSnapshot *time.Time `json:"last_snapshot,omitempty"`

	// SnapshotCount and RepoSizeMB describe the backup repository.
	SnapshotCount int   `json:"snapshot_count"`
	RepoSizeMB    int64 `json:"repo_size_mb"`

	// LastIntegrityCheck is the time of the most recent integrity check and
	// IntegrityOK its result.
	// NOTE(review): IntegrityOK is presumably false until a first check has
	// run, so read it together with LastIntegrityCheck — confirm.
	LastIntegrityCheck *time.Time `json:"last_integrity_check,omitempty"`
	IntegrityOK        bool       `json:"integrity_ok"`
}
|
||||||
|
|
||||||
|
// HealthReport carries the overall health status together with the issues
// and warnings behind it.
type HealthReport struct {
	// Status is the overall result of the health check.
	Status string `json:"status"`

	// Issues and Warnings list the individual findings.
	Issues   []string `json:"issues"`
	Warnings []string `json:"warnings"`
}
|
||||||
|
|
||||||
|
// StacksReport lists stack names grouped by deployment status.
type StacksReport struct {
	// Deployed holds the names of stacks that are deployed.
	Deployed []string `json:"deployed"`
	// Available holds the names of stacks that are not deployed.
	Available []string `json:"available"`
}
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
# DEPRECATED — Legacy Monitoring Scripts
|
||||||
|
|
||||||
|
> **Superseded by:** felhom-controller v0.4.0+ built-in monitoring
|
||||||
|
> **Date:** 2026-02-16
|
||||||
|
|
||||||
|
The following scripts in this directory were used for monitoring before the controller
|
||||||
|
had built-in health check support. They are **no longer needed** on nodes running
|
||||||
|
felhom-controller v0.4.0 or later.
|
||||||
|
|
||||||
|
## Replaced scripts
|
||||||
|
|
||||||
|
| Legacy script | Replaced by | Controller component |
|
||||||
|
|---------------|-------------|---------------------|
|
||||||
|
| `backup-healthcheck.sh` | System health scheduler job (every 5 min) | `internal/monitor/healthcheck.go` |
|
||||||
|
| `monitoring-setup.sh` | Controller reads `controller.yaml` directly | `internal/config/config.go` |
|
||||||
|
| `monitoring.conf.template` | `monitoring` section in `controller.yaml` | `controller.yaml` |
|
||||||
|
| `backup-healthcheck.service` | Controller's built-in scheduler | `internal/scheduler/scheduler.go` |
|
||||||
|
| `backup-healthcheck.timer` | Controller's built-in scheduler | `internal/scheduler/scheduler.go` |
|
||||||
|
|
||||||
|
## What the controller handles natively (v0.6.0+)
|
||||||
|
|
||||||
|
- **Heartbeat** ping every 5 minutes (controller process alive)
|
||||||
|
- **System health** check every 5 minutes (disk, memory, CPU, temp, Docker, protected containers)
|
||||||
|
- **DB dump** ping after daily database dumps
|
||||||
|
- **Backup** ping after daily restic snapshots
|
||||||
|
- **Backup integrity** check weekly (restic check)
|
||||||
|
|
||||||
|
All pings are sent to a Healthchecks.io-compatible server configured in `controller.yaml`.
|
||||||
|
Empty or `CHANGEME` UUIDs are silently skipped.
|
||||||
|
|
||||||
|
## Do NOT delete these files yet
|
||||||
|
|
||||||
|
These scripts are kept for reference in case a customer is still running a pre-controller
|
||||||
|
setup. Once all customers are migrated to felhom-controller v0.4.0+, this directory
|
||||||
|
can be safely removed.
|
||||||
Reference in New Issue
Block a user