v0.6.0: healthcheck + hub reporting implementation

- Add heartbeat ping (every 5 min, controller alive signal)
- Add backup integrity check (weekly restic check, Sunday 04:00)
- Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig
- Add HubConfig for central hub reporting
- Add report package (types, builder, pusher) for hub push
- Wire hub reporting into scheduler (configurable interval)
- Update controller.yaml.example with new monitoring + hub sections
- Add monitoring/DEPRECATED.md for legacy bash scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-16 13:19:08 +01:00
parent 94efc39c34
commit 97074e7a0c
8 changed files with 525 additions and 9 deletions
+31
View File
@@ -270,6 +270,37 @@ func (m *Manager) RunBackup(ctx context.Context) error {
return nil
}
// RunIntegrityCheck verifies the restic repository via m.restic.Check and
// reports the outcome to healthchecks using the BackupIntegrity ping UUID.
//
// The time and pass/fail result of the check are stored on the manager
// (lastCheckTime / lastCheckOK) under m.mu. The error from the repository
// initialisation or from the check itself is returned unchanged; nil means the
// check passed.
//
// NOTE(review): ctx is accepted but not consulted here, and a failure in
// EnsureInitialized returns before any ping is sent or lastCheck* is updated —
// confirm both are intended.
func (m *Manager) RunIntegrityCheck(ctx context.Context) error {
	m.logger.Printf("[INFO] Starting restic integrity check")
	began := time.Now()

	if initErr := m.restic.EnsureInitialized(); initErr != nil {
		m.logger.Printf("[ERROR] Restic init failed for integrity check: %v", initErr)
		return initErr
	}

	checkErr := m.restic.Check()
	// Rounded once; the same value is used in the log line and the ping body.
	elapsed := time.Since(began).Round(time.Second)
	pingUUID := m.cfg.Monitoring.PingUUIDs.BackupIntegrity

	// Record the outcome before reporting it.
	m.mu.Lock()
	m.lastCheckTime = time.Now()
	m.lastCheckOK = checkErr == nil
	m.mu.Unlock()

	if checkErr == nil {
		m.logger.Printf("[INFO] Restic integrity check passed (%s)", elapsed)
		m.pinger.Ping(pingUUID, fmt.Sprintf("restic check passed (%s)", elapsed))
		return nil
	}

	m.logger.Printf("[ERROR] Restic integrity check failed (%s): %v", elapsed, checkErr)
	m.pinger.Fail(pingUUID, fmt.Sprintf("restic check failed: %v", checkErr))
	return checkErr
}
// RunFullBackup runs DB dumps followed by restic backup.
func (m *Manager) RunFullBackup(ctx context.Context) error {
m.mu.Lock()
+14 -3
View File
@@ -20,6 +20,7 @@ type Config struct {
Stacks StacksConfig `yaml:"stacks"`
Backup BackupConfig `yaml:"backup"`
Monitoring MonitoringConfig `yaml:"monitoring"`
Hub HubConfig `yaml:"hub"`
SelfUpdate SelfUpdateConfig `yaml:"self_update"`
Notifications NotificationsConfig `yaml:"notifications"`
Logging LoggingConfig `yaml:"logging"`
@@ -98,9 +99,11 @@ type MonitoringConfig struct {
}
// PingUUIDsConfig holds the healthchecks ping UUID for each monitored job.
// Every field is read from the key named in its yaml tag.
type PingUUIDsConfig struct {
	// Heartbeat is the UUID for the controller-alive heartbeat ping.
	Heartbeat string `yaml:"heartbeat"`
	// DBDump is the UUID for the database dump job.
	DBDump string `yaml:"db_dump"`
	// Backup is the UUID for the restic backup job.
	Backup string `yaml:"backup"`
	// SystemHealth is the UUID for the system health check.
	SystemHealth string `yaml:"system_health"`
	// BackupIntegrity is the UUID pinged by Manager.RunIntegrityCheck with the
	// result of the restic integrity check.
	BackupIntegrity string `yaml:"backup_integrity"`
}
type ThresholdsConfig struct {
@@ -136,6 +139,13 @@ type AssetsConfig struct {
SourceURL string `yaml:"source_url"` // Only used during build, not runtime
}
// HubConfig holds the settings for reporting to the central hub
// (the "hub" key of Config).
type HubConfig struct {
// Enabled switches hub reporting on; report.Pusher.Push is a no-op when false.
Enabled bool `yaml:"enabled"`
// URL is the hub base URL. report.NewPusher trims trailing slashes and
// Push appends "/api/v1/report".
URL string `yaml:"url"`
// APIKey, when non-empty, is sent as "Authorization: Bearer <key>".
APIKey string `yaml:"api_key"`
// PushInterval is the interval between pushes, kept as a string.
// applyDefaults supplies "15m" as its default value (presumably only when
// the field is empty — confirm against the d helper).
PushInterval string `yaml:"push_interval"`
}
// Load reads and parses the config file, applies defaults, and validates.
func Load(path string) (*Config, error) {
data, err := os.ReadFile(path)
@@ -198,6 +208,7 @@ func applyDefaults(cfg *Config) {
di(&cfg.Monitoring.Thresholds.CPUWarnPercent, 90)
di(&cfg.Monitoring.Thresholds.MemoryWarnPercent, 85)
di(&cfg.Monitoring.Thresholds.TemperatureWarnCelsius, 75)
d(&cfg.Hub.PushInterval, "15m")
d(&cfg.SelfUpdate.CheckInterval, "6h")
di(&cfg.SelfUpdate.HealthTimeoutSeconds, 60)
d(&cfg.Logging.Level, "info")
+230
View File
@@ -0,0 +1,230 @@
package report
import (
"strconv"
"strings"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// BuildReport gathers a snapshot of every subsystem (host, storage,
// containers, backup, health, stacks) and returns it as a hub Report.
//
// version is recorded as the controller version; the report timestamp is the
// current time in UTC. backupMgr and metricsStore may be nil (the helpers
// handle that); stackMgr is dereferenced unconditionally.
func BuildReport(
	cfg *config.Config,
	stackMgr *stacks.Manager,
	backupMgr *backup.Manager,
	cpuCollector *system.CPUCollector,
	metricsStore *metrics.MetricsStore,
	version string,
) *Report {
	rep := &Report{
		Version:           1,
		CustomerID:        cfg.Customer.ID,
		CustomerName:      cfg.Customer.Name,
		ControllerVersion: version,
		Timestamp:         time.Now().UTC(),
	}

	// Host: static facts first, then the live readings.
	static := metrics.GetStaticInfo()
	live := system.GetInfo(cfg.Paths.HDDPath, cpuCollector)

	rep.System.Hostname = static.Hostname
	rep.System.OS = static.OS
	rep.System.Kernel = static.Kernel
	rep.System.CPUModel = static.CPUModel
	rep.System.CPUCores = static.CPUCores
	rep.System.UptimeSeconds = static.UptimeSeconds
	rep.System.CPUPercent = live.CPUPercent
	rep.System.MemoryTotalMB = live.TotalMemMB
	rep.System.MemoryUsedMB = live.UsedMemMB
	rep.System.MemoryPercent = live.MemPercent
	rep.System.TemperatureCelsius = live.TemperatureCelsius
	rep.System.LoadAvg1 = live.LoadAvg1
	rep.System.LoadAvg5 = live.LoadAvg5
	rep.System.LoadAvg15 = live.LoadAvg15

	// Storage: the root filesystem is always present, the HDD only when configured.
	volumes := make([]StorageReport, 0, 2)
	volumes = append(volumes, StorageReport{
		Mount:   "/",
		TotalGB: live.DiskTotalGB,
		UsedGB:  live.DiskUsedGB,
		Percent: live.DiskPercent,
	})
	if live.HDDConfigured {
		volumes = append(volumes, StorageReport{
			Mount:   cfg.Paths.HDDPath,
			TotalGB: live.HDDTotalGB,
			UsedGB:  live.HDDUsedGB,
			Percent: live.HDDPercent,
		})
	}
	rep.Storage = volumes

	rep.Containers = buildContainerReport(stackMgr, metricsStore)
	rep.Backup = buildBackupReport(cfg, backupMgr)

	// Health: nil lists become empty ones so they encode as [] instead of null.
	check := monitor.RunHealthCheck(cfg, cpuCollector)
	issues, warnings := check.Issues, check.Warnings
	if issues == nil {
		issues = []string{}
	}
	if warnings == nil {
		warnings = []string{}
	}
	rep.Health = HealthReport{
		Status:   check.Status,
		Issues:   issues,
		Warnings: warnings,
	}

	rep.Stacks = buildStacksReport(stackMgr)

	return rep
}
// buildContainerReport counts the containers of all deployed stacks by state
// and lists each one, attaching CPU/memory figures from the metrics store when
// an entry with the same container name exists.
//
// A nil metricsStore or a failing summary query simply leaves the per-container
// figures at zero. Starting and unhealthy containers are both counted as
// running; unhealthy ones are additionally counted under Unhealthy.
func buildContainerReport(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore) ContainerReport {
	known := stackMgr.GetStacks()

	// Index the latest per-container stats by container name.
	usage := map[string]metrics.ContainerCurrentStats{}
	if metricsStore != nil {
		rows, err := metricsStore.QueryContainerSummary()
		if err == nil {
			for _, row := range rows {
				usage[row.ContainerName] = row
			}
		}
	}

	// Start from an empty (non-nil) list so the JSON field is [] when nothing is deployed.
	summary := ContainerReport{List: []ContainerDetailReport{}}

	for _, stack := range known {
		if !stack.Deployed {
			continue
		}
		for _, ctr := range stack.Containers {
			summary.Total++

			switch {
			case ctr.State == stacks.StateUnhealthy:
				// Unhealthy containers are still running.
				summary.Unhealthy++
				summary.Running++
			case ctr.State == stacks.StateRunning || ctr.State == stacks.StateStarting:
				summary.Running++
			default:
				summary.Stopped++
			}

			entry := ContainerDetailReport{Name: ctr.Name, State: string(ctr.State)}
			if cur, found := usage[ctr.Name]; found {
				entry.CPUPercent = cur.CPUPercent
				entry.MemoryMB = cur.MemUsageMB
			}
			summary.List = append(summary.List, entry)
		}
	}

	return summary
}
// buildBackupReport summarises the backup subsystem for the hub report.
//
// With a nil backupMgr only the Enabled flag is filled in. Otherwise the
// manager's full status supplies the last DB dump / snapshot / integrity-check
// times (left nil when unknown), the snapshot count, the repository size
// converted to MB, and the last integrity result.
func buildBackupReport(cfg *config.Config, backupMgr *backup.Manager) BackupReport {
	if backupMgr == nil {
		return BackupReport{Enabled: cfg.Backup.Enabled}
	}

	st := backupMgr.GetFullStatus(
		scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule),
		scheduler.NextDailyRun(cfg.Backup.ResticSchedule),
	)

	out := BackupReport{
		Enabled:     cfg.Backup.Enabled,
		IntegrityOK: st.LastCheckOK,
	}

	// Each timestamp is copied into its own variable before its address is taken.
	if dump := st.LastDBDump; dump != nil {
		when := dump.LastRun
		out.LastDBDump = &when
	}
	if snap := st.LastBackup; snap != nil {
		when := snap.LastRun
		out.LastSnapshot = &when
	}
	if repo := st.RepoStats; repo != nil {
		out.SnapshotCount = repo.SnapshotCount
		out.RepoSizeMB = parseSizeToMB(repo.TotalSize)
	}
	if checked := st.LastCheckTime; !checked.IsZero() {
		out.LastIntegrityCheck = &checked
	}

	return out
}
// buildStacksReport splits the names of all non-protected stacks into those
// that are deployed and those that are merely available. Both lists are always
// non-nil so they encode as JSON arrays.
func buildStacksReport(stackMgr *stacks.Manager) StacksReport {
	out := StacksReport{
		Deployed:  []string{},
		Available: []string{},
	}

	for _, st := range stackMgr.GetStacks() {
		switch {
		case st.Protected:
			// Protected stacks are left out of the report.
		case st.Deployed:
			out.Deployed = append(out.Deployed, st.Name)
		default:
			out.Available = append(out.Available, st.Name)
		}
	}

	return out
}
// parseSizeToMB converts a formatted size string such as "1.5 GB" or
// "512.0 MB" into whole megabytes (1024-based, truncated toward zero).
//
// The input must be exactly "<number> <unit>" separated by whitespace. The
// unit is case-insensitive and may be B, KB, MB, GB or TB, or the IEC
// spellings KiB, MiB, GiB, TiB, which are treated the same. Anything that does
// not parse (empty string, wrong field count, non-numeric value) yields 0.
// An unrecognised unit keeps the historical behaviour of treating the number
// as megabytes.
func parseSizeToMB(s string) int64 {
	// strings.Fields already ignores leading/trailing whitespace, so an empty
	// or blank string produces zero fields and falls out here.
	fields := strings.Fields(s)
	if len(fields) != 2 {
		return 0
	}

	val, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0
	}

	switch strings.ToUpper(fields[1]) {
	case "TB", "TIB":
		return int64(val * 1024 * 1024)
	case "GB", "GIB":
		return int64(val * 1024)
	case "MB", "MIB":
		return int64(val)
	case "KB", "KIB":
		return int64(val / 1024)
	case "B":
		return int64(val / (1024 * 1024))
	default:
		return int64(val)
	}
}
+86
View File
@@ -0,0 +1,86 @@
package report
import (
"bytes"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"strings"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
)
// Pusher sends reports to the central hub.
// Create it with NewPusher; Push does the actual HTTP POST.
type Pusher struct {
// hubURL is the hub base URL with trailing slashes removed (see NewPusher).
hubURL string
// apiKey, when non-empty, is sent as a Bearer token in the Authorization header.
apiKey string
// httpClient performs the requests; NewPusher gives it a 30-second timeout.
httpClient *http.Client
// logger receives the [INFO]/[WARN] lines written by Push.
logger *log.Logger
// enabled gates Push: when false, Push returns immediately without sending.
enabled bool
}
// NewPusher builds a Pusher from the hub section of the configuration.
//
// Trailing slashes are stripped from cfg.URL so the endpoint path can be
// appended directly, and the HTTP client is given a 30-second timeout.
// cfg and logger must be non-nil.
func NewPusher(cfg *config.HubConfig, logger *log.Logger) *Pusher {
	client := &http.Client{Timeout: 30 * time.Second}

	p := new(Pusher)
	p.hubURL = strings.TrimRight(cfg.URL, "/")
	p.apiKey = cfg.APIKey
	p.httpClient = client
	p.logger = logger
	p.enabled = cfg.Enabled
	return p
}
// Push marshals report to JSON and POSTs it to <hubURL>/api/v1/report.
//
// Transport errors and non-2xx responses are retried, up to 3 attempts in
// total with a 5-second pause between them. A request that cannot even be
// constructed (for example an unparsable hub URL) is not retried, because the
// outcome cannot change between attempts.
//
// Push does nothing when the pusher is disabled. It always returns nil: push
// failures are only logged so they never affect controller operation.
//
// Note: the pauses use time.Sleep and cannot be cancelled, so a fully failing
// push blocks the caller for the two pauses plus the HTTP timeouts.
func (p *Pusher) Push(report *Report) error {
	if !p.enabled {
		return nil
	}

	data, err := json.Marshal(report)
	if err != nil {
		p.logger.Printf("[WARN] Hub report marshal failed: %v", err)
		return nil
	}

	const (
		maxAttempts = 3
		retryDelay  = 5 * time.Second
		// maxDrain bounds how much of a response body is read before closing,
		// so a misbehaving hub cannot make the controller read without limit.
		maxDrain = 64 << 10
	)

	url := p.hubURL + "/api/v1/report"

	var lastErr error
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if attempt > 0 {
			time.Sleep(retryDelay)
		}

		// A fresh request is needed per attempt because the body reader is consumed.
		req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data))
		if err != nil {
			// Deterministic failure: sleeping and retrying would not help.
			p.logger.Printf("[WARN] Hub report request could not be built: %v", err)
			return nil
		}
		req.Header.Set("Content-Type", "application/json")
		if p.apiKey != "" {
			req.Header.Set("Authorization", "Bearer "+p.apiKey)
		}

		resp, err := p.httpClient.Do(req)
		if err != nil {
			lastErr = err
			continue
		}

		// Drain (bounded) and close so the connection can be reused; errors
		// while discarding the body are irrelevant to the push outcome.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrain))
		resp.Body.Close()

		if resp.StatusCode >= 200 && resp.StatusCode < 300 {
			p.logger.Printf("[INFO] Hub report pushed successfully (%d bytes)", len(data))
			return nil
		}
		lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
	}

	p.logger.Printf("[WARN] Hub report push failed after %d attempts: %v", maxAttempts, lastErr)
	return nil
}
+85
View File
@@ -0,0 +1,85 @@
package report
import "time"
// Report is the JSON payload pushed to the central hub.
// BuildReport assembles it and Pusher.Push marshals and sends it.
type Report struct {
// Version is set to 1 by BuildReport; presumably the payload schema version
// (it is distinct from ControllerVersion) — confirm with the hub side.
Version int `json:"version"`
// CustomerID and CustomerName are copied from the customer section of the config.
CustomerID string `json:"customer_id"`
CustomerName string `json:"customer_name"`
// ControllerVersion is the version string passed to BuildReport.
ControllerVersion string `json:"controller_version"`
// Timestamp is the UTC time at which BuildReport started building the report.
Timestamp time.Time `json:"timestamp"`
System SystemReport `json:"system"`
// Storage holds the root filesystem first, followed by the HDD when one is configured.
Storage []StorageReport `json:"storage"`
Containers ContainerReport `json:"containers"`
Backup BackupReport `json:"backup"`
Health HealthReport `json:"health"`
Stacks StacksReport `json:"stacks"`
}
// SystemReport holds host-level system info.
// BuildReport fills Hostname through UptimeSeconds from metrics.GetStaticInfo
// and the remaining fields from system.GetInfo.
type SystemReport struct {
Hostname string `json:"hostname"`
OS string `json:"os"`
Kernel string `json:"kernel"`
CPUModel string `json:"cpu_model"`
CPUCores int `json:"cpu_cores"`
UptimeSeconds int64 `json:"uptime_seconds"`
CPUPercent float64 `json:"cpu_percent"`
MemoryTotalMB uint64 `json:"memory_total_mb"`
MemoryUsedMB uint64 `json:"memory_used_mb"`
MemoryPercent float64 `json:"memory_percent"`
TemperatureCelsius float64 `json:"temperature_celsius"`
LoadAvg1 float64 `json:"load_avg_1"`
LoadAvg5 float64 `json:"load_avg_5"`
LoadAvg15 float64 `json:"load_avg_15"`
}
// StorageReport holds disk usage for a mount point.
type StorageReport struct {
// Mount is the mount path: "/" for the root filesystem, or the configured
// HDD path for the optional second entry.
Mount string `json:"mount"`
TotalGB float64 `json:"total_gb"`
UsedGB float64 `json:"used_gb"`
// Percent is the usage percentage as reported by system.GetInfo.
Percent float64 `json:"percent"`
}
// ContainerReport holds aggregate and per-container status.
// Only containers belonging to deployed stacks are counted.
type ContainerReport struct {
// Total is the number of containers seen.
Total int `json:"total"`
// Running counts containers in the running, starting or unhealthy state.
Running int `json:"running"`
// Stopped counts containers in any other state.
Stopped int `json:"stopped"`
// Unhealthy counts unhealthy containers; these are also included in Running.
Unhealthy int `json:"unhealthy"`
// List has one entry per counted container; it is set to an empty slice
// rather than nil when there are none.
List []ContainerDetailReport `json:"list"`
}
// ContainerDetailReport holds per-container info.
type ContainerDetailReport struct {
Name string `json:"name"`
// State is the container state converted to a string.
State string `json:"state"`
// CPUPercent and MemoryMB come from the metrics store entry with the same
// container name; they stay zero when no such entry exists.
CPUPercent float64 `json:"cpu_percent"`
MemoryMB float64 `json:"memory_mb"`
}
// BackupReport holds backup subsystem status.
// The pointer timestamps are nil (and omitted from the JSON) when the
// corresponding event is unknown.
type BackupReport struct {
// Enabled mirrors the backup.enabled config flag.
Enabled bool `json:"enabled"`
LastDBDump *time.Time `json:"last_db_dump,omitempty"`
LastSnapshot *time.Time `json:"last_snapshot,omitempty"`
SnapshotCount int `json:"snapshot_count"`
// RepoSizeMB is derived from the repository's formatted size string via parseSizeToMB.
RepoSizeMB int64 `json:"repo_size_mb"`
// LastIntegrityCheck is set only when the recorded check time is non-zero.
LastIntegrityCheck *time.Time `json:"last_integrity_check,omitempty"`
// IntegrityOK is copied from the backup status unconditionally, so it should
// be read together with LastIntegrityCheck.
IntegrityOK bool `json:"integrity_ok"`
}
// HealthReport holds the aggregated health status, copied from the result of
// monitor.RunHealthCheck.
type HealthReport struct {
Status string `json:"status"`
// Issues and Warnings are replaced by empty slices in BuildReport when nil,
// so they encode as [] rather than null.
Issues []string `json:"issues"`
Warnings []string `json:"warnings"`
}
// StacksReport holds stack deployment status.
// Protected stacks are excluded from both lists; both are always non-nil.
type StacksReport struct {
// Deployed lists the names of deployed stacks.
Deployed []string `json:"deployed"`
// Available lists the names of stacks that are not deployed.
Available []string `json:"available"`
}