v0.6.0: healthcheck + hub reporting implementation
- Add heartbeat ping (every 5 min, controller alive signal)
- Add backup integrity check (weekly restic check, Sunday 04:00)
- Add Heartbeat + BackupIntegrity fields to PingUUIDsConfig
- Add HubConfig for central hub reporting
- Add report package (types, builder, pusher) for hub push
- Wire hub reporting into scheduler (configurable interval)
- Update controller.yaml.example with new monitoring + hub sections
- Add monitoring/DEPRECATED.md for legacy bash scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,230 @@
|
||||
package report
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/backup"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/monitor"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/scheduler"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
||||
)
|
||||
|
||||
// BuildReport collects current state from all subsystems and returns a Report.
|
||||
func BuildReport(
|
||||
cfg *config.Config,
|
||||
stackMgr *stacks.Manager,
|
||||
backupMgr *backup.Manager,
|
||||
cpuCollector *system.CPUCollector,
|
||||
metricsStore *metrics.MetricsStore,
|
||||
version string,
|
||||
) *Report {
|
||||
r := &Report{
|
||||
Version: 1,
|
||||
CustomerID: cfg.Customer.ID,
|
||||
CustomerName: cfg.Customer.Name,
|
||||
ControllerVersion: version,
|
||||
Timestamp: time.Now().UTC(),
|
||||
}
|
||||
|
||||
// System info
|
||||
staticInfo := metrics.GetStaticInfo()
|
||||
sysInfo := system.GetInfo(cfg.Paths.HDDPath, cpuCollector)
|
||||
|
||||
r.System = SystemReport{
|
||||
Hostname: staticInfo.Hostname,
|
||||
OS: staticInfo.OS,
|
||||
Kernel: staticInfo.Kernel,
|
||||
CPUModel: staticInfo.CPUModel,
|
||||
CPUCores: staticInfo.CPUCores,
|
||||
UptimeSeconds: staticInfo.UptimeSeconds,
|
||||
CPUPercent: sysInfo.CPUPercent,
|
||||
MemoryTotalMB: sysInfo.TotalMemMB,
|
||||
MemoryUsedMB: sysInfo.UsedMemMB,
|
||||
MemoryPercent: sysInfo.MemPercent,
|
||||
TemperatureCelsius: sysInfo.TemperatureCelsius,
|
||||
LoadAvg1: sysInfo.LoadAvg1,
|
||||
LoadAvg5: sysInfo.LoadAvg5,
|
||||
LoadAvg15: sysInfo.LoadAvg15,
|
||||
}
|
||||
|
||||
// Storage
|
||||
r.Storage = []StorageReport{
|
||||
{Mount: "/", TotalGB: sysInfo.DiskTotalGB, UsedGB: sysInfo.DiskUsedGB, Percent: sysInfo.DiskPercent},
|
||||
}
|
||||
if sysInfo.HDDConfigured {
|
||||
r.Storage = append(r.Storage, StorageReport{
|
||||
Mount: cfg.Paths.HDDPath,
|
||||
TotalGB: sysInfo.HDDTotalGB,
|
||||
UsedGB: sysInfo.HDDUsedGB,
|
||||
Percent: sysInfo.HDDPercent,
|
||||
})
|
||||
}
|
||||
|
||||
// Containers
|
||||
r.Containers = buildContainerReport(stackMgr, metricsStore)
|
||||
|
||||
// Backup
|
||||
r.Backup = buildBackupReport(cfg, backupMgr)
|
||||
|
||||
// Health
|
||||
healthReport := monitor.RunHealthCheck(cfg, cpuCollector)
|
||||
r.Health = HealthReport{
|
||||
Status: healthReport.Status,
|
||||
Issues: healthReport.Issues,
|
||||
Warnings: healthReport.Warnings,
|
||||
}
|
||||
if r.Health.Issues == nil {
|
||||
r.Health.Issues = []string{}
|
||||
}
|
||||
if r.Health.Warnings == nil {
|
||||
r.Health.Warnings = []string{}
|
||||
}
|
||||
|
||||
// Stacks
|
||||
r.Stacks = buildStacksReport(stackMgr)
|
||||
|
||||
return r
|
||||
}
|
||||
|
||||
func buildContainerReport(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore) ContainerReport {
|
||||
cr := ContainerReport{}
|
||||
|
||||
allStacks := stackMgr.GetStacks()
|
||||
|
||||
// Build a map of container stats from metrics store
|
||||
statsMap := make(map[string]metrics.ContainerCurrentStats)
|
||||
if metricsStore != nil {
|
||||
if stats, err := metricsStore.QueryContainerSummary(); err == nil {
|
||||
for _, s := range stats {
|
||||
statsMap[s.ContainerName] = s
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, s := range allStacks {
|
||||
if !s.Deployed {
|
||||
continue
|
||||
}
|
||||
for _, c := range s.Containers {
|
||||
cr.Total++
|
||||
switch c.State {
|
||||
case stacks.StateRunning, stacks.StateStarting:
|
||||
cr.Running++
|
||||
case stacks.StateUnhealthy:
|
||||
cr.Unhealthy++
|
||||
cr.Running++ // unhealthy containers are still running
|
||||
default:
|
||||
cr.Stopped++
|
||||
}
|
||||
|
||||
detail := ContainerDetailReport{
|
||||
Name: c.Name,
|
||||
State: string(c.State),
|
||||
}
|
||||
if cs, ok := statsMap[c.Name]; ok {
|
||||
detail.CPUPercent = cs.CPUPercent
|
||||
detail.MemoryMB = cs.MemUsageMB
|
||||
}
|
||||
cr.List = append(cr.List, detail)
|
||||
}
|
||||
}
|
||||
|
||||
if cr.List == nil {
|
||||
cr.List = []ContainerDetailReport{}
|
||||
}
|
||||
|
||||
return cr
|
||||
}
|
||||
|
||||
func buildBackupReport(cfg *config.Config, backupMgr *backup.Manager) BackupReport {
|
||||
br := BackupReport{
|
||||
Enabled: cfg.Backup.Enabled,
|
||||
}
|
||||
|
||||
if backupMgr == nil {
|
||||
return br
|
||||
}
|
||||
|
||||
nextDBDump := scheduler.NextDailyRun(cfg.Backup.DBDumpSchedule)
|
||||
nextBackup := scheduler.NextDailyRun(cfg.Backup.ResticSchedule)
|
||||
status := backupMgr.GetFullStatus(nextDBDump, nextBackup)
|
||||
|
||||
if status.LastDBDump != nil {
|
||||
t := status.LastDBDump.LastRun
|
||||
br.LastDBDump = &t
|
||||
}
|
||||
if status.LastBackup != nil {
|
||||
t := status.LastBackup.LastRun
|
||||
br.LastSnapshot = &t
|
||||
}
|
||||
if status.RepoStats != nil {
|
||||
br.SnapshotCount = status.RepoStats.SnapshotCount
|
||||
br.RepoSizeMB = parseSizeToMB(status.RepoStats.TotalSize)
|
||||
}
|
||||
if !status.LastCheckTime.IsZero() {
|
||||
t := status.LastCheckTime
|
||||
br.LastIntegrityCheck = &t
|
||||
}
|
||||
br.IntegrityOK = status.LastCheckOK
|
||||
|
||||
return br
|
||||
}
|
||||
|
||||
func buildStacksReport(stackMgr *stacks.Manager) StacksReport {
|
||||
sr := StacksReport{}
|
||||
allStacks := stackMgr.GetStacks()
|
||||
|
||||
for _, s := range allStacks {
|
||||
if s.Protected {
|
||||
continue
|
||||
}
|
||||
if s.Deployed {
|
||||
sr.Deployed = append(sr.Deployed, s.Name)
|
||||
} else {
|
||||
sr.Available = append(sr.Available, s.Name)
|
||||
}
|
||||
}
|
||||
|
||||
if sr.Deployed == nil {
|
||||
sr.Deployed = []string{}
|
||||
}
|
||||
if sr.Available == nil {
|
||||
sr.Available = []string{}
|
||||
}
|
||||
|
||||
return sr
|
||||
}
|
||||
|
||||
// parseSizeToMB converts a human-readable size such as "1.5 GB", "512.0 MB",
// "1.5GiB" or "2 TB" into whole megabytes (1 MB = 1024 KB; the fractional
// part is truncated).
//
// The number and unit may be separated by whitespace or written together.
// Units are matched case-insensitively; B, KB, MB, GB and TB are recognised,
// together with their IEC spellings (KiB, MiB, GiB, TiB), all on a 1024 base.
// An unrecognised unit is treated as MB, which matches this function's
// historical behaviour.
//
// It returns 0 for empty or malformed input, for a missing unit, and for
// values that are negative, NaN or too large for int64.
func parseSizeToMB(s string) int64 {
	s = strings.TrimSpace(s)
	if s == "" {
		return 0
	}

	var num, unit string
	switch parts := strings.Fields(s); len(parts) {
	case 2:
		num, unit = parts[0], parts[1]
	case 1:
		// No separating space ("1.5GB"): peel the trailing letters off as the unit.
		num = strings.TrimRightFunc(s, func(r rune) bool {
			return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
		})
		unit = s[len(num):]
	default:
		return 0
	}
	if num == "" || unit == "" {
		return 0
	}

	val, err := strconv.ParseFloat(num, 64)
	if err != nil {
		return 0
	}

	// All factors are powers of two, so the scaling itself is exact.
	factor := 1.0
	switch strings.ToUpper(unit) {
	case "B":
		factor = 1.0 / (1024 * 1024)
	case "KB", "KIB":
		factor = 1.0 / 1024
	case "MB", "MIB":
		factor = 1
	case "GB", "GIB":
		factor = 1024
	case "TB", "TIB":
		factor = 1024 * 1024
	}

	mb := val * factor
	// Converting NaN, ±Inf or out-of-range floats to int64 is not well defined,
	// and a negative size is meaningless, so report all of those as 0.
	if mb != mb || mb < 0 || mb >= 1<<63 {
		return 0
	}
	return int64(mb)
}
|
||||
@@ -0,0 +1,86 @@
|
||||
package report
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
)
|
||||
|
||||
// Pusher delivers reports to the central hub over HTTP.
// Build one with NewPusher.
type Pusher struct {
	// hubURL is the hub base URL; NewPusher strips trailing slashes from it.
	hubURL string
	// apiKey, when non-empty, is sent by Push as a Bearer token.
	apiKey string
	// httpClient performs the POST requests.
	httpClient *http.Client
	// logger receives the outcome of every push attempt series.
	logger *log.Logger
	// enabled gates Push; when false Push does nothing.
	enabled bool
}
|
||||
|
||||
// NewPusher creates a new report pusher from hub configuration.
|
||||
func NewPusher(cfg *config.HubConfig, logger *log.Logger) *Pusher {
|
||||
return &Pusher{
|
||||
hubURL: strings.TrimRight(cfg.URL, "/"),
|
||||
apiKey: cfg.APIKey,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
},
|
||||
logger: logger,
|
||||
enabled: cfg.Enabled,
|
||||
}
|
||||
}
|
||||
|
||||
// Push sends a report to the hub. Retries 3 times with 5s backoff.
|
||||
// Never returns error to caller — push failures should not affect controller operation.
|
||||
func (p *Pusher) Push(report *Report) error {
|
||||
if !p.enabled {
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := json.Marshal(report)
|
||||
if err != nil {
|
||||
p.logger.Printf("[WARN] Hub report marshal failed: %v", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
url := p.hubURL + "/api/v1/report"
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < 3; attempt++ {
|
||||
if attempt > 0 {
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data))
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
if p.apiKey != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+p.apiKey)
|
||||
}
|
||||
|
||||
resp, err := p.httpClient.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
io.Copy(io.Discard, resp.Body)
|
||||
resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
|
||||
p.logger.Printf("[INFO] Hub report pushed successfully (%d bytes)", len(data))
|
||||
return nil
|
||||
}
|
||||
lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
p.logger.Printf("[WARN] Hub report push failed after 3 attempts: %v", lastErr)
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package report
|
||||
|
||||
import "time"
|
||||
|
||||
// Report is the JSON payload pushed to the central hub.
|
||||
type Report struct {
|
||||
Version int `json:"version"`
|
||||
CustomerID string `json:"customer_id"`
|
||||
CustomerName string `json:"customer_name"`
|
||||
ControllerVersion string `json:"controller_version"`
|
||||
Timestamp time.Time `json:"timestamp"`
|
||||
System SystemReport `json:"system"`
|
||||
Storage []StorageReport `json:"storage"`
|
||||
Containers ContainerReport `json:"containers"`
|
||||
Backup BackupReport `json:"backup"`
|
||||
Health HealthReport `json:"health"`
|
||||
Stacks StacksReport `json:"stacks"`
|
||||
}
|
||||
|
||||
// SystemReport describes the host: identity, CPU, memory, temperature and load.
type SystemReport struct {
	// Host identity.
	Hostname string `json:"hostname"`
	OS       string `json:"os"`
	Kernel   string `json:"kernel"`
	CPUModel string `json:"cpu_model"`
	CPUCores int    `json:"cpu_cores"`

	UptimeSeconds int64 `json:"uptime_seconds"`

	// Current resource usage.
	CPUPercent         float64 `json:"cpu_percent"`
	MemoryTotalMB      uint64  `json:"memory_total_mb"`
	MemoryUsedMB       uint64  `json:"memory_used_mb"`
	MemoryPercent      float64 `json:"memory_percent"`
	TemperatureCelsius float64 `json:"temperature_celsius"`

	// Load averages over 1, 5 and 15 minutes.
	LoadAvg1  float64 `json:"load_avg_1"`
	LoadAvg5  float64 `json:"load_avg_5"`
	LoadAvg15 float64 `json:"load_avg_15"`
}
|
||||
|
||||
// StorageReport is the disk usage of a single mount point.
type StorageReport struct {
	// Mount is the mount path, e.g. "/".
	Mount   string  `json:"mount"`
	TotalGB float64 `json:"total_gb"`
	UsedGB  float64 `json:"used_gb"`
	Percent float64 `json:"percent"`
}
|
||||
|
||||
// ContainerReport holds aggregate and per-container status.
|
||||
type ContainerReport struct {
|
||||
Total int `json:"total"`
|
||||
Running int `json:"running"`
|
||||
Stopped int `json:"stopped"`
|
||||
Unhealthy int `json:"unhealthy"`
|
||||
List []ContainerDetailReport `json:"list"`
|
||||
}
|
||||
|
||||
// ContainerDetailReport is the state and resource usage of one container.
type ContainerDetailReport struct {
	Name  string `json:"name"`
	State string `json:"state"`
	// CPUPercent and MemoryMB stay zero when no stats exist for the container.
	CPUPercent float64 `json:"cpu_percent"`
	MemoryMB   float64 `json:"memory_mb"`
}
|
||||
|
||||
// BackupReport summarises the backup subsystem.
// The timestamp pointers are nil, and omitted from the JSON, when the
// corresponding event has not been recorded.
type BackupReport struct {
	Enabled bool `json:"enabled"`

	LastDBDump   *time.Time `json:"last_db_dump,omitempty"`
	LastSnapshot *time.Time `json:"last_snapshot,omitempty"`

	SnapshotCount int   `json:"snapshot_count"`
	RepoSizeMB    int64 `json:"repo_size_mb"`

	LastIntegrityCheck *time.Time `json:"last_integrity_check,omitempty"`
	IntegrityOK        bool       `json:"integrity_ok"`
}
|
||||
|
||||
// HealthReport is the outcome of the aggregated health check.
type HealthReport struct {
	Status string `json:"status"`
	// Issues and Warnings are set to empty slices by BuildReport when there
	// are none, so they serialise as [].
	Issues   []string `json:"issues"`
	Warnings []string `json:"warnings"`
}
|
||||
|
||||
// StacksReport lists stack names grouped by deployment state.
type StacksReport struct {
	// Deployed holds the names of stacks that are currently deployed.
	Deployed []string `json:"deployed"`
	// Available holds the names of stacks that are not deployed.
	Available []string `json:"available"`
}
|
||||
Reference in New Issue
Block a user