62d26be8ae
Add the felhom-controller container as a special entry in the app_telemetry array sent to the hub. This reuses all existing hub infrastructure (storage, aggregation, UI) with zero hub-side changes. The controller's memory/CPU metrics and log warnings/errors are now collected alongside app telemetry, giving the hub visibility into controller health, memory trends, and known issues. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
201 lines
5.9 KiB
Go
201 lines
5.9 KiB
Go
package report
|
|
|
|
import (
|
|
"log"
|
|
"sort"
|
|
"time"
|
|
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
|
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
|
)
|
|
|
|
// controllerContainerName is the Docker container name for the controller itself.
|
|
// controllerContainerName is also appended to the log-scan target list and
// reused as the AppName and the only Containers element of the controller's
// own telemetry entry.
const controllerContainerName = "felhom-controller"
|
|
|
|
// buildAppTelemetrySection collects metrics telemetry and log scans for all
|
|
// non-protected, deployed stacks plus the controller itself, and returns
|
|
// per-app telemetry data.
|
|
func buildAppTelemetrySection(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry {
|
|
allStacks := stackMgr.GetStacks()
|
|
|
|
// 1. Get metrics telemetry (last 15 minutes)
|
|
var telemetry []metrics.ContainerTelemetry
|
|
if metricsStore != nil {
|
|
var err error
|
|
telemetry, err = metricsStore.GetContainerTelemetry(time.Now().Add(-15 * time.Minute))
|
|
if err != nil && logger != nil {
|
|
logger.Printf("[WARN] Telemetry metrics query failed: %v", err)
|
|
}
|
|
}
|
|
|
|
// 2. Collect non-protected container names for log scan
|
|
var containerNames []string
|
|
for _, s := range allStacks {
|
|
if s.Protected || !s.Deployed || !isStackRunning(s.State) {
|
|
continue
|
|
}
|
|
for _, c := range s.Containers {
|
|
containerNames = append(containerNames, c.Name)
|
|
}
|
|
}
|
|
|
|
// 3. Include controller container in log scan
|
|
containerNames = append(containerNames, controllerContainerName)
|
|
|
|
// 4. Scan logs (includes controller)
|
|
logs := metrics.ScanContainerLogs(containerNames, 15*time.Minute, logger)
|
|
|
|
// 5. Build per-app telemetry (stacks only)
|
|
result := buildAppTelemetry(allStacks, telemetry, logs)
|
|
|
|
// 6. Append controller telemetry entry
|
|
if ctrl := buildControllerTelemetry(telemetry, logs); ctrl != nil {
|
|
result = append(result, *ctrl)
|
|
sort.Slice(result, func(i, j int) bool {
|
|
return result[i].AppName < result[j].AppName
|
|
})
|
|
}
|
|
|
|
if result == nil {
|
|
result = []AppTelemetry{}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// buildAppTelemetry aggregates container-level telemetry and log data into per-stack AppTelemetry entries.
|
|
func buildAppTelemetry(allStacks []stacks.Stack, telemetry []metrics.ContainerTelemetry, logs []metrics.ContainerLogSummary) []AppTelemetry {
|
|
// Build lookup maps
|
|
telemetryMap := make(map[string]metrics.ContainerTelemetry, len(telemetry))
|
|
for _, ct := range telemetry {
|
|
telemetryMap[ct.ContainerName] = ct
|
|
}
|
|
|
|
logMap := make(map[string]metrics.ContainerLogSummary, len(logs))
|
|
for _, ls := range logs {
|
|
logMap[ls.ContainerName] = ls
|
|
}
|
|
|
|
var result []AppTelemetry
|
|
|
|
for _, s := range allStacks {
|
|
if s.Protected || !s.Deployed || !isStackRunning(s.State) {
|
|
continue
|
|
}
|
|
|
|
app := AppTelemetry{
|
|
AppName: s.Name,
|
|
DisplayName: s.Meta.DisplayName,
|
|
CatalogEstimate: s.Meta.Resources.MemRequest,
|
|
CatalogLimit: s.Meta.Resources.MemLimit,
|
|
}
|
|
|
|
var mergedIssues []metrics.LogIssue
|
|
|
|
for _, c := range s.Containers {
|
|
app.Containers = append(app.Containers, c.Name)
|
|
|
|
if ct, ok := telemetryMap[c.Name]; ok {
|
|
app.MemoryCurrentMB += ct.MemoryCurrentMB
|
|
app.MemoryAvgMB += ct.MemoryAvgMB
|
|
app.MemoryPeakMB += ct.MemoryPeakMB
|
|
app.CPUAvgPercent += ct.CPUAvgPercent
|
|
}
|
|
|
|
if ls, ok := logMap[c.Name]; ok {
|
|
app.LogErrors += ls.ErrorCount
|
|
app.LogWarnings += ls.WarnCount
|
|
mergedIssues = append(mergedIssues, ls.RecentIssues...)
|
|
}
|
|
}
|
|
|
|
if app.Containers == nil {
|
|
app.Containers = []string{}
|
|
}
|
|
|
|
// Sort and cap merged issues at 10
|
|
sort.Slice(mergedIssues, func(i, j int) bool {
|
|
if mergedIssues[i].Count != mergedIssues[j].Count {
|
|
return mergedIssues[i].Count > mergedIssues[j].Count
|
|
}
|
|
return mergedIssues[i].LastSeen.After(mergedIssues[j].LastSeen)
|
|
})
|
|
if len(mergedIssues) > 10 {
|
|
mergedIssues = mergedIssues[:10]
|
|
}
|
|
app.Issues = mergedIssues
|
|
|
|
result = append(result, app)
|
|
}
|
|
|
|
// Sort by app name
|
|
sort.Slice(result, func(i, j int) bool {
|
|
return result[i].AppName < result[j].AppName
|
|
})
|
|
|
|
if result == nil {
|
|
result = []AppTelemetry{}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// buildControllerTelemetry creates a telemetry entry for the controller container.
|
|
// Returns nil if no metrics or log data is available.
|
|
func buildControllerTelemetry(telemetry []metrics.ContainerTelemetry, logs []metrics.ContainerLogSummary) *AppTelemetry {
|
|
app := AppTelemetry{
|
|
AppName: controllerContainerName,
|
|
DisplayName: "Felhom Controller",
|
|
Containers: []string{controllerContainerName},
|
|
}
|
|
|
|
// Find metrics for the controller container
|
|
for _, ct := range telemetry {
|
|
if ct.ContainerName == controllerContainerName {
|
|
app.MemoryCurrentMB = ct.MemoryCurrentMB
|
|
app.MemoryAvgMB = ct.MemoryAvgMB
|
|
app.MemoryPeakMB = ct.MemoryPeakMB
|
|
app.CPUAvgPercent = ct.CPUAvgPercent
|
|
break
|
|
}
|
|
}
|
|
|
|
// Find log scan results for the controller container
|
|
for _, ls := range logs {
|
|
if ls.ContainerName == controllerContainerName {
|
|
app.LogErrors = ls.ErrorCount
|
|
app.LogWarnings = ls.WarnCount
|
|
issues := ls.RecentIssues
|
|
if len(issues) > 10 {
|
|
issues = issues[:10]
|
|
}
|
|
app.Issues = issues
|
|
break
|
|
}
|
|
}
|
|
|
|
// Only include if we have at least metrics or log data
|
|
if app.MemoryCurrentMB == 0 && app.MemoryAvgMB == 0 && app.LogErrors == 0 && app.LogWarnings == 0 {
|
|
return nil
|
|
}
|
|
|
|
return &app
|
|
}
|
|
|
|
// isStackRunning returns true if the stack has containers actively running
|
|
// (running, starting, or unhealthy but still up). Stopped, exited, deploying
|
|
// etc. are excluded to avoid sending zero-value telemetry to the hub.
|
|
func isStackRunning(state stacks.ContainerState) bool {
|
|
switch state {
|
|
case stacks.StateRunning, stacks.StateStarting, stacks.StateUnhealthy, stacks.StateRestarting:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
// BuildAppTelemetryForDebug runs the full telemetry collection pipeline
|
|
// (metrics query + log scan) and returns per-app telemetry data.
|
|
// Used by the debug endpoint to preview telemetry without pushing to hub.
|
|
func BuildAppTelemetryForDebug(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry {
|
|
return buildAppTelemetrySection(stackMgr, metricsStore, logger)
|
|
}
|