Files
deploy-felhom-compose/controller/internal/report/telemetry.go
T
admin e737704e68 fix: skip stopped apps in telemetry to avoid zero-value averages on hub
Deployed-but-stopped apps were included in telemetry reports with all-zero
memory/CPU values, dragging down hub-side averages. Now isStackRunning()
filters to only running/starting/unhealthy/restarting states.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 15:05:39 +01:00

139 lines
4.1 KiB
Go

package report
import (
"log"
"sort"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
)
// buildAppTelemetrySection gathers the per-app telemetry for a report.
//
// It queries metricsStore for container telemetry covering the last 15
// minutes, scans the logs of all eligible containers over the same window,
// and passes both to buildAppTelemetry for aggregation.
//
// A stack is eligible when it is not protected, is deployed, and
// isStackRunning accepts its state. metricsStore may be nil, in which case
// the metrics query is skipped. A failed metrics query is logged as a
// warning (when logger is non-nil) and does not abort the collection.
func buildAppTelemetrySection(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry {
	// Look-back window shared by the metrics query and the log scan.
	const window = 15 * time.Minute

	allStacks := stackMgr.GetStacks()

	// Metrics telemetry for the trailing window.
	var telemetry []metrics.ContainerTelemetry
	if metricsStore != nil {
		since := time.Now().Add(-window)
		var queryErr error
		telemetry, queryErr = metricsStore.GetContainerTelemetry(since)
		if queryErr != nil && logger != nil {
			logger.Printf("[WARN] Telemetry metrics query failed: %v", queryErr)
		}
	}

	// Names of the containers whose logs are scanned; the eligibility rule
	// is the same one buildAppTelemetry applies.
	var names []string
	for i := range allStacks {
		stack := &allStacks[i]
		if !stack.Protected && stack.Deployed && isStackRunning(stack.State) {
			for _, container := range stack.Containers {
				names = append(names, container.Name)
			}
		}
	}

	logSummaries := metrics.ScanContainerLogs(names, window, logger)

	return buildAppTelemetry(allStacks, telemetry, logSummaries)
}
// buildAppTelemetry aggregates container-level telemetry and log data into
// per-stack AppTelemetry entries.
//
// Stacks that are protected, not deployed, or rejected by isStackRunning are
// skipped. For every remaining stack the memory and CPU figures and the log
// error/warning counts of its containers are summed, and the containers'
// recent log issues are merged, ordered by descending Count (ties: later
// LastSeen first) and capped at maxIssues entries.
//
// telemetry and logs are matched to containers by ContainerName; a container
// without a matching entry contributes nothing to the sums.
//
// The returned slice is sorted by AppName and is never nil. The Containers
// and Issues fields of every entry are never nil either.
func buildAppTelemetry(allStacks []stacks.Stack, telemetry []metrics.ContainerTelemetry, logs []metrics.ContainerLogSummary) []AppTelemetry {
	// Maximum number of merged log issues kept per app.
	const maxIssues = 10

	// Lookup maps keyed by container name.
	telemetryMap := make(map[string]metrics.ContainerTelemetry, len(telemetry))
	for _, ct := range telemetry {
		telemetryMap[ct.ContainerName] = ct
	}
	logMap := make(map[string]metrics.ContainerLogSummary, len(logs))
	for _, ls := range logs {
		logMap[ls.ContainerName] = ls
	}

	// Allocated up front so the result is non-nil even when every stack is
	// filtered out.
	result := make([]AppTelemetry, 0, len(allStacks))
	for _, s := range allStacks {
		if s.Protected || !s.Deployed || !isStackRunning(s.State) {
			continue
		}

		app := AppTelemetry{
			AppName:         s.Name,
			DisplayName:     s.Meta.DisplayName,
			CatalogEstimate: s.Meta.Resources.MemRequest,
			CatalogLimit:    s.Meta.Resources.MemLimit,
			// Non-nil even for a stack without containers.
			Containers: make([]string, 0, len(s.Containers)),
		}

		// Starts as an empty, non-nil slice so Issues is normalized the same
		// way as Containers and the result slice.
		// NOTE(review): presumably this matters for JSON encoding ([] rather
		// than null) - confirm against AppTelemetry's field tags.
		mergedIssues := []metrics.LogIssue{}

		for _, c := range s.Containers {
			app.Containers = append(app.Containers, c.Name)

			if ct, ok := telemetryMap[c.Name]; ok {
				app.MemoryCurrentMB += ct.MemoryCurrentMB
				app.MemoryAvgMB += ct.MemoryAvgMB
				// NOTE(review): per-container peaks are summed; if they are
				// independent peaks the total is an upper bound rather than
				// a simultaneous peak - confirm this is intended.
				app.MemoryPeakMB += ct.MemoryPeakMB
				app.CPUAvgPercent += ct.CPUAvgPercent
			}

			if ls, ok := logMap[c.Name]; ok {
				app.LogErrors += ls.ErrorCount
				app.LogWarnings += ls.WarnCount
				mergedIssues = append(mergedIssues, ls.RecentIssues...)
			}
		}

		// Highest count first; ties go to the later LastSeen.
		sort.Slice(mergedIssues, func(i, j int) bool {
			if mergedIssues[i].Count != mergedIssues[j].Count {
				return mergedIssues[i].Count > mergedIssues[j].Count
			}
			return mergedIssues[i].LastSeen.After(mergedIssues[j].LastSeen)
		})
		if len(mergedIssues) > maxIssues {
			mergedIssues = mergedIssues[:maxIssues]
		}
		app.Issues = mergedIssues

		result = append(result, app)
	}

	// Sort by app name.
	sort.Slice(result, func(i, j int) bool {
		return result[i].AppName < result[j].AppName
	})

	return result
}
// isStackRunning reports whether state is one of StateRunning,
// StateStarting, StateUnhealthy or StateRestarting. Every other state
// yields false.
//
// The telemetry builders in this file use it to leave out stacks that are
// not up, so that no zero-value telemetry is sent to the hub for them.
func isStackRunning(state stacks.ContainerState) bool {
	return state == stacks.StateRunning ||
		state == stacks.StateStarting ||
		state == stacks.StateUnhealthy ||
		state == stacks.StateRestarting
}
// BuildAppTelemetryForDebug exposes the telemetry collection pipeline to
// callers outside the package. It forwards its arguments unchanged to
// buildAppTelemetrySection and returns that function's result.
//
// It is intended for the debug endpoint, which previews the telemetry
// without pushing it to the hub.
func BuildAppTelemetryForDebug(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry {
	apps := buildAppTelemetrySection(stackMgr, metricsStore, logger)
	return apps
}