feat: include controller in app telemetry reports
Add the felhom-controller container as a special entry in the app_telemetry array sent to the hub. This reuses all existing hub infrastructure (storage, aggregation, UI) with zero hub-side changes. The controller's memory/CPU metrics and log warnings/errors are now collected alongside app telemetry, giving the hub visibility into controller health, memory trends, and known issues.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,8 +9,12 @@ import (
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||
)
|
||||
|
||||
// controllerContainerName is the Docker container name for the controller itself.
// It is appended to the list of containers whose logs are scanned, used to pick
// the controller's rows out of the metrics and log-scan results, and reported as
// both the AppName and the single Containers entry of the controller's telemetry.
|
||||
const controllerContainerName = "felhom-controller"
|
||||
|
||||
// buildAppTelemetrySection collects metrics telemetry and log scans for all
|
||||
// non-protected, deployed stacks and returns per-app telemetry data.
|
||||
// non-protected, deployed stacks plus the controller itself, and returns
|
||||
// per-app telemetry data.
|
||||
func buildAppTelemetrySection(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry {
|
||||
allStacks := stackMgr.GetStacks()
|
||||
|
||||
@@ -35,11 +39,27 @@ func buildAppTelemetrySection(stackMgr *stacks.Manager, metricsStore *metrics.Me
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Scan logs
|
||||
// 3. Include controller container in log scan
|
||||
containerNames = append(containerNames, controllerContainerName)
|
||||
|
||||
// 4. Scan logs (includes controller)
|
||||
logs := metrics.ScanContainerLogs(containerNames, 15*time.Minute, logger)
|
||||
|
||||
// 4. Build per-app telemetry
|
||||
return buildAppTelemetry(allStacks, telemetry, logs)
|
||||
// 5. Build per-app telemetry (stacks only)
|
||||
result := buildAppTelemetry(allStacks, telemetry, logs)
|
||||
|
||||
// 6. Append controller telemetry entry
|
||||
if ctrl := buildControllerTelemetry(telemetry, logs); ctrl != nil {
|
||||
result = append(result, *ctrl)
|
||||
sort.Slice(result, func(i, j int) bool {
|
||||
return result[i].AppName < result[j].AppName
|
||||
})
|
||||
}
|
||||
|
||||
if result == nil {
|
||||
result = []AppTelemetry{}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// buildAppTelemetry aggregates container-level telemetry and log data into per-stack AppTelemetry entries.
|
||||
@@ -118,6 +138,48 @@ func buildAppTelemetry(allStacks []stacks.Stack, telemetry []metrics.ContainerTe
|
||||
return result
|
||||
}
|
||||
|
||||
// buildControllerTelemetry creates a telemetry entry for the controller container.
|
||||
// Returns nil if no metrics or log data is available.
|
||||
func buildControllerTelemetry(telemetry []metrics.ContainerTelemetry, logs []metrics.ContainerLogSummary) *AppTelemetry {
|
||||
app := AppTelemetry{
|
||||
AppName: controllerContainerName,
|
||||
DisplayName: "Felhom Controller",
|
||||
Containers: []string{controllerContainerName},
|
||||
}
|
||||
|
||||
// Find metrics for the controller container
|
||||
for _, ct := range telemetry {
|
||||
if ct.ContainerName == controllerContainerName {
|
||||
app.MemoryCurrentMB = ct.MemoryCurrentMB
|
||||
app.MemoryAvgMB = ct.MemoryAvgMB
|
||||
app.MemoryPeakMB = ct.MemoryPeakMB
|
||||
app.CPUAvgPercent = ct.CPUAvgPercent
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Find log scan results for the controller container
|
||||
for _, ls := range logs {
|
||||
if ls.ContainerName == controllerContainerName {
|
||||
app.LogErrors = ls.ErrorCount
|
||||
app.LogWarnings = ls.WarnCount
|
||||
issues := ls.RecentIssues
|
||||
if len(issues) > 10 {
|
||||
issues = issues[:10]
|
||||
}
|
||||
app.Issues = issues
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Only include if we have at least metrics or log data
|
||||
if app.MemoryCurrentMB == 0 && app.MemoryAvgMB == 0 && app.LogErrors == 0 && app.LogWarnings == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &app
|
||||
}
|
||||
|
||||
// isStackRunning returns true if the stack has containers actively running
|
||||
// (running, starting, or unhealthy but still up). Stopped, exited, deploying
|
||||
// etc. are excluded to avoid sending zero-value telemetry to the hub.
|
||||
|
||||
Reference in New Issue
Block a user