feat(telemetry): add per-app metrics and log telemetry to hub reports (v0.28.0)
- New internal/metrics/telemetry.go: MetricsStore.GetContainerTelemetry() aggregates container memory/CPU from SQLite over the last 15 min
- New internal/metrics/logscanner.go: ScanContainerLogs() scans docker logs for errors/warnings and deduplicates them via fingerprinting (strips timestamps; replaces 6+ digit numbers, hex strings, and UUIDs)
- New internal/report/telemetry.go: buildAppTelemetrySection() assembles per-stack AppTelemetry by aggregating container metrics and log summaries
- internal/report/types.go: added the AppTelemetry field to the Report struct, plus the AppTelemetry type with memory/CPU/log fields and LogIssue references
- internal/report/builder.go: calls buildAppTelemetrySection() in BuildReport()
- Backward-compatible: old Hub versions silently ignore the app_telemetry field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -150,9 +150,12 @@ func BuildReport(
|
||||
// Stacks
|
||||
r.Stacks = buildStacksReport(stackMgr)
|
||||
|
||||
// App telemetry (metrics + log scan)
|
||||
r.AppTelemetry = buildAppTelemetrySection(stackMgr, metricsStore, logger)
|
||||
|
||||
if debug && logger != nil {
|
||||
logger.Printf("[DEBUG] BuildReport: complete — containers=%d, health=%s, deployed=%d, available=%d",
|
||||
r.Containers.Total, r.Health.Status, len(r.Stacks.Deployed), len(r.Stacks.Available))
|
||||
logger.Printf("[DEBUG] BuildReport: complete — containers=%d, health=%s, deployed=%d, available=%d, app_telemetry=%d",
|
||||
r.Containers.Total, r.Health.Status, len(r.Stacks.Deployed), len(r.Stacks.Available), len(r.AppTelemetry))
|
||||
}
|
||||
|
||||
return r
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
package report
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/stacks"
|
||||
)
|
||||
|
||||
// buildAppTelemetrySection collects metrics telemetry and log scans for all
|
||||
// non-protected, deployed stacks and returns per-app telemetry data.
|
||||
func buildAppTelemetrySection(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry {
|
||||
allStacks := stackMgr.GetStacks()
|
||||
|
||||
// 1. Get metrics telemetry (last 15 minutes)
|
||||
var telemetry []metrics.ContainerTelemetry
|
||||
if metricsStore != nil {
|
||||
var err error
|
||||
telemetry, err = metricsStore.GetContainerTelemetry(time.Now().Add(-15 * time.Minute))
|
||||
if err != nil && logger != nil {
|
||||
logger.Printf("[WARN] Telemetry metrics query failed: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Collect non-protected container names for log scan
|
||||
var containerNames []string
|
||||
for _, s := range allStacks {
|
||||
if s.Protected || !s.Deployed {
|
||||
continue
|
||||
}
|
||||
for _, c := range s.Containers {
|
||||
containerNames = append(containerNames, c.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Scan logs
|
||||
logs := metrics.ScanContainerLogs(containerNames, 15*time.Minute, logger)
|
||||
|
||||
// 4. Build per-app telemetry
|
||||
return buildAppTelemetry(allStacks, telemetry, logs)
|
||||
}
|
||||
|
||||
// buildAppTelemetry aggregates container-level telemetry and log data into per-stack AppTelemetry entries.
|
||||
func buildAppTelemetry(allStacks []stacks.Stack, telemetry []metrics.ContainerTelemetry, logs []metrics.ContainerLogSummary) []AppTelemetry {
|
||||
// Build lookup maps
|
||||
telemetryMap := make(map[string]metrics.ContainerTelemetry, len(telemetry))
|
||||
for _, ct := range telemetry {
|
||||
telemetryMap[ct.ContainerName] = ct
|
||||
}
|
||||
|
||||
logMap := make(map[string]metrics.ContainerLogSummary, len(logs))
|
||||
for _, ls := range logs {
|
||||
logMap[ls.ContainerName] = ls
|
||||
}
|
||||
|
||||
var result []AppTelemetry
|
||||
|
||||
for _, s := range allStacks {
|
||||
if s.Protected || !s.Deployed {
|
||||
continue
|
||||
}
|
||||
|
||||
app := AppTelemetry{
|
||||
AppName: s.Name,
|
||||
DisplayName: s.Meta.DisplayName,
|
||||
CatalogEstimate: s.Meta.Resources.MemRequest,
|
||||
CatalogLimit: s.Meta.Resources.MemLimit,
|
||||
}
|
||||
|
||||
var mergedIssues []metrics.LogIssue
|
||||
|
||||
for _, c := range s.Containers {
|
||||
app.Containers = append(app.Containers, c.Name)
|
||||
|
||||
if ct, ok := telemetryMap[c.Name]; ok {
|
||||
app.MemoryCurrentMB += ct.MemoryCurrentMB
|
||||
app.MemoryAvgMB += ct.MemoryAvgMB
|
||||
app.MemoryPeakMB += ct.MemoryPeakMB
|
||||
app.CPUAvgPercent += ct.CPUAvgPercent
|
||||
}
|
||||
|
||||
if ls, ok := logMap[c.Name]; ok {
|
||||
app.LogErrors += ls.ErrorCount
|
||||
app.LogWarnings += ls.WarnCount
|
||||
mergedIssues = append(mergedIssues, ls.RecentIssues...)
|
||||
}
|
||||
}
|
||||
|
||||
if app.Containers == nil {
|
||||
app.Containers = []string{}
|
||||
}
|
||||
|
||||
// Sort and cap merged issues at 10
|
||||
sort.Slice(mergedIssues, func(i, j int) bool {
|
||||
if mergedIssues[i].Count != mergedIssues[j].Count {
|
||||
return mergedIssues[i].Count > mergedIssues[j].Count
|
||||
}
|
||||
return mergedIssues[i].LastSeen.After(mergedIssues[j].LastSeen)
|
||||
})
|
||||
if len(mergedIssues) > 10 {
|
||||
mergedIssues = mergedIssues[:10]
|
||||
}
|
||||
app.Issues = mergedIssues
|
||||
|
||||
result = append(result, app)
|
||||
}
|
||||
|
||||
// Sort by app name
|
||||
sort.Slice(result, func(i, j int) bool {
|
||||
return result[i].AppName < result[j].AppName
|
||||
})
|
||||
|
||||
if result == nil {
|
||||
result = []AppTelemetry{}
|
||||
}
|
||||
return result
|
||||
}
|
||||
@@ -1,6 +1,10 @@
|
||||
package report
|
||||
|
||||
import "time"
|
||||
import (
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/metrics"
|
||||
)
|
||||
|
||||
// Report is the JSON payload pushed to the central hub.
|
||||
type Report struct {
|
||||
@@ -18,6 +22,7 @@ type Report struct {
|
||||
Backup BackupReport `json:"backup"`
|
||||
Health HealthReport `json:"health"`
|
||||
Stacks StacksReport `json:"stacks"`
|
||||
AppTelemetry []AppTelemetry `json:"app_telemetry,omitempty"`
|
||||
}
|
||||
|
||||
// SystemReport holds host-level system info.
|
||||
@@ -91,3 +96,19 @@ type StacksReport struct {
|
||||
Deployed []string `json:"deployed"`
|
||||
Available []string `json:"available"`
|
||||
}
|
||||
|
||||
// AppTelemetry holds per-app (per-stack) resource and log telemetry.
|
||||
type AppTelemetry struct {
|
||||
AppName string `json:"app_name"`
|
||||
DisplayName string `json:"display_name"`
|
||||
Containers []string `json:"containers"`
|
||||
MemoryCurrentMB float64 `json:"memory_current_mb"`
|
||||
MemoryAvgMB float64 `json:"memory_avg_mb"`
|
||||
MemoryPeakMB float64 `json:"memory_peak_mb"`
|
||||
CPUAvgPercent float64 `json:"cpu_avg_percent"`
|
||||
CatalogEstimate string `json:"catalog_estimate"`
|
||||
CatalogLimit string `json:"catalog_limit"`
|
||||
LogErrors int `json:"log_errors"`
|
||||
LogWarnings int `json:"log_warnings"`
|
||||
Issues []metrics.LogIssue `json:"issues,omitempty"`
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user