feat(telemetry): add per-app metrics and log telemetry to hub reports (v0.28.0)

- New internal/metrics/telemetry.go: MetricsStore.GetContainerTelemetry()
  aggregates container memory/CPU from SQLite over the last 15 min
- New internal/metrics/logscanner.go: ScanContainerLogs() scans docker logs
  for errors/warnings, deduplicates via fingerprinting (strips timestamps,
  replaces 6+ digit numbers, hex strings, UUIDs)
- New internal/report/telemetry.go: buildAppTelemetrySection() assembles
  per-stack AppTelemetry by aggregating container metrics and log summaries
- internal/report/types.go: added AppTelemetry field to Report struct plus
  AppTelemetry type with memory/CPU/log fields and LogIssue references
- internal/report/builder.go: calls buildAppTelemetrySection() in BuildReport()
- Backward-compatible: old Hub versions silently ignore app_telemetry field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 10:46:27 +01:00
parent 981c473d57
commit 05ecd65412
7 changed files with 449 additions and 4 deletions
+206
View File
@@ -0,0 +1,206 @@
package metrics
import (
"context"
"fmt"
"log"
"os/exec"
"regexp"
"sort"
"strings"
"time"
"unicode/utf8"
)
// ContainerLogSummary holds the log analysis results for one container, as
// produced by a single scan of its recent docker log output.
type ContainerLogSummary struct {
// ContainerName is the container name that was passed to the scanner.
ContainerName string `json:"container_name"`
// ErrorCount is the number of scanned lines classified as "error".
ErrorCount int `json:"error_count"`
// WarnCount is the number of scanned lines classified as "warn".
WarnCount int `json:"warn_count"`
// RecentIssues lists the deduplicated issues found, most frequent first,
// capped at 10 entries. It is omitted from JSON when empty.
RecentIssues []LogIssue `json:"recent_issues,omitempty"`
}
// LogIssue represents one deduplicated log issue: all scanned lines that share
// the same fingerprint are folded into a single LogIssue.
type LogIssue struct {
// Severity is "error" or "warn", as decided by classifyLine.
Severity string `json:"severity"`
// Message is the text of the first line seen for this issue, with any
// leading timestamp removed, trimmed and limited to 200 bytes.
Message string `json:"message"`
// Count is how many scanned lines mapped to this issue.
Count int `json:"count"`
// LastSeen is the wall-clock time at which the scanner processed the most
// recent matching line (time.Now() during the scan), not a timestamp
// parsed from the log line itself.
LastSeen time.Time `json:"last_seen"`
}
// Patterns used to normalise log lines before they are compared or reported.
var (
	// reTimestamp matches a leading date-time such as "2006-01-02T15:04:05" or
	// "2006/01/02 15:04:05", including optional fractional seconds and one
	// trailing "Z" or space.
	reTimestamp = regexp.MustCompile(`^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*[Z ]?`)

	// reSyslog matches a leading syslog-style stamp such as "Jan 2 15:04:05 ".
	reSyslog = regexp.MustCompile(`^[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)

	// reNumbers matches standalone runs of six or more digits. Shorter numbers
	// (status codes, ports, small counters) are deliberately left alone.
	reNumbers = regexp.MustCompile(`\b\d{6,}\b`)

	// reHex matches standalone runs of eight or more hexadecimal characters.
	// Because digits are valid hex, this also matches digit runs of 8+.
	reHex = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`)

	// reUUID matches UUIDs in the canonical 8-4-4-4-12 hexadecimal layout.
	reUUID = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
)

// Lower-case keywords that decide the severity of a log line.
var (
	// errorKeywords mark a line as an error.
	errorKeywords = []string{"error", "fatal", "panic", "crit", "oom", "killed", "exception", "traceback"}

	// warnKeywords mark a line as a warning.
	warnKeywords = []string{"warn", "warning"}
)
// ScanContainerLogs inspects the recent docker logs of every container in
// containerNames and returns one ContainerLogSummary per name, in input order.
//
// Containers are processed one after another rather than in parallel so the
// scan does not cause a load spike. Callers are expected to have removed
// infrastructure/protected containers from the list beforehand.
//
// since is the look-back window handed to each per-container scan. logger may
// be nil, in which case nothing is logged. An empty input yields an empty,
// non-nil slice.
func ScanContainerLogs(containerNames []string, since time.Duration, logger *log.Logger) []ContainerLogSummary {
	total := len(containerNames)
	if total == 0 {
		return []ContainerLogSummary{}
	}

	began := time.Now()
	summaries := make([]ContainerLogSummary, total)
	for i := range containerNames {
		summaries[i] = scanOneContainer(containerNames[i], since, logger)
	}

	// A full pass taking more than five minutes is worth flagging.
	if took := time.Since(began); logger != nil && took > 5*time.Minute {
		logger.Printf("[WARN] Log scan took %s (>5min) for %d containers", took.Round(time.Second), total)
	}
	return summaries
}
// scanOneContainer fetches the recent log output of a single container with
// `docker logs` and condenses it into a ContainerLogSummary.
//
// The docker invocation is bounded by a 10 second timeout and limited to the
// last 1000 lines inside the since window; stdout and stderr are captured
// together. If the command fails (a timeout included), the error is logged at
// debug level when logger is non-nil and a summary carrying only the container
// name is returned.
func scanOneContainer(name string, since time.Duration, logger *log.Logger) ContainerLogSummary {
	summary := ContainerLogSummary{ContainerName: name}

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	sinceStr := formatSinceDuration(since)
	cmd := exec.CommandContext(ctx, "docker", "logs", "--since="+sinceStr, "--tail=1000", name)
	output, err := cmd.CombinedOutput()
	if err != nil {
		if logger != nil {
			logger.Printf("[DEBUG] logscanner: docker logs %s: %v", name, err)
		}
		return summary
	}

	summarizeLogOutput(&summary, string(output))
	return summary
}

// summarizeLogOutput classifies each line of output, updates the error and
// warning counters on summary, and stores the most frequent deduplicated
// issues (at most 10) in summary.RecentIssues.
func summarizeLogOutput(summary *ContainerLogSummary, output string) {
	const (
		maxLineBytes    = 500 // longer lines are cut before classification
		maxMessageBytes = 200 // limit for the message kept per issue
		maxIssues       = 10  // limit for RecentIssues
	)

	// issueEntry tracks one fingerprint. lastLine is the index of the most
	// recent line that produced it and is used as a sort tie-breaker.
	type issueEntry struct {
		issue    LogIssue
		lastLine int
	}
	seen := make(map[string]*issueEntry)

	for lineNo, line := range strings.Split(output, "\n") {
		// Skip blanks and lines that are not valid UTF-8 (binary noise).
		if line == "" || !utf8.ValidString(line) {
			continue
		}
		// Cut on a rune boundary so the shortened line is still valid UTF-8.
		line = truncateAtRuneBoundary(line, maxLineBytes)

		severity := classifyLine(line)
		if severity == "" {
			continue
		}
		if severity == "error" {
			summary.ErrorCount++
		} else {
			summary.WarnCount++
		}

		fp := fingerprint(line)
		if e, ok := seen[fp]; ok {
			e.issue.Count++
			e.issue.LastSeen = time.Now()
			e.lastLine = lineNo
			continue
		}

		// First occurrence: keep the original text, minus any leading
		// timestamp, as the human-readable message.
		msg := reTimestamp.ReplaceAllString(line, "")
		msg = reSyslog.ReplaceAllString(msg, "")
		msg = truncateAtRuneBoundary(strings.TrimSpace(msg), maxMessageBytes)
		seen[fp] = &issueEntry{
			issue: LogIssue{
				Severity: severity,
				Message:  msg,
				Count:    1,
				LastSeen: time.Now(),
			},
			lastLine: lineNo,
		}
	}

	entries := make([]*issueEntry, 0, len(seen))
	for _, e := range seen {
		entries = append(entries, e)
	}
	// Most frequent first; among equal counts, the issue seen latest in the
	// log first. Line indexes are unique per fingerprint, so the order does
	// not depend on map iteration order or clock resolution.
	sort.Slice(entries, func(i, j int) bool {
		if entries[i].issue.Count != entries[j].issue.Count {
			return entries[i].issue.Count > entries[j].issue.Count
		}
		return entries[i].lastLine > entries[j].lastLine
	})
	if len(entries) > maxIssues {
		entries = entries[:maxIssues]
	}

	issues := make([]LogIssue, 0, len(entries))
	for _, e := range entries {
		issues = append(issues, e.issue)
	}
	summary.RecentIssues = issues
}

// truncateAtRuneBoundary shortens s to at most limit bytes without cutting a
// multi-byte UTF-8 sequence in half. Strings already within the limit are
// returned unchanged.
func truncateAtRuneBoundary(s string, limit int) string {
	if len(s) <= limit {
		return s
	}
	if limit <= 0 {
		return ""
	}
	// Step back until the cut position is the first byte of a rune.
	for limit > 0 && !utf8.RuneStart(s[limit]) {
		limit--
	}
	return s[:limit]
}
// classifyLine decides the severity of a log line by looking only at its first
// five whitespace-separated words, compared case-insensitively.
//
// It returns "error" if any entry of errorKeywords occurs in that prefix,
// otherwise "warn" if any entry of warnKeywords occurs, otherwise "". Matching
// is by substring, so a keyword embedded in a longer word also counts.
func classifyLine(line string) string {
	const maxWords = 5

	head := strings.Fields(strings.ToLower(line))
	if len(head) > maxWords {
		head = head[:maxWords]
	}
	probe := strings.Join(head, " ")

	// mentionsAny reports whether probe contains at least one of keywords.
	mentionsAny := func(keywords []string) bool {
		for i := range keywords {
			if strings.Contains(probe, keywords[i]) {
				return true
			}
		}
		return false
	}

	// Errors take precedence over warnings.
	switch {
	case mentionsAny(errorKeywords):
		return "error"
	case mentionsAny(warnKeywords):
		return "warn"
	default:
		return ""
	}
}
// fingerprint reduces a log line to a key under which equivalent messages
// collapse together: leading timestamps are dropped, volatile tokens (UUIDs,
// long hex strings, long numbers) are replaced by placeholders, and the result
// is trimmed and lower-cased.
func fingerprint(line string) string {
	// Applied strictly in this order. UUIDs go before hex so a UUID is not
	// broken up by the hex pattern. Hex goes before numbers, so digit runs of
	// eight or more end up as <HEX> rather than <N>.
	rewrites := []struct {
		pattern     *regexp.Regexp
		placeholder string
	}{
		{reTimestamp, ""},
		{reSyslog, ""},
		{reUUID, "<UUID>"},
		{reHex, "<HEX>"},
		{reNumbers, "<N>"},
	}

	key := line
	for _, rw := range rewrites {
		key = rw.pattern.ReplaceAllString(key, rw.placeholder)
	}
	return strings.ToLower(strings.TrimSpace(key))
}
// formatSinceDuration renders d as a whole number of minutes in the form
// accepted by `docker logs --since`, e.g. "15m". Fractions of a minute are
// discarded; a duration shorter than one minute (zero and negative values
// included) falls back to 15 minutes.
func formatSinceDuration(d time.Duration) string {
	const fallbackMinutes = 15

	whole := int(d.Minutes())
	if whole < 1 {
		whole = fallbackMinutes
	}
	return fmt.Sprintf("%dm", whole)
}
+66
View File
@@ -0,0 +1,66 @@
package metrics
import (
"time"
)
// ContainerTelemetry holds aggregated resource stats for one container, as
// computed by MetricsStore.GetContainerTelemetry over a time window.
type ContainerTelemetry struct {
// ContainerName is the container_name the samples were grouped by.
ContainerName string `json:"container_name"`
// MemoryCurrentMB is the MemUsageMB value reported for this container by
// QueryContainerSummary; it stays 0 when that lookup fails or has no entry.
MemoryCurrentMB float64 `json:"memory_current_mb"`
// MemoryAvgMB is AVG(mem_usage_mb) over the window.
MemoryAvgMB float64 `json:"memory_avg_mb"`
// MemoryPeakMB is MAX(mem_usage_mb) over the window.
MemoryPeakMB float64 `json:"memory_peak_mb"`
// CPUAvgPercent is AVG(cpu_percent) over the window.
CPUAvgPercent float64 `json:"cpu_avg_percent"`
// SampleCount is the number of rows that contributed to the aggregates.
SampleCount int `json:"sample_count"`
}
// GetContainerTelemetry queries the metrics DB for per-container resource
// summaries built from container_metrics rows whose ts is greater than since
// (compared as Unix seconds).
//
// For each container it reports average and peak memory, average CPU and the
// number of samples. MemoryCurrentMB is filled in on a best-effort basis from
// QueryContainerSummary; if that lookup fails the field is left at zero.
//
// It returns an empty, non-nil slice when there is no data. An error is
// returned if the query fails, if a result row cannot be decoded, or if row
// iteration fails.
func (s *MetricsStore) GetContainerTelemetry(since time.Time) ([]ContainerTelemetry, error) {
	sinceUnix := since.Unix()
	rows, err := s.db.Query(`
SELECT container_name,
AVG(mem_usage_mb),
MAX(mem_usage_mb),
AVG(cpu_percent),
COUNT(*)
FROM container_metrics
WHERE ts > ?
GROUP BY container_name`, sinceUnix)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var results []ContainerTelemetry
	for rows.Next() {
		var ct ContainerTelemetry
		if err := rows.Scan(&ct.ContainerName, &ct.MemoryAvgMB, &ct.MemoryPeakMB,
			&ct.CPUAvgPercent, &ct.SampleCount); err != nil {
			// Surface decode failures like every other DB error here instead
			// of silently dropping the container from the result.
			return nil, err
		}
		results = append(results, ct)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	// Get current (most recent) memory per container. This is optional
	// enrichment, so a failure here does not fail the whole call.
	if stats, err := s.QueryContainerSummary(); err == nil {
		currentMap := make(map[string]float64, len(stats))
		for _, st := range stats {
			currentMap[st.ContainerName] = st.MemUsageMB
		}
		for i := range results {
			if cur, ok := currentMap[results[i].ContainerName]; ok {
				results[i].MemoryCurrentMB = cur
			}
		}
	}

	// Hand back an empty slice rather than nil when nothing matched.
	if results == nil {
		results = []ContainerTelemetry{}
	}
	return results, nil
}