feat(telemetry): add per-app metrics and log telemetry to hub reports (v0.28.0)
- New internal/metrics/telemetry.go: MetricsStore.GetContainerTelemetry() aggregates container memory/CPU from SQLite over the last 15 min
- New internal/metrics/logscanner.go: ScanContainerLogs() scans docker logs for errors/warnings, deduplicates via fingerprinting (strips timestamps, replaces 6+ digit numbers, hex strings, UUIDs)
- New internal/report/telemetry.go: buildAppTelemetrySection() assembles per-stack AppTelemetry by aggregating container metrics and log summaries
- internal/report/types.go: added AppTelemetry field to Report struct plus AppTelemetry type with memory/CPU/log fields and LogIssue references
- internal/report/builder.go: calls buildAppTelemetrySection() in BuildReport()
- Backward-compatible: old Hub versions silently ignore app_telemetry field

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,206 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// ContainerLogSummary holds log analysis results for one container.
|
||||
type ContainerLogSummary struct {
|
||||
ContainerName string `json:"container_name"`
|
||||
ErrorCount int `json:"error_count"`
|
||||
WarnCount int `json:"warn_count"`
|
||||
RecentIssues []LogIssue `json:"recent_issues,omitempty"`
|
||||
}
|
||||
|
||||
// LogIssue is one group of log lines that share the same deduplication
// fingerprint.
type LogIssue struct {
	// Severity is the classification of the grouped lines ("error" or "warn").
	Severity string `json:"severity"`

	// Message is a representative line for the group with its leading
	// timestamp removed.
	Message string `json:"message"`

	// Count is how many scanned lines fell into this group.
	Count int `json:"count"`

	// LastSeen is the time the most recent line of the group was recorded.
	LastSeen time.Time `json:"last_seen"`
}
|
||||
|
||||
// Patterns used to normalise log lines before they are compared.
var (
	// reTimestamp matches a leading date-time such as "2006-01-02T15:04:05"
	// or "2006/01/02 15:04:05", including optional fractional digits and a
	// single trailing "Z" or space.
	reTimestamp = regexp.MustCompile(`^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*[Z ]?`)

	// reSyslog matches a leading syslog-style stamp such as "Jan 2 15:04:05 ".
	reSyslog = regexp.MustCompile(`^[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)

	// reNumbers matches standalone runs of six or more digits, so shorter
	// numbers (status codes, ports) are left untouched.
	reNumbers = regexp.MustCompile(`\b\d{6,}\b`)

	// reHex matches standalone hexadecimal strings of eight or more characters.
	reHex = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`)

	// reUUID matches UUIDs in the canonical 8-4-4-4-12 hexadecimal layout.
	reUUID = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
)

// Lower-case keywords that mark a log line as an error or a warning.
var (
	errorKeywords = []string{"error", "fatal", "panic", "crit", "oom", "killed", "exception", "traceback"}

	warnKeywords = []string{"warn", "warning"}
)
|
||||
|
||||
// ScanContainerLogs scans docker logs for the given containers and returns a summary
|
||||
// of errors/warnings found. Containers are scanned sequentially to avoid load spikes.
|
||||
// The caller should filter out infrastructure/protected containers before calling this.
|
||||
func ScanContainerLogs(containerNames []string, since time.Duration, logger *log.Logger) []ContainerLogSummary {
|
||||
if len(containerNames) == 0 {
|
||||
return []ContainerLogSummary{}
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
results := make([]ContainerLogSummary, 0, len(containerNames))
|
||||
|
||||
for _, name := range containerNames {
|
||||
summary := scanOneContainer(name, since, logger)
|
||||
results = append(results, summary)
|
||||
}
|
||||
|
||||
elapsed := time.Since(start)
|
||||
if elapsed > 5*time.Minute && logger != nil {
|
||||
logger.Printf("[WARN] Log scan took %s (>5min) for %d containers", elapsed.Round(time.Second), len(containerNames))
|
||||
}
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
func scanOneContainer(name string, since time.Duration, logger *log.Logger) ContainerLogSummary {
|
||||
summary := ContainerLogSummary{ContainerName: name}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
sinceStr := formatSinceDuration(since)
|
||||
cmd := exec.CommandContext(ctx, "docker", "logs", "--since="+sinceStr, "--tail=1000", name)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
if logger != nil {
|
||||
logger.Printf("[DEBUG] logscanner: docker logs %s: %v", name, err)
|
||||
}
|
||||
return summary
|
||||
}
|
||||
|
||||
// fingerprint → issue tracking
|
||||
type issueEntry struct {
|
||||
severity string
|
||||
message string
|
||||
count int
|
||||
lastSeen time.Time
|
||||
}
|
||||
fingerprints := make(map[string]*issueEntry)
|
||||
|
||||
lines := strings.Split(string(output), "\n")
|
||||
for _, line := range lines {
|
||||
if !utf8.Valid([]byte(line)) {
|
||||
continue
|
||||
}
|
||||
if len(line) > 500 {
|
||||
line = line[:500]
|
||||
}
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
severity := classifyLine(line)
|
||||
if severity == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if severity == "error" {
|
||||
summary.ErrorCount++
|
||||
} else {
|
||||
summary.WarnCount++
|
||||
}
|
||||
|
||||
fp := fingerprint(line)
|
||||
if e, ok := fingerprints[fp]; ok {
|
||||
e.count++
|
||||
e.lastSeen = time.Now()
|
||||
} else {
|
||||
// Use original line trimmed as message (strip timestamp)
|
||||
msg := reTimestamp.ReplaceAllString(line, "")
|
||||
msg = reSyslog.ReplaceAllString(msg, "")
|
||||
msg = strings.TrimSpace(msg)
|
||||
if len(msg) > 200 {
|
||||
msg = msg[:200]
|
||||
}
|
||||
fingerprints[fp] = &issueEntry{
|
||||
severity: severity,
|
||||
message: msg,
|
||||
count: 1,
|
||||
lastSeen: time.Now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert map to slice, sort by count DESC then lastSeen DESC, cap at 10
|
||||
issues := make([]LogIssue, 0, len(fingerprints))
|
||||
for _, e := range fingerprints {
|
||||
issues = append(issues, LogIssue{
|
||||
Severity: e.severity,
|
||||
Message: e.message,
|
||||
Count: e.count,
|
||||
LastSeen: e.lastSeen,
|
||||
})
|
||||
}
|
||||
sort.Slice(issues, func(i, j int) bool {
|
||||
if issues[i].Count != issues[j].Count {
|
||||
return issues[i].Count > issues[j].Count
|
||||
}
|
||||
return issues[i].LastSeen.After(issues[j].LastSeen)
|
||||
})
|
||||
if len(issues) > 10 {
|
||||
issues = issues[:10]
|
||||
}
|
||||
summary.RecentIssues = issues
|
||||
|
||||
return summary
|
||||
}
|
||||
|
||||
// classifyLine returns "error", "warn", or "" based on first 5 words of the line.
|
||||
func classifyLine(line string) string {
|
||||
lower := strings.ToLower(line)
|
||||
words := strings.Fields(lower)
|
||||
if len(words) > 5 {
|
||||
words = words[:5]
|
||||
}
|
||||
prefix := strings.Join(words, " ")
|
||||
|
||||
for _, kw := range errorKeywords {
|
||||
if strings.Contains(prefix, kw) {
|
||||
return "error"
|
||||
}
|
||||
}
|
||||
for _, kw := range warnKeywords {
|
||||
if strings.Contains(prefix, kw) {
|
||||
return "warn"
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// fingerprint produces a deduplication key for a log line.
|
||||
func fingerprint(line string) string {
|
||||
// Strip leading timestamp
|
||||
s := reTimestamp.ReplaceAllString(line, "")
|
||||
s = reSyslog.ReplaceAllString(s, "")
|
||||
// Replace UUIDs before hex to avoid partial matches
|
||||
s = reUUID.ReplaceAllString(s, "<UUID>")
|
||||
s = reHex.ReplaceAllString(s, "<HEX>")
|
||||
s = reNumbers.ReplaceAllString(s, "<N>")
|
||||
s = strings.TrimSpace(s)
|
||||
return strings.ToLower(s)
|
||||
}
|
||||
|
||||
// formatSinceDuration converts a duration to the relative form accepted by
// `docker logs --since` (e.g. "15m").
//
// A zero or negative duration falls back to the 15 minute default. A whole
// number of minutes is rendered in minutes. Any other positive duration is
// rendered in whole seconds, rounded up, so a sub-minute window such as 30s
// is honoured instead of being widened to the default and a value such as
// 90s is not shortened to 1m.
func formatSinceDuration(d time.Duration) string {
	if d <= 0 {
		return "15m"
	}
	if d%time.Minute == 0 {
		return fmt.Sprintf("%dm", int(d/time.Minute))
	}
	// Round up so the requested window is never narrowed.
	seconds := int((d + time.Second - 1) / time.Second)
	return fmt.Sprintf("%ds", seconds)
}
|
||||
Reference in New Issue
Block a user