telemetry: fix log deduplication — strip ANSI codes, tz offsets, mid-line timestamps (v0.30.6)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 16:01:53 +01:00
parent 17db33e419
commit 19f2c908fc
3 changed files with 28 additions and 12 deletions
+17 -11
View File
@@ -29,10 +29,12 @@ type LogIssue struct {
}
var (
// Strip leading ISO timestamp: 2006-01-02T15:04:05 or 2006/01/02 15:04:05 etc.
reTimestamp = regexp.MustCompile(`^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*[Z ]?`)
// Strip ANSI escape codes (color, bold, etc.)
reANSI = regexp.MustCompile(`\x1b\[[0-9;]*m`)
// Strip ISO timestamp: 2006-01-02T15:04:05 or 2006/01/02 15:04:05, with optional tz offset
reTimestamp = regexp.MustCompile(`\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*([+-]\d{2}:?\d{2})?[Z ]?:? ?`)
// Strip syslog-style timestamp: Jan 2 15:04:05
reSyslog = regexp.MustCompile(`^[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)
reSyslog = regexp.MustCompile(`[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)
// Replace 6+ digit sequences with <N> (leaves short numbers such as HTTP status codes and ports intact)
reNumbers = regexp.MustCompile(`\b\d{6,}\b`)
// Replace 8+ char hex strings
@@ -121,10 +123,7 @@ func scanOneContainer(name string, since time.Duration, logger *log.Logger) Cont
e.count++
e.lastSeen = time.Now()
} else {
// Use original line trimmed as message (strip timestamp)
msg := reTimestamp.ReplaceAllString(line, "")
msg = reSyslog.ReplaceAllString(msg, "")
msg = strings.TrimSpace(msg)
msg := cleanLine(line)
if len(msg) > 200 {
msg = msg[:200]
}
@@ -161,9 +160,18 @@ func scanOneContainer(name string, since time.Duration, logger *log.Logger) Cont
return summary
}
// cleanLine normalizes a raw log line for display and deduplication.
//
// It removes, in order, ANSI escape sequences (reANSI), ISO-style timestamps
// (reTimestamp) and syslog-style timestamps (reSyslog), then trims leading
// and trailing whitespace from what remains.
func cleanLine(line string) string {
	withoutANSI := reANSI.ReplaceAllString(line, "")
	withoutISO := reTimestamp.ReplaceAllString(withoutANSI, "")
	withoutSyslog := reSyslog.ReplaceAllString(withoutISO, "")
	return strings.TrimSpace(withoutSyslog)
}
// classifyLine returns "error", "warn", or "" based on first 5 words of the line.
func classifyLine(line string) string {
lower := strings.ToLower(line)
cleaned := reANSI.ReplaceAllString(line, "")
lower := strings.ToLower(cleaned)
words := strings.Fields(lower)
if len(words) > 5 {
words = words[:5]
@@ -185,9 +193,7 @@ func classifyLine(line string) string {
// fingerprint produces a deduplication key for a log line.
func fingerprint(line string) string {
// Strip ANSI escape codes and timestamps (leading or mid-line)
s := reTimestamp.ReplaceAllString(line, "")
s = reSyslog.ReplaceAllString(s, "")
s := cleanLine(line)
// Replace UUIDs before hex to avoid partial matches
s = reUUID.ReplaceAllString(s, "<UUID>")
s = reHex.ReplaceAllString(s, "<HEX>")