af1dd14933
Second-pass logging cleanup: consistent [LEVEL] [module] format across all 41 files. Remove stale prefixes ([CF], [SYNC], [SCHED], [API], [STORAGE], [HEALTH], [ROLLBACK]). Remove 5 duplicate log lines. Gate ungated DEBUG lines. Fix wrong log levels (restore start WARN→INFO). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
224 lines
6.3 KiB
Go
224 lines
6.3 KiB
Go
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os/exec"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// ContainerLogSummary holds log analysis results for one container.
|
|
type ContainerLogSummary struct {
|
|
ContainerName string `json:"container_name"`
|
|
ErrorCount int `json:"error_count"`
|
|
WarnCount int `json:"warn_count"`
|
|
RecentIssues []LogIssue `json:"recent_issues,omitempty"`
|
|
}
|
|
|
|
// LogIssue represents a deduplicated log issue.
|
|
type LogIssue struct {
|
|
Severity string `json:"severity"`
|
|
Message string `json:"message"`
|
|
Count int `json:"count"`
|
|
LastSeen time.Time `json:"last_seen"`
|
|
}
|
|
|
|
var (
|
|
// Strip ANSI escape codes (color, bold, etc.)
|
|
reANSI = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
|
// Strip ISO timestamp: 2006-01-02T15:04:05 or 2006/01/02 15:04:05, with optional tz offset
|
|
reTimestamp = regexp.MustCompile(`\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*([+-]\d{2}:?\d{2})?[Z ]?:? ?`)
|
|
// Strip syslog-style timestamp: Jan 2 15:04:05
|
|
reSyslog = regexp.MustCompile(`[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)
|
|
// Replace 6+ digit sequences with <N> (avoids mangling 4-digit HTTP codes/ports)
|
|
reNumbers = regexp.MustCompile(`\b\d{6,}\b`)
|
|
// Replace 8+ char hex strings
|
|
reHex = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`)
|
|
// Replace UUIDs
|
|
reUUID = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
|
|
|
|
errorKeywords = []string{"error", "fatal", "panic", "crit", "oom", "killed", "exception", "traceback"}
|
|
warnKeywords = []string{"warn", "warning"}
|
|
)
|
|
|
|
// ScanContainerLogs scans docker logs for the given containers and returns a summary
|
|
// of errors/warnings found. Containers are scanned sequentially to avoid load spikes.
|
|
// The caller should filter out infrastructure/protected containers before calling this.
|
|
func ScanContainerLogs(containerNames []string, since time.Duration, logger *log.Logger) []ContainerLogSummary {
|
|
if len(containerNames) == 0 {
|
|
return []ContainerLogSummary{}
|
|
}
|
|
|
|
dbg := func(format string, args ...interface{}) {
|
|
if logger != nil {
|
|
logger.Printf("[DEBUG] [metrics] logscanner: "+format, args...)
|
|
}
|
|
}
|
|
|
|
dbg("starting log scan for %d containers (since %s)", len(containerNames), since)
|
|
start := time.Now()
|
|
results := make([]ContainerLogSummary, 0, len(containerNames))
|
|
|
|
for _, name := range containerNames {
|
|
cStart := time.Now()
|
|
summary := scanOneContainer(name, since, logger)
|
|
dbg("scanned %s: errors=%d warnings=%d issues=%d (took %s)",
|
|
name, summary.ErrorCount, summary.WarnCount, len(summary.RecentIssues), time.Since(cStart).Round(time.Millisecond))
|
|
results = append(results, summary)
|
|
}
|
|
|
|
elapsed := time.Since(start)
|
|
dbg("log scan completed: %d containers in %s", len(containerNames), elapsed.Round(time.Millisecond))
|
|
if elapsed > 5*time.Minute && logger != nil {
|
|
logger.Printf("[WARN] [metrics] Log scan took %s (>5min) for %d containers", elapsed.Round(time.Second), len(containerNames))
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
func scanOneContainer(name string, since time.Duration, logger *log.Logger) ContainerLogSummary {
|
|
summary := ContainerLogSummary{ContainerName: name}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
sinceStr := formatSinceDuration(since)
|
|
cmd := exec.CommandContext(ctx, "docker", "logs", "--since="+sinceStr, "--tail=1000", name)
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
if logger != nil {
|
|
logger.Printf("[DEBUG] [metrics] logscanner: docker logs %s: %v", name, err)
|
|
}
|
|
return summary
|
|
}
|
|
|
|
// fingerprint → issue tracking
|
|
type issueEntry struct {
|
|
severity string
|
|
message string
|
|
count int
|
|
lastSeen time.Time
|
|
}
|
|
fingerprints := make(map[string]*issueEntry)
|
|
|
|
lines := strings.Split(string(output), "\n")
|
|
for _, line := range lines {
|
|
if !utf8.Valid([]byte(line)) {
|
|
continue
|
|
}
|
|
if len(line) > 500 {
|
|
line = line[:500]
|
|
}
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
severity := classifyLine(line)
|
|
if severity == "" {
|
|
continue
|
|
}
|
|
|
|
if severity == "error" {
|
|
summary.ErrorCount++
|
|
} else {
|
|
summary.WarnCount++
|
|
}
|
|
|
|
fp := fingerprint(line)
|
|
if e, ok := fingerprints[fp]; ok {
|
|
e.count++
|
|
e.lastSeen = time.Now()
|
|
} else {
|
|
msg := cleanLine(line)
|
|
if len(msg) > 200 {
|
|
msg = msg[:200]
|
|
}
|
|
fingerprints[fp] = &issueEntry{
|
|
severity: severity,
|
|
message: msg,
|
|
count: 1,
|
|
lastSeen: time.Now(),
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert map to slice, sort by count DESC then lastSeen DESC, cap at 10
|
|
issues := make([]LogIssue, 0, len(fingerprints))
|
|
for _, e := range fingerprints {
|
|
issues = append(issues, LogIssue{
|
|
Severity: e.severity,
|
|
Message: e.message,
|
|
Count: e.count,
|
|
LastSeen: e.lastSeen,
|
|
})
|
|
}
|
|
sort.Slice(issues, func(i, j int) bool {
|
|
if issues[i].Count != issues[j].Count {
|
|
return issues[i].Count > issues[j].Count
|
|
}
|
|
return issues[i].LastSeen.After(issues[j].LastSeen)
|
|
})
|
|
if len(issues) > 10 {
|
|
issues = issues[:10]
|
|
}
|
|
summary.RecentIssues = issues
|
|
|
|
return summary
|
|
}
|
|
|
|
// cleanLine strips ANSI escape codes and timestamps from a log line.
|
|
func cleanLine(line string) string {
|
|
s := reANSI.ReplaceAllString(line, "")
|
|
s = reTimestamp.ReplaceAllString(s, "")
|
|
s = reSyslog.ReplaceAllString(s, "")
|
|
return strings.TrimSpace(s)
|
|
}
|
|
|
|
// classifyLine returns "error", "warn", or "" based on first 5 words of the line.
|
|
func classifyLine(line string) string {
|
|
cleaned := reANSI.ReplaceAllString(line, "")
|
|
lower := strings.ToLower(cleaned)
|
|
words := strings.Fields(lower)
|
|
if len(words) > 5 {
|
|
words = words[:5]
|
|
}
|
|
prefix := strings.Join(words, " ")
|
|
|
|
for _, kw := range errorKeywords {
|
|
if strings.Contains(prefix, kw) {
|
|
return "error"
|
|
}
|
|
}
|
|
for _, kw := range warnKeywords {
|
|
if strings.Contains(prefix, kw) {
|
|
return "warn"
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// fingerprint produces a deduplication key for a log line.
|
|
func fingerprint(line string) string {
|
|
s := cleanLine(line)
|
|
// Replace UUIDs before hex to avoid partial matches
|
|
s = reUUID.ReplaceAllString(s, "<UUID>")
|
|
s = reHex.ReplaceAllString(s, "<HEX>")
|
|
s = reNumbers.ReplaceAllString(s, "<N>")
|
|
s = strings.TrimSpace(s)
|
|
return strings.ToLower(s)
|
|
}
|
|
|
|
// formatSinceDuration converts a duration to docker logs --since format (e.g. "15m").
|
|
func formatSinceDuration(d time.Duration) string {
|
|
minutes := int(d.Minutes())
|
|
if minutes <= 0 {
|
|
minutes = 15
|
|
}
|
|
return fmt.Sprintf("%dm", minutes)
|
|
}
|