95c821deb2
Add detailed [DEBUG] logging to every controller module when logging.level is set to "debug". Each module with stateful debug uses SetDebug(bool) wired from main.go. Covers stacks, backup, cloudflare, integrations, system, monitor, settings, scheduler, web handlers, storage, metrics, API, selfupdate, and assets. Also includes the app export/import (.fab bundles) feature from v0.32.0 and its debug page integration. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
224 lines
6.2 KiB
Go
224 lines
6.2 KiB
Go
package metrics
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os/exec"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// ContainerLogSummary holds log analysis results for one container.
|
|
type ContainerLogSummary struct {
|
|
ContainerName string `json:"container_name"`
|
|
ErrorCount int `json:"error_count"`
|
|
WarnCount int `json:"warn_count"`
|
|
RecentIssues []LogIssue `json:"recent_issues,omitempty"`
|
|
}
|
|
|
|
// LogIssue represents a deduplicated log issue.
|
|
type LogIssue struct {
|
|
Severity string `json:"severity"`
|
|
Message string `json:"message"`
|
|
Count int `json:"count"`
|
|
LastSeen time.Time `json:"last_seen"`
|
|
}
|
|
|
|
var (
|
|
// Strip ANSI escape codes (color, bold, etc.)
|
|
reANSI = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
|
// Strip ISO timestamp: 2006-01-02T15:04:05 or 2006/01/02 15:04:05, with optional tz offset
|
|
reTimestamp = regexp.MustCompile(`\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*([+-]\d{2}:?\d{2})?[Z ]?:? ?`)
|
|
// Strip syslog-style timestamp: Jan 2 15:04:05
|
|
reSyslog = regexp.MustCompile(`[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)
|
|
// Replace 6+ digit sequences with <N> (avoids mangling 4-digit HTTP codes/ports)
|
|
reNumbers = regexp.MustCompile(`\b\d{6,}\b`)
|
|
// Replace 8+ char hex strings
|
|
reHex = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`)
|
|
// Replace UUIDs
|
|
reUUID = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)
|
|
|
|
errorKeywords = []string{"error", "fatal", "panic", "crit", "oom", "killed", "exception", "traceback"}
|
|
warnKeywords = []string{"warn", "warning"}
|
|
)
|
|
|
|
// ScanContainerLogs scans docker logs for the given containers and returns a summary
|
|
// of errors/warnings found. Containers are scanned sequentially to avoid load spikes.
|
|
// The caller should filter out infrastructure/protected containers before calling this.
|
|
func ScanContainerLogs(containerNames []string, since time.Duration, logger *log.Logger) []ContainerLogSummary {
|
|
if len(containerNames) == 0 {
|
|
return []ContainerLogSummary{}
|
|
}
|
|
|
|
dbg := func(format string, args ...interface{}) {
|
|
if logger != nil {
|
|
logger.Printf("[DEBUG] [metrics] logscanner: "+format, args...)
|
|
}
|
|
}
|
|
|
|
dbg("starting log scan for %d containers (since %s)", len(containerNames), since)
|
|
start := time.Now()
|
|
results := make([]ContainerLogSummary, 0, len(containerNames))
|
|
|
|
for _, name := range containerNames {
|
|
cStart := time.Now()
|
|
summary := scanOneContainer(name, since, logger)
|
|
dbg("scanned %s: errors=%d warnings=%d issues=%d (took %s)",
|
|
name, summary.ErrorCount, summary.WarnCount, len(summary.RecentIssues), time.Since(cStart).Round(time.Millisecond))
|
|
results = append(results, summary)
|
|
}
|
|
|
|
elapsed := time.Since(start)
|
|
dbg("log scan completed: %d containers in %s", len(containerNames), elapsed.Round(time.Millisecond))
|
|
if elapsed > 5*time.Minute && logger != nil {
|
|
logger.Printf("[WARN] Log scan took %s (>5min) for %d containers", elapsed.Round(time.Second), len(containerNames))
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
func scanOneContainer(name string, since time.Duration, logger *log.Logger) ContainerLogSummary {
|
|
summary := ContainerLogSummary{ContainerName: name}
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
sinceStr := formatSinceDuration(since)
|
|
cmd := exec.CommandContext(ctx, "docker", "logs", "--since="+sinceStr, "--tail=1000", name)
|
|
output, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
if logger != nil {
|
|
logger.Printf("[DEBUG] logscanner: docker logs %s: %v", name, err)
|
|
}
|
|
return summary
|
|
}
|
|
|
|
// fingerprint → issue tracking
|
|
type issueEntry struct {
|
|
severity string
|
|
message string
|
|
count int
|
|
lastSeen time.Time
|
|
}
|
|
fingerprints := make(map[string]*issueEntry)
|
|
|
|
lines := strings.Split(string(output), "\n")
|
|
for _, line := range lines {
|
|
if !utf8.Valid([]byte(line)) {
|
|
continue
|
|
}
|
|
if len(line) > 500 {
|
|
line = line[:500]
|
|
}
|
|
if line == "" {
|
|
continue
|
|
}
|
|
|
|
severity := classifyLine(line)
|
|
if severity == "" {
|
|
continue
|
|
}
|
|
|
|
if severity == "error" {
|
|
summary.ErrorCount++
|
|
} else {
|
|
summary.WarnCount++
|
|
}
|
|
|
|
fp := fingerprint(line)
|
|
if e, ok := fingerprints[fp]; ok {
|
|
e.count++
|
|
e.lastSeen = time.Now()
|
|
} else {
|
|
msg := cleanLine(line)
|
|
if len(msg) > 200 {
|
|
msg = msg[:200]
|
|
}
|
|
fingerprints[fp] = &issueEntry{
|
|
severity: severity,
|
|
message: msg,
|
|
count: 1,
|
|
lastSeen: time.Now(),
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert map to slice, sort by count DESC then lastSeen DESC, cap at 10
|
|
issues := make([]LogIssue, 0, len(fingerprints))
|
|
for _, e := range fingerprints {
|
|
issues = append(issues, LogIssue{
|
|
Severity: e.severity,
|
|
Message: e.message,
|
|
Count: e.count,
|
|
LastSeen: e.lastSeen,
|
|
})
|
|
}
|
|
sort.Slice(issues, func(i, j int) bool {
|
|
if issues[i].Count != issues[j].Count {
|
|
return issues[i].Count > issues[j].Count
|
|
}
|
|
return issues[i].LastSeen.After(issues[j].LastSeen)
|
|
})
|
|
if len(issues) > 10 {
|
|
issues = issues[:10]
|
|
}
|
|
summary.RecentIssues = issues
|
|
|
|
return summary
|
|
}
|
|
|
|
// cleanLine strips ANSI escape codes and timestamps from a log line.
|
|
func cleanLine(line string) string {
|
|
s := reANSI.ReplaceAllString(line, "")
|
|
s = reTimestamp.ReplaceAllString(s, "")
|
|
s = reSyslog.ReplaceAllString(s, "")
|
|
return strings.TrimSpace(s)
|
|
}
|
|
|
|
// classifyLine returns "error", "warn", or "" based on first 5 words of the line.
|
|
func classifyLine(line string) string {
|
|
cleaned := reANSI.ReplaceAllString(line, "")
|
|
lower := strings.ToLower(cleaned)
|
|
words := strings.Fields(lower)
|
|
if len(words) > 5 {
|
|
words = words[:5]
|
|
}
|
|
prefix := strings.Join(words, " ")
|
|
|
|
for _, kw := range errorKeywords {
|
|
if strings.Contains(prefix, kw) {
|
|
return "error"
|
|
}
|
|
}
|
|
for _, kw := range warnKeywords {
|
|
if strings.Contains(prefix, kw) {
|
|
return "warn"
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// fingerprint produces a deduplication key for a log line.
|
|
func fingerprint(line string) string {
|
|
s := cleanLine(line)
|
|
// Replace UUIDs before hex to avoid partial matches
|
|
s = reUUID.ReplaceAllString(s, "<UUID>")
|
|
s = reHex.ReplaceAllString(s, "<HEX>")
|
|
s = reNumbers.ReplaceAllString(s, "<N>")
|
|
s = strings.TrimSpace(s)
|
|
return strings.ToLower(s)
|
|
}
|
|
|
|
// formatSinceDuration converts a duration to docker logs --since format (e.g. "15m").
|
|
func formatSinceDuration(d time.Duration) string {
|
|
minutes := int(d.Minutes())
|
|
if minutes <= 0 {
|
|
minutes = 15
|
|
}
|
|
return fmt.Sprintf("%dm", minutes)
|
|
}
|