diff --git a/CHANGELOG.md b/CHANGELOG.md index 34b5e29..984af78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ ## Changelog +### v0.28.0 — App Telemetry & Analytics (2026-02-23) + +#### Added +- **App telemetry in Hub reports** — `Report.AppTelemetry` (new field in `report/types.go`) carries per-stack memory/CPU metrics and log scan results to the Hub on every report push. Backward-compatible: old Hub versions silently ignore the new field. +- **`internal/metrics/telemetry.go`** — New `MetricsStore.GetContainerTelemetry(since)` method aggregates container memory (current/avg/peak) and CPU averages from the existing `container_metrics` SQLite table over the last 15 minutes. +- **`internal/metrics/logscanner.go`** — New `ScanContainerLogs(containerNames, since, logger)` function runs `docker logs --since=15m --tail=1000` on each non-protected deployed container. Detects errors/warnings by keyword matching and deduplicates via fingerprinting (strips leading timestamps and normalizes away 6+ digit numbers, 8+ character hex strings, and UUIDs, so lines that differ only in such values count as one issue). Returns `[]ContainerLogSummary` with counts and `RecentIssues` (top 10 per container). +- **`internal/report/telemetry.go`** — New `buildAppTelemetrySection()` and `buildAppTelemetry()` functions assemble per-stack `AppTelemetry` records by aggregating container-level metrics and log summaries. Only non-protected, deployed stacks are included. + +#### Changed +- **`internal/report/builder.go`** — `BuildReport()` now calls `buildAppTelemetrySection()` after the stacks section, populating `r.AppTelemetry`. +- **`internal/report/types.go`** — Added `AppTelemetry []AppTelemetry` field to `Report` struct. Added new `AppTelemetry` type with fields: app_name, display_name, containers, memory and CPU metrics, catalog estimate/limit, log error/warning counts, and top issues. 
+ ### v0.27.3 — Real System Memory Everywhere (2026-02-23) #### Changed diff --git a/controller/README.md b/controller/README.md index 3aae6ee..528ffc6 100644 --- a/controller/README.md +++ b/controller/README.md @@ -4,7 +4,7 @@ A single, lightweight Go container that replaces Portainer + scattered systemd scripts with a unified, Hungarian-language web dashboard for managing Docker Compose stacks, backups, storage, monitoring, and notifications on customer hardware. -**Current version: v0.27.2** +**Current version: v0.28.0** --- @@ -846,9 +846,27 @@ Periodic JSON push (default every 15 min) to the central felhom-hub service: - Health: current status, issues, warnings - Stacks: deployed apps with versions and states - Config hash: SHA256 of `controller.yaml` for Hub-side config comparison +- **App telemetry** (v0.28.0+): Per-stack memory (current/avg/peak) and CPU averages from the last 15 minutes of metrics data, plus log scan results (error/warning counts with deduplicated issues). Only non-protected, deployed stacks are included. Backward-compatible: old Hub versions silently ignore this field. Bearer token authentication, 3-attempt retry with 5-second backoff. Push status tracked via `PushStatus` struct (LastAttempt, LastSuccess, LastError, consecutive failures) — used by the monitoring page and alert system to show Hub connection health. +#### App Telemetry (`internal/metrics/telemetry.go`, `internal/metrics/logscanner.go`, `internal/report/telemetry.go`) + +Each report push now includes per-app telemetry data: + +**Metrics collection** (`telemetry.go`): +- `MetricsStore.GetContainerTelemetry(since)` aggregates container-level memory (avg, peak, current) and CPU averages from the `container_metrics` SQLite table for the last 15 minutes. + +**Log scanning** (`logscanner.go`): +- `ScanContainerLogs(containerNames, since, logger)` runs `docker logs --since=15m --tail=1000` sequentially on all non-protected deployed containers. 
// ContainerLogSummary holds log analysis results for one container.
type ContainerLogSummary struct {
	ContainerName string     `json:"container_name"`
	ErrorCount    int        `json:"error_count"`             // lines classified as "error"
	WarnCount     int        `json:"warn_count"`              // lines classified as "warn"
	RecentIssues  []LogIssue `json:"recent_issues,omitempty"` // deduplicated issues, most frequent first, at most 10
}

// LogIssue represents a deduplicated log issue.
type LogIssue struct {
	Severity string `json:"severity"` // "error" or "warn"
	Message  string `json:"message"`  // first line seen for this fingerprint, leading timestamp stripped
	Count    int    `json:"count"`    // number of lines sharing the fingerprint
	// LastSeen is the wall-clock time at which the scanner processed the most
	// recent matching line. It is NOT parsed from the log line itself.
	LastSeen time.Time `json:"last_seen"`
}

// Limits applied while scanning one container's log output.
const (
	logScanMaxLineBytes    = 500              // longer lines are truncated before analysis
	logScanMaxMessageBytes = 200              // maximum size of LogIssue.Message
	logScanMaxIssues       = 10               // issues kept per container
	logScanClassifyWords   = 5                // leading words inspected by classifyLine
	logScanDefaultMinutes  = 15               // window used when a non-positive duration is requested
	logScanDockerTimeout   = 10 * time.Second // per-container `docker logs` timeout
	logScanSlowThreshold   = 5 * time.Minute  // total scan time above which a warning is logged
)

var (
	// Strip leading ISO timestamp: 2006-01-02T15:04:05 or 2006/01/02 15:04:05 etc.
	reTimestamp = regexp.MustCompile(`^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}:\d{2}[.\d]*[Z ]?`)
	// Strip syslog-style timestamp: Jan 2 15:04:05
	reSyslog = regexp.MustCompile(`^[A-Z][a-z]{2}\s+\d{1,2} \d{2}:\d{2}:\d{2} `)
	// Match 6+ digit sequences (shorter numbers such as HTTP codes and ports are left alone).
	reNumbers = regexp.MustCompile(`\b\d{6,}\b`)
	// Match 8+ char hex strings.
	reHex = regexp.MustCompile(`\b[0-9a-fA-F]{8,}\b`)
	// Match UUIDs.
	reUUID = regexp.MustCompile(`[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}`)

	errorKeywords = []string{"error", "fatal", "panic", "crit", "oom", "killed", "exception", "traceback"}
	warnKeywords  = []string{"warn", "warning"}
)

// ScanContainerLogs scans docker logs for the given containers and returns a summary
// of errors/warnings found, one entry per name and in the same order.
// Containers are scanned sequentially to avoid load spikes.
// The caller should filter out infrastructure/protected containers before calling this.
//
// The result is never nil (an empty input yields an empty slice). logger may be nil.
func ScanContainerLogs(containerNames []string, since time.Duration, logger *log.Logger) []ContainerLogSummary {
	if len(containerNames) == 0 {
		return []ContainerLogSummary{}
	}

	start := time.Now()
	results := make([]ContainerLogSummary, 0, len(containerNames))
	for _, name := range containerNames {
		results = append(results, scanOneContainer(name, since, logger))
	}

	if elapsed := time.Since(start); elapsed > logScanSlowThreshold && logger != nil {
		logger.Printf("[WARN] Log scan took %s (>5min) for %d containers", elapsed.Round(time.Second), len(containerNames))
	}
	return results
}

// scanOneContainer runs `docker logs` for a single container (bounded by a
// 10 second timeout) and summarizes the combined stdout/stderr output.
// If the command fails, the failure is logged at debug level and an empty
// summary carrying only the container name is returned.
func scanOneContainer(name string, since time.Duration, logger *log.Logger) ContainerLogSummary {
	ctx, cancel := context.WithTimeout(context.Background(), logScanDockerTimeout)
	defer cancel()

	cmd := exec.CommandContext(ctx, "docker", "logs", "--since="+formatSinceDuration(since), "--tail=1000", name)
	output, err := cmd.CombinedOutput()
	if err != nil {
		if logger != nil {
			logger.Printf("[DEBUG] logscanner: docker logs %s: %v", name, err)
		}
		return ContainerLogSummary{ContainerName: name}
	}
	return summarizeLogOutput(name, string(output), time.Now)
}

// summarizeLogOutput classifies every line of output, counts errors and
// warnings, and groups lines with the same fingerprint into LogIssue entries.
// now supplies the LastSeen timestamps (time.Now in production).
func summarizeLogOutput(name, output string, now func() time.Time) ContainerLogSummary {
	summary := ContainerLogSummary{ContainerName: name}

	type issueEntry struct {
		severity string
		message  string
		count    int
		lastSeen time.Time
	}
	seen := make(map[string]*issueEntry)

	for _, line := range strings.Split(output, "\n") {
		// Lines that are not valid UTF-8 (binary noise) are ignored entirely.
		if !utf8.ValidString(line) {
			continue
		}
		line = truncateUTF8(line, logScanMaxLineBytes)
		if line == "" {
			continue
		}

		severity := classifyLine(line)
		switch severity {
		case "error":
			summary.ErrorCount++
		case "warn":
			summary.WarnCount++
		default:
			continue
		}

		key := fingerprint(line)
		if entry, ok := seen[key]; ok {
			entry.count++
			entry.lastSeen = now()
			continue
		}

		// The reported message is the first occurrence with its leading
		// timestamp stripped; the variable parts stay readable.
		msg := reTimestamp.ReplaceAllString(line, "")
		msg = reSyslog.ReplaceAllString(msg, "")
		msg = truncateUTF8(strings.TrimSpace(msg), logScanMaxMessageBytes)
		seen[key] = &issueEntry{severity: severity, message: msg, count: 1, lastSeen: now()}
	}

	issues := make([]LogIssue, 0, len(seen))
	for _, e := range seen {
		issues = append(issues, LogIssue{
			Severity: e.severity,
			Message:  e.message,
			Count:    e.count,
			LastSeen: e.lastSeen,
		})
	}
	// Most frequent first, then most recently seen. The message is the final
	// tie-breaker so the result does not depend on map iteration order.
	sort.Slice(issues, func(i, j int) bool {
		a, b := issues[i], issues[j]
		switch {
		case a.Count != b.Count:
			return a.Count > b.Count
		case !a.LastSeen.Equal(b.LastSeen):
			return a.LastSeen.After(b.LastSeen)
		default:
			return a.Message < b.Message
		}
	})
	if len(issues) > logScanMaxIssues {
		issues = issues[:logScanMaxIssues]
	}
	summary.RecentIssues = issues

	return summary
}

// truncateUTF8 shortens s to at most limit bytes without cutting a multi-byte
// rune in half. s is expected to be valid UTF-8.
func truncateUTF8(s string, limit int) string {
	if limit < 0 {
		limit = 0
	}
	if len(s) <= limit {
		return s
	}
	// Back up until the cut position is the first byte of a rune.
	for limit > 0 && !utf8.RuneStart(s[limit]) {
		limit--
	}
	return s[:limit]
}

// classifyLine returns "error", "warn", or "" based on first 5 words of the line.
//
// Matching is a case-insensitive substring search, with error keywords taking
// precedence over warning keywords. Because it is a substring search, a
// keyword also matches inside a longer word (for example "oom" inside "room").
func classifyLine(line string) string {
	words := strings.Fields(line)
	if len(words) > logScanClassifyWords {
		words = words[:logScanClassifyWords]
	}
	prefix := strings.ToLower(strings.Join(words, " "))

	for _, kw := range errorKeywords {
		if strings.Contains(prefix, kw) {
			return "error"
		}
	}
	for _, kw := range warnKeywords {
		if strings.Contains(prefix, kw) {
			return "warn"
		}
	}
	return ""
}

// fingerprint produces a deduplication key for a log line: the leading
// timestamp is stripped, variable values (UUIDs, 8+ char hex strings, 6+ digit
// numbers) are replaced by fixed markers, and the result is trimmed and
// lower-cased. The key is only used for grouping and is never reported.
func fingerprint(line string) string {
	s := reTimestamp.ReplaceAllString(line, "")
	s = reSyslog.ReplaceAllString(s, "")
	// UUIDs go first so the hex pattern cannot consume part of one.
	s = reUUID.ReplaceAllString(s, "<UUID>")
	s = reHex.ReplaceAllString(s, "<HEX>")
	s = reNumbers.ReplaceAllString(s, "<NUM>")
	return strings.ToLower(strings.TrimSpace(s))
}

// formatSinceDuration converts a duration to docker logs --since format (e.g. "15m").
// Partial minutes are rounded up so the scanned window is never shorter than
// requested; a non-positive duration falls back to 15 minutes.
func formatSinceDuration(d time.Duration) string {
	if d <= 0 {
		return fmt.Sprintf("%dm", logScanDefaultMinutes)
	}
	minutes := int64(d / time.Minute)
	if d%time.Minute != 0 {
		minutes++
	}
	return fmt.Sprintf("%dm", minutes)
}

// ContainerTelemetry holds aggregated resource stats for one container.
type ContainerTelemetry struct {
	ContainerName   string  `json:"container_name"`
	MemoryCurrentMB float64 `json:"memory_current_mb"`
	MemoryAvgMB     float64 `json:"memory_avg_mb"`
	MemoryPeakMB    float64 `json:"memory_peak_mb"`
	CPUAvgPercent   float64 `json:"cpu_avg_percent"`
	SampleCount     int     `json:"sample_count"`
}

// GetContainerTelemetry queries the metrics DB for per-container resource
// summaries since the given time. Returns empty slice (not error) if no data.
+func (s *MetricsStore) GetContainerTelemetry(since time.Time) ([]ContainerTelemetry, error) { + sinceUnix := since.Unix() + + rows, err := s.db.Query(` + SELECT container_name, + AVG(mem_usage_mb), + MAX(mem_usage_mb), + AVG(cpu_percent), + COUNT(*) + FROM container_metrics + WHERE ts > ? + GROUP BY container_name`, sinceUnix) + if err != nil { + return nil, err + } + defer rows.Close() + + var results []ContainerTelemetry + for rows.Next() { + var ct ContainerTelemetry + if err := rows.Scan(&ct.ContainerName, &ct.MemoryAvgMB, &ct.MemoryPeakMB, + &ct.CPUAvgPercent, &ct.SampleCount); err != nil { + continue + } + results = append(results, ct) + } + if err := rows.Err(); err != nil { + return nil, err + } + + // Get current (most recent) memory per container + if stats, err := s.QueryContainerSummary(); err == nil { + currentMap := make(map[string]float64, len(stats)) + for _, st := range stats { + currentMap[st.ContainerName] = st.MemUsageMB + } + for i := range results { + if cur, ok := currentMap[results[i].ContainerName]; ok { + results[i].MemoryCurrentMB = cur + } + } + } + + if results == nil { + results = []ContainerTelemetry{} + } + return results, nil +} diff --git a/controller/internal/report/builder.go b/controller/internal/report/builder.go index 53f235e..7f8c902 100644 --- a/controller/internal/report/builder.go +++ b/controller/internal/report/builder.go @@ -150,9 +150,12 @@ func BuildReport( // Stacks r.Stacks = buildStacksReport(stackMgr) + // App telemetry (metrics + log scan) + r.AppTelemetry = buildAppTelemetrySection(stackMgr, metricsStore, logger) + if debug && logger != nil { - logger.Printf("[DEBUG] BuildReport: complete — containers=%d, health=%s, deployed=%d, available=%d", - r.Containers.Total, r.Health.Status, len(r.Stacks.Deployed), len(r.Stacks.Available)) + logger.Printf("[DEBUG] BuildReport: complete — containers=%d, health=%s, deployed=%d, available=%d, app_telemetry=%d", + r.Containers.Total, r.Health.Status, len(r.Stacks.Deployed), 
len(r.Stacks.Available), len(r.AppTelemetry)) } return r diff --git a/controller/internal/report/telemetry.go b/controller/internal/report/telemetry.go new file mode 100644 index 0000000..57d346d --- /dev/null +++ b/controller/internal/report/telemetry.go @@ -0,0 +1,119 @@ +package report + +import ( + "log" + "sort" + "time" + + "gitea.dooplex.hu/admin/felhom-controller/internal/metrics" + "gitea.dooplex.hu/admin/felhom-controller/internal/stacks" +) + +// buildAppTelemetrySection collects metrics telemetry and log scans for all +// non-protected, deployed stacks and returns per-app telemetry data. +func buildAppTelemetrySection(stackMgr *stacks.Manager, metricsStore *metrics.MetricsStore, logger *log.Logger) []AppTelemetry { + allStacks := stackMgr.GetStacks() + + // 1. Get metrics telemetry (last 15 minutes) + var telemetry []metrics.ContainerTelemetry + if metricsStore != nil { + var err error + telemetry, err = metricsStore.GetContainerTelemetry(time.Now().Add(-15 * time.Minute)) + if err != nil && logger != nil { + logger.Printf("[WARN] Telemetry metrics query failed: %v", err) + } + } + + // 2. Collect non-protected container names for log scan + var containerNames []string + for _, s := range allStacks { + if s.Protected || !s.Deployed { + continue + } + for _, c := range s.Containers { + containerNames = append(containerNames, c.Name) + } + } + + // 3. Scan logs + logs := metrics.ScanContainerLogs(containerNames, 15*time.Minute, logger) + + // 4. Build per-app telemetry + return buildAppTelemetry(allStacks, telemetry, logs) +} + +// buildAppTelemetry aggregates container-level telemetry and log data into per-stack AppTelemetry entries. 
+func buildAppTelemetry(allStacks []stacks.Stack, telemetry []metrics.ContainerTelemetry, logs []metrics.ContainerLogSummary) []AppTelemetry { + // Build lookup maps + telemetryMap := make(map[string]metrics.ContainerTelemetry, len(telemetry)) + for _, ct := range telemetry { + telemetryMap[ct.ContainerName] = ct + } + + logMap := make(map[string]metrics.ContainerLogSummary, len(logs)) + for _, ls := range logs { + logMap[ls.ContainerName] = ls + } + + var result []AppTelemetry + + for _, s := range allStacks { + if s.Protected || !s.Deployed { + continue + } + + app := AppTelemetry{ + AppName: s.Name, + DisplayName: s.Meta.DisplayName, + CatalogEstimate: s.Meta.Resources.MemRequest, + CatalogLimit: s.Meta.Resources.MemLimit, + } + + var mergedIssues []metrics.LogIssue + + for _, c := range s.Containers { + app.Containers = append(app.Containers, c.Name) + + if ct, ok := telemetryMap[c.Name]; ok { + app.MemoryCurrentMB += ct.MemoryCurrentMB + app.MemoryAvgMB += ct.MemoryAvgMB + app.MemoryPeakMB += ct.MemoryPeakMB + app.CPUAvgPercent += ct.CPUAvgPercent + } + + if ls, ok := logMap[c.Name]; ok { + app.LogErrors += ls.ErrorCount + app.LogWarnings += ls.WarnCount + mergedIssues = append(mergedIssues, ls.RecentIssues...) 
+ } + } + + if app.Containers == nil { + app.Containers = []string{} + } + + // Sort and cap merged issues at 10 + sort.Slice(mergedIssues, func(i, j int) bool { + if mergedIssues[i].Count != mergedIssues[j].Count { + return mergedIssues[i].Count > mergedIssues[j].Count + } + return mergedIssues[i].LastSeen.After(mergedIssues[j].LastSeen) + }) + if len(mergedIssues) > 10 { + mergedIssues = mergedIssues[:10] + } + app.Issues = mergedIssues + + result = append(result, app) + } + + // Sort by app name + sort.Slice(result, func(i, j int) bool { + return result[i].AppName < result[j].AppName + }) + + if result == nil { + result = []AppTelemetry{} + } + return result +} diff --git a/controller/internal/report/types.go b/controller/internal/report/types.go index cf5d68a..44d6d76 100644 --- a/controller/internal/report/types.go +++ b/controller/internal/report/types.go @@ -1,6 +1,10 @@ package report -import "time" +import ( + "time" + + "gitea.dooplex.hu/admin/felhom-controller/internal/metrics" +) // Report is the JSON payload pushed to the central hub. type Report struct { @@ -18,6 +22,7 @@ type Report struct { Backup BackupReport `json:"backup"` Health HealthReport `json:"health"` Stacks StacksReport `json:"stacks"` + AppTelemetry []AppTelemetry `json:"app_telemetry,omitempty"` } // SystemReport holds host-level system info. @@ -91,3 +96,19 @@ type StacksReport struct { Deployed []string `json:"deployed"` Available []string `json:"available"` } + +// AppTelemetry holds per-app (per-stack) resource and log telemetry. 
// One entry is produced per deployed, non-protected stack; numeric fields are
// sums over the stack's containers.
type AppTelemetry struct {
	AppName         string   `json:"app_name"`          // stack name
	DisplayName     string   `json:"display_name"`      // display name taken from the stack metadata
	Containers      []string `json:"containers"`        // names of the stack's containers
	MemoryCurrentMB float64  `json:"memory_current_mb"` // sum of the containers' most recent memory readings
	MemoryAvgMB     float64  `json:"memory_avg_mb"`     // sum of the containers' average memory
	MemoryPeakMB    float64  `json:"memory_peak_mb"`    // sum of the containers' individual peaks
	CPUAvgPercent   float64  `json:"cpu_avg_percent"`   // sum of the containers' average CPU percentages
	CatalogEstimate string   `json:"catalog_estimate"`  // memory request declared in the stack metadata
	CatalogLimit    string   `json:"catalog_limit"`     // memory limit declared in the stack metadata
	LogErrors       int      `json:"log_errors"`        // error lines found across the stack's containers
	LogWarnings     int      `json:"log_warnings"`      // warning lines found across the stack's containers
	Issues          []metrics.LogIssue `json:"issues,omitempty"` // merged container issues, at most 10
}