package monitor

import (
	"fmt"
	"os"
	"os/exec"
	"strings"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)

// HealthReport holds the outcome of a single system health check run.
type HealthReport struct {
	Status    string    // overall result: "ok", "warn" or "fail"
	Issues    []string  // critical problems; any entry makes Status "fail"
	Warnings  []string  // non-critical findings; make Status "warn" when there are no issues
	Info      []string  // informational items
	Timestamp time.Time // taken when the report is created
}

// RunHealthCheck collects system metrics, probes Docker, the protected
// containers and the storage paths, and returns the findings as a report.
//
// Metric readings that are not positive are skipped without producing any
// entry. The overall status is "fail" when at least one issue was recorded,
// "warn" when there are only warnings, and "ok" otherwise.
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath) *HealthReport {
	report := &HealthReport{
		Status:    "ok",
		Timestamp: time.Now(),
	}

	// The first storage path, when there is one, overrides the configured HDD path.
	hddPath := cfg.Paths.HDDPath
	if len(storagePaths) > 0 {
		hddPath = storagePaths[0].Path
	}

	stats := system.GetInfo(hddPath, cpuCollector)

	// 1. Disk usage (SSD)
	if used := stats.DiskPercent; used > 0 {
		switch {
		case used >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", used))
		case used >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", used))
		default:
			report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", used))
		}
	}

	// HDD disk usage: only reported when it crosses a threshold, no info line.
	if stats.HDDConfigured && stats.HDDPercent > 0 {
		used := stats.HDDPercent
		switch {
		case used >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			report.Issues = append(report.Issues, fmt.Sprintf("HDD disk usage critical: %.0f%%", used))
		case used >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("HDD disk usage high: %.0f%%", used))
		}
	}

	// 2. Memory usage
	if used := stats.MemPercent; used > 0 {
		switch {
		case used >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", used))
		default:
			report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", used))
		}
	}

	// 3. CPU usage
	if load := stats.CPUPercent; load > 0 {
		switch {
		case load >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent):
			report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", load))
		default:
			report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", load))
		}
	}

	// 4. Temperature
	if temp := stats.TemperatureCelsius; temp > 0 {
		switch {
		case temp >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius):
			report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", temp, stats.TemperatureSource))
		default:
			report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", temp))
		}
	}

	// 5. Docker health
	dockerErr := checkDocker()
	if dockerErr == nil {
		report.Info = append(report.Info, "Docker: reachable")
	} else {
		report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", dockerErr))
	}

	// 6. Protected containers: every one that is not running is an issue.
	for _, name := range checkProtectedContainers(cfg.Stacks.Protected) {
		report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name))
	}

	// 7. Storage paths
	pathIssues, pathWarnings := checkStoragePaths(storagePaths)
	report.Issues = append(report.Issues, pathIssues...)
	report.Warnings = append(report.Warnings, pathWarnings...)

	// Issues outrank warnings when deriving the overall status.
	switch {
	case len(report.Issues) > 0:
		report.Status = "fail"
	case len(report.Warnings) > 0:
		report.Status = "warn"
	}

	return report
}

// FormatMessage returns a human-readable summary for healthcheck ping body.
func (r *HealthReport) FormatMessage() string { var sb strings.Builder sb.WriteString(fmt.Sprintf("Status: %s\n", strings.ToUpper(r.Status))) sb.WriteString(fmt.Sprintf("Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05"))) if len(r.Issues) > 0 { sb.WriteString("ISSUES:\n") for _, issue := range r.Issues { sb.WriteString(" - " + issue + "\n") } sb.WriteString("\n") } if len(r.Warnings) > 0 { sb.WriteString("WARNINGS:\n") for _, w := range r.Warnings { sb.WriteString(" - " + w + "\n") } sb.WriteString("\n") } if len(r.Info) > 0 { sb.WriteString("INFO:\n") for _, info := range r.Info { sb.WriteString(" - " + info + "\n") } } return sb.String() } func checkDocker() error { cmd := exec.Command("docker", "info", "--format", "{{.ServerVersion}}") out, err := cmd.Output() if err != nil { return fmt.Errorf("docker not reachable: %v", err) } if len(strings.TrimSpace(string(out))) == 0 { return fmt.Errorf("docker returned empty version") } return nil } func checkProtectedContainers(protected []string) []string { var missing []string for _, name := range protected { cmd := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name) out, err := cmd.Output() if err != nil { missing = append(missing, name) continue } if strings.TrimSpace(string(out)) != "true" { missing = append(missing, name) } } return missing } func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) { for _, sp := range paths { // Skip decommissioned paths — no longer in active use if sp.Decommissioned { continue } // Skip disconnected paths — handled by the storage watchdog if sp.Disconnected { warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", sp.Label, sp.Path)) continue } // Path accessible? 
if _, err := os.Stat(sp.Path); err != nil { warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", sp.Path)) continue } // Mount point check — warning, not issue (avoids false FAIL on demo/test environments) if !system.IsMountPoint(sp.Path) { warnings = append(warnings, fmt.Sprintf( "Az adattároló (%s) nem külön meghajtón van — az adatok a rendszermeghajtóra íródnak", sp.Path)) } // Disk usage if di := system.GetDiskUsage(sp.Path); di != nil { if di.UsedPercent >= 95 { issues = append(issues, fmt.Sprintf("Adattároló majdnem megtelt: %s (%.0f%%)", sp.Path, di.UsedPercent)) } else if di.UsedPercent >= 90 { warnings = append(warnings, fmt.Sprintf("Adattároló használat magas: %s (%.0f%%)", sp.Path, di.UsedPercent)) } } } return }