af1dd14933
Second-pass logging cleanup: consistent [LEVEL] [module] format across all 41 files. Remove stale prefixes ([CF], [SYNC], [SCHED], [API], [STORAGE], [HEALTH], [ROLLBACK]). Remove 5 duplicate log lines. Gate ungated DEBUG lines. Fix wrong log levels (restore start WARN→INFO). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
292 lines
10 KiB
Go
292 lines
10 KiB
Go
package monitor
|
|
|
|
import (
	"context"
	"fmt"
	"log"
	"os"
	"os/exec"
	"strings"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
|
|
|
|
// HealthReport is the outcome of one system health check: an overall status
// plus the individual findings grouped by severity.
type HealthReport struct {
	// Status is the overall verdict: "ok", "warn" or "fail".
	Status string
	// Issues lists the critical problems that were found.
	Issues []string
	// Warnings lists the non-critical problems that were found.
	Warnings []string
	// Info lists purely informational items.
	Info []string
	// Timestamp records when the report was created.
	Timestamp time.Time
}
|
|
|
|
// RunHealthCheck runs system checks and returns a diagnostic report.
|
|
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath, logger *log.Logger) *HealthReport {
|
|
report := &HealthReport{
|
|
Status: "ok",
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
debug := cfg.Logging.Level == "debug" && logger != nil
|
|
|
|
hddPath := cfg.Paths.HDDPath
|
|
if len(storagePaths) > 0 {
|
|
hddPath = storagePaths[0].Path
|
|
}
|
|
sysInfo := system.GetInfo(hddPath, cpuCollector)
|
|
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Raw values: disk=%.1f%%, hdd=%.1f%% (configured=%v), mem=%.1f%% (%dMB/%dMB), cpu=%.1f%%, temp=%.1f°C (%s)",
|
|
sysInfo.DiskPercent, sysInfo.HDDPercent, sysInfo.HDDConfigured,
|
|
sysInfo.MemPercent, sysInfo.UsedMemMB, sysInfo.TotalMemMB,
|
|
sysInfo.CPUPercent, sysInfo.TemperatureCelsius, sysInfo.TemperatureSource)
|
|
}
|
|
|
|
// 1. Disk usage (SSD)
|
|
if sysInfo.DiskPercent > 0 {
|
|
if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", sysInfo.DiskPercent))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] Disk (SSD) threshold breached: %.0f%% (limit: %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskCritPercent)
|
|
}
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] SSD disk: CRITICAL (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskCritPercent)
|
|
}
|
|
} else if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", sysInfo.DiskPercent))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] Disk (SSD) threshold breached: %.0f%% (limit: %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskWarnPercent)
|
|
}
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] SSD disk: WARN (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskWarnPercent)
|
|
}
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", sysInfo.DiskPercent))
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] SSD disk: OK (%.0f%%)", sysInfo.DiskPercent)
|
|
}
|
|
}
|
|
}
|
|
|
|
// HDD disk usage
|
|
if sysInfo.HDDConfigured && sysInfo.HDDPercent > 0 {
|
|
if sysInfo.HDDPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("HDD disk usage critical: %.0f%%", sysInfo.HDDPercent))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] Disk (HDD) threshold breached: %.0f%% (limit: %d%%)", sysInfo.HDDPercent, cfg.Monitoring.Thresholds.DiskCritPercent)
|
|
}
|
|
} else if sysInfo.HDDPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("HDD disk usage high: %.0f%%", sysInfo.HDDPercent))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] Disk (HDD) threshold breached: %.0f%% (limit: %d%%)", sysInfo.HDDPercent, cfg.Monitoring.Thresholds.DiskWarnPercent)
|
|
}
|
|
}
|
|
}
|
|
|
|
// 2. Memory usage
|
|
if sysInfo.MemPercent > 0 {
|
|
if sysInfo.MemPercent >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", sysInfo.MemPercent))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] Memory threshold breached: %.0f%% (limit: %d%%)", sysInfo.MemPercent, cfg.Monitoring.Thresholds.MemoryWarnPercent)
|
|
}
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Memory: WARN (%.0f%% >= %d%%)", sysInfo.MemPercent, cfg.Monitoring.Thresholds.MemoryWarnPercent)
|
|
}
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", sysInfo.MemPercent))
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Memory: OK (%.0f%%)", sysInfo.MemPercent)
|
|
}
|
|
}
|
|
}
|
|
|
|
// 3. CPU usage
|
|
if sysInfo.CPUPercent > 0 {
|
|
if sysInfo.CPUPercent >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", sysInfo.CPUPercent))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] CPU threshold breached: %.0f%% (limit: %d%%)", sysInfo.CPUPercent, cfg.Monitoring.Thresholds.CPUWarnPercent)
|
|
}
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] CPU: WARN (%.0f%% >= %d%%)", sysInfo.CPUPercent, cfg.Monitoring.Thresholds.CPUWarnPercent)
|
|
}
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", sysInfo.CPUPercent))
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] CPU: OK (%.0f%%)", sysInfo.CPUPercent)
|
|
}
|
|
}
|
|
}
|
|
|
|
// 4. Temperature
|
|
if sysInfo.TemperatureCelsius > 0 {
|
|
if sysInfo.TemperatureCelsius >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", sysInfo.TemperatureCelsius, sysInfo.TemperatureSource))
|
|
if logger != nil {
|
|
logger.Printf("[WARN] [monitor] Temperature threshold breached: %.0f°C (limit: %d°C)", sysInfo.TemperatureCelsius, cfg.Monitoring.Thresholds.TemperatureWarnCelsius)
|
|
}
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Temperature: WARN (%.0f°C >= %d°C)", sysInfo.TemperatureCelsius, cfg.Monitoring.Thresholds.TemperatureWarnCelsius)
|
|
}
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", sysInfo.TemperatureCelsius))
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Temperature: OK (%.0f°C)", sysInfo.TemperatureCelsius)
|
|
}
|
|
}
|
|
}
|
|
|
|
// 5. Docker health
|
|
if err := checkDocker(); err != nil {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", err))
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Docker daemon: FAIL (%v)", err)
|
|
}
|
|
} else {
|
|
report.Info = append(report.Info, "Docker: reachable")
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Docker daemon: OK")
|
|
}
|
|
}
|
|
|
|
// 6. Protected containers
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Checking %d protected containers: %v", len(cfg.Stacks.Protected), cfg.Stacks.Protected)
|
|
}
|
|
missingProtected := checkProtectedContainers(cfg.Stacks.Protected)
|
|
for _, name := range missingProtected {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name))
|
|
}
|
|
if debug {
|
|
if len(missingProtected) > 0 {
|
|
logger.Printf("[DEBUG] [monitor] Protected containers missing: %v", missingProtected)
|
|
} else {
|
|
logger.Printf("[DEBUG] [monitor] All protected containers running")
|
|
}
|
|
}
|
|
|
|
// 7. Storage paths
|
|
storageIssues, storageWarnings := checkStoragePaths(storagePaths)
|
|
report.Issues = append(report.Issues, storageIssues...)
|
|
report.Warnings = append(report.Warnings, storageWarnings...)
|
|
|
|
// Determine status
|
|
if len(report.Issues) > 0 {
|
|
report.Status = "fail"
|
|
} else if len(report.Warnings) > 0 {
|
|
report.Status = "warn"
|
|
}
|
|
|
|
if logger != nil {
|
|
logger.Printf("[INFO] [monitor] Health check: status=%s", report.Status)
|
|
}
|
|
|
|
if debug {
|
|
logger.Printf("[DEBUG] [monitor] Final status: %s (issues=%d, warnings=%d, info=%d)",
|
|
report.Status, len(report.Issues), len(report.Warnings), len(report.Info))
|
|
}
|
|
|
|
return report
|
|
}
|
|
|
|
// FormatMessage returns a human-readable summary for healthcheck ping body.
|
|
func (r *HealthReport) FormatMessage() string {
|
|
var sb strings.Builder
|
|
|
|
sb.WriteString(fmt.Sprintf("Status: %s\n", strings.ToUpper(r.Status)))
|
|
sb.WriteString(fmt.Sprintf("Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05")))
|
|
|
|
if len(r.Issues) > 0 {
|
|
sb.WriteString("ISSUES:\n")
|
|
for _, issue := range r.Issues {
|
|
sb.WriteString(" - " + issue + "\n")
|
|
}
|
|
sb.WriteString("\n")
|
|
}
|
|
|
|
if len(r.Warnings) > 0 {
|
|
sb.WriteString("WARNINGS:\n")
|
|
for _, w := range r.Warnings {
|
|
sb.WriteString(" - " + w + "\n")
|
|
}
|
|
sb.WriteString("\n")
|
|
}
|
|
|
|
if len(r.Info) > 0 {
|
|
sb.WriteString("INFO:\n")
|
|
for _, info := range r.Info {
|
|
sb.WriteString(" - " + info + "\n")
|
|
}
|
|
}
|
|
|
|
return sb.String()
|
|
}
|
|
|
|
// checkDocker verifies that Docker answers by running `docker info` and
// requiring a non-empty server version in its output.
//
// The command runs under a deadline so that a hung docker CLI or daemon cannot
// block the health check indefinitely. It returns nil when a version was
// reported; otherwise it returns an error describing the failure, wrapping the
// underlying command error where there is one.
func checkDocker() error {
	// Upper bound for a single `docker info` invocation.
	const timeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, "docker", "info", "--format", "{{.ServerVersion}}").Output()
	if err != nil {
		// When the deadline fired the process was killed, so the raw error
		// alone would not explain why; say so explicitly.
		if ctx.Err() == context.DeadlineExceeded {
			return fmt.Errorf("docker not reachable: timed out after %s: %w", timeout, err)
		}
		return fmt.Errorf("docker not reachable: %w", err)
	}
	if len(strings.TrimSpace(string(out))) == 0 {
		return fmt.Errorf("docker returned empty version")
	}
	return nil
}
|
|
|
|
// checkProtectedContainers returns, in input order, the names from protected
// whose container is not running. The result is nil when none are missing.
//
// Each name is probed with `docker inspect`. A container counts as missing
// when the command fails for any reason or prints anything other than "true"
// for .State.Running. Every probe runs under its own deadline so that a hung
// docker CLI or daemon cannot block the health check indefinitely; a probe
// that times out is treated like any other failure.
func checkProtectedContainers(protected []string) []string {
	// Upper bound for a single `docker inspect` invocation.
	const timeout = 10 * time.Second

	// running reports whether docker confirms that the named container is up.
	running := func(name string) bool {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()

		out, err := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Running}}", name).Output()
		return err == nil && strings.TrimSpace(string(out)) == "true"
	}

	var missing []string
	for _, name := range protected {
		if !running(name) {
			missing = append(missing, name)
		}
	}
	return missing
}
|
|
|
|
func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) {
|
|
for _, sp := range paths {
|
|
// Skip decommissioned paths — no longer in active use
|
|
if sp.Decommissioned {
|
|
continue
|
|
}
|
|
|
|
// Skip disconnected paths — handled by the storage watchdog
|
|
if sp.Disconnected {
|
|
warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", sp.Label, sp.Path))
|
|
continue
|
|
}
|
|
|
|
// Path accessible?
|
|
if _, err := os.Stat(sp.Path); err != nil {
|
|
warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", sp.Path))
|
|
continue
|
|
}
|
|
|
|
// Mount point check — warning, not issue (avoids false FAIL on demo/test environments)
|
|
if !system.IsMountPoint(sp.Path) {
|
|
warnings = append(warnings, fmt.Sprintf(
|
|
"Az adattároló (%s) nem külön meghajtón van — az adatok a rendszermeghajtóra íródnak", sp.Path))
|
|
}
|
|
|
|
// Disk usage
|
|
if di := system.GetDiskUsage(sp.Path); di != nil {
|
|
if di.UsedPercent >= 95 {
|
|
issues = append(issues, fmt.Sprintf("Adattároló majdnem megtelt: %s (%.0f%%)", sp.Path, di.UsedPercent))
|
|
} else if di.UsedPercent >= 90 {
|
|
warnings = append(warnings, fmt.Sprintf("Adattároló használat magas: %s (%.0f%%)", sp.Path, di.UsedPercent))
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|