Files
deploy-felhom-compose/controller/internal/monitor/healthcheck.go
T
admin be7803c0ac v0.24.0 — Pre-testing observability: debug logging, diagnostic dump, startup self-test
- Add [DEBUG] logging across all modules (backup, storage, sync, selfupdate,
  monitor, notify, report, assets, setup) gated behind logging.level: "debug"
- Add /api/debug/dump endpoint returning full controller state JSON (debug only)
- Add startup self-test validating 9 subsystems (Docker, dirs, storage, hub,
  restic repos, metrics DB) with pass/warn/fail summary
- New packages: internal/selftest, internal/util
- Constructor/signature changes: debug bool params, logger params on
  RunHealthCheck and BuildReport, smart watchdog probe logging

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 18:32:26 +01:00

267 lines
8.8 KiB
Go

package monitor
import (
"fmt"
"log"
"os"
"os/exec"
"strings"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// HealthReport contains the results of a system health check.
type HealthReport struct {
Status string // "ok", "warn", "fail"
Issues []string // critical problems
Warnings []string // non-critical warnings
Info []string // informational items
Timestamp time.Time
}
// RunHealthCheck runs system checks and returns a diagnostic report.
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath, logger *log.Logger) *HealthReport {
report := &HealthReport{
Status: "ok",
Timestamp: time.Now(),
}
debug := cfg.Logging.Level == "debug" && logger != nil
hddPath := cfg.Paths.HDDPath
if len(storagePaths) > 0 {
hddPath = storagePaths[0].Path
}
sysInfo := system.GetInfo(hddPath, cpuCollector)
if debug {
logger.Printf("[DEBUG] [HEALTH] Raw values: disk=%.1f%%, hdd=%.1f%% (configured=%v), mem=%.1f%% (%dMB/%dMB), cpu=%.1f%%, temp=%.1f°C (%s)",
sysInfo.DiskPercent, sysInfo.HDDPercent, sysInfo.HDDConfigured,
sysInfo.MemPercent, sysInfo.UsedMemMB, sysInfo.TotalMemMB,
sysInfo.CPUPercent, sysInfo.TemperatureCelsius, sysInfo.TemperatureSource)
}
// 1. Disk usage (SSD)
if sysInfo.DiskPercent > 0 {
if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", sysInfo.DiskPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] SSD disk: CRITICAL (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskCritPercent)
}
} else if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", sysInfo.DiskPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] SSD disk: WARN (%.0f%% >= %d%%)", sysInfo.DiskPercent, cfg.Monitoring.Thresholds.DiskWarnPercent)
}
} else {
report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", sysInfo.DiskPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] SSD disk: OK (%.0f%%)", sysInfo.DiskPercent)
}
}
}
// HDD disk usage
if sysInfo.HDDConfigured && sysInfo.HDDPercent > 0 {
if sysInfo.HDDPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
report.Issues = append(report.Issues, fmt.Sprintf("HDD disk usage critical: %.0f%%", sysInfo.HDDPercent))
} else if sysInfo.HDDPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
report.Warnings = append(report.Warnings, fmt.Sprintf("HDD disk usage high: %.0f%%", sysInfo.HDDPercent))
}
}
// 2. Memory usage
if sysInfo.MemPercent > 0 {
if sysInfo.MemPercent >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent) {
report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", sysInfo.MemPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] Memory: WARN (%.0f%% >= %d%%)", sysInfo.MemPercent, cfg.Monitoring.Thresholds.MemoryWarnPercent)
}
} else {
report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", sysInfo.MemPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] Memory: OK (%.0f%%)", sysInfo.MemPercent)
}
}
}
// 3. CPU usage
if sysInfo.CPUPercent > 0 {
if sysInfo.CPUPercent >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent) {
report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", sysInfo.CPUPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] CPU: WARN (%.0f%% >= %d%%)", sysInfo.CPUPercent, cfg.Monitoring.Thresholds.CPUWarnPercent)
}
} else {
report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", sysInfo.CPUPercent))
if debug {
logger.Printf("[DEBUG] [HEALTH] CPU: OK (%.0f%%)", sysInfo.CPUPercent)
}
}
}
// 4. Temperature
if sysInfo.TemperatureCelsius > 0 {
if sysInfo.TemperatureCelsius >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius) {
report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", sysInfo.TemperatureCelsius, sysInfo.TemperatureSource))
if debug {
logger.Printf("[DEBUG] [HEALTH] Temperature: WARN (%.0f°C >= %d°C)", sysInfo.TemperatureCelsius, cfg.Monitoring.Thresholds.TemperatureWarnCelsius)
}
} else {
report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", sysInfo.TemperatureCelsius))
if debug {
logger.Printf("[DEBUG] [HEALTH] Temperature: OK (%.0f°C)", sysInfo.TemperatureCelsius)
}
}
}
// 5. Docker health
if err := checkDocker(); err != nil {
report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", err))
if debug {
logger.Printf("[DEBUG] [HEALTH] Docker daemon: FAIL (%v)", err)
}
} else {
report.Info = append(report.Info, "Docker: reachable")
if debug {
logger.Printf("[DEBUG] [HEALTH] Docker daemon: OK")
}
}
// 6. Protected containers
if debug {
logger.Printf("[DEBUG] [HEALTH] Checking %d protected containers: %v", len(cfg.Stacks.Protected), cfg.Stacks.Protected)
}
missingProtected := checkProtectedContainers(cfg.Stacks.Protected)
for _, name := range missingProtected {
report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name))
}
if debug {
if len(missingProtected) > 0 {
logger.Printf("[DEBUG] [HEALTH] Protected containers missing: %v", missingProtected)
} else {
logger.Printf("[DEBUG] [HEALTH] All protected containers running")
}
}
// 7. Storage paths
storageIssues, storageWarnings := checkStoragePaths(storagePaths)
report.Issues = append(report.Issues, storageIssues...)
report.Warnings = append(report.Warnings, storageWarnings...)
// Determine status
if len(report.Issues) > 0 {
report.Status = "fail"
} else if len(report.Warnings) > 0 {
report.Status = "warn"
}
if debug {
logger.Printf("[DEBUG] [HEALTH] Final status: %s (issues=%d, warnings=%d, info=%d)",
report.Status, len(report.Issues), len(report.Warnings), len(report.Info))
}
return report
}
// FormatMessage returns a human-readable summary for healthcheck ping body.
func (r *HealthReport) FormatMessage() string {
var sb strings.Builder
sb.WriteString(fmt.Sprintf("Status: %s\n", strings.ToUpper(r.Status)))
sb.WriteString(fmt.Sprintf("Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05")))
if len(r.Issues) > 0 {
sb.WriteString("ISSUES:\n")
for _, issue := range r.Issues {
sb.WriteString(" - " + issue + "\n")
}
sb.WriteString("\n")
}
if len(r.Warnings) > 0 {
sb.WriteString("WARNINGS:\n")
for _, w := range r.Warnings {
sb.WriteString(" - " + w + "\n")
}
sb.WriteString("\n")
}
if len(r.Info) > 0 {
sb.WriteString("INFO:\n")
for _, info := range r.Info {
sb.WriteString(" - " + info + "\n")
}
}
return sb.String()
}
func checkDocker() error {
cmd := exec.Command("docker", "info", "--format", "{{.ServerVersion}}")
out, err := cmd.Output()
if err != nil {
return fmt.Errorf("docker not reachable: %v", err)
}
if len(strings.TrimSpace(string(out))) == 0 {
return fmt.Errorf("docker returned empty version")
}
return nil
}
func checkProtectedContainers(protected []string) []string {
var missing []string
for _, name := range protected {
cmd := exec.Command("docker", "inspect", "--format", "{{.State.Running}}", name)
out, err := cmd.Output()
if err != nil {
missing = append(missing, name)
continue
}
if strings.TrimSpace(string(out)) != "true" {
missing = append(missing, name)
}
}
return missing
}
func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) {
for _, sp := range paths {
// Skip decommissioned paths — no longer in active use
if sp.Decommissioned {
continue
}
// Skip disconnected paths — handled by the storage watchdog
if sp.Disconnected {
warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", sp.Label, sp.Path))
continue
}
// Path accessible?
if _, err := os.Stat(sp.Path); err != nil {
warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", sp.Path))
continue
}
// Mount point check — warning, not issue (avoids false FAIL on demo/test environments)
if !system.IsMountPoint(sp.Path) {
warnings = append(warnings, fmt.Sprintf(
"Az adattároló (%s) nem külön meghajtón van — az adatok a rendszermeghajtóra íródnak", sp.Path))
}
// Disk usage
if di := system.GetDiskUsage(sp.Path); di != nil {
if di.UsedPercent >= 95 {
issues = append(issues, fmt.Sprintf("Adattároló majdnem megtelt: %s (%.0f%%)", sp.Path, di.UsedPercent))
} else if di.UsedPercent >= 90 {
warnings = append(warnings, fmt.Sprintf("Adattároló használat magas: %s (%.0f%%)", sp.Path, di.UsedPercent))
}
}
}
return
}