Files
deploy-felhom-compose/controller/internal/monitor/healthcheck.go
T
admin 99bf3ca7a8 feat: drive migration & Tier 2 restic deprecation (v0.18.0)
Phase 1: Deprecate restic as Tier 2 method (rsync only), auto-migrate on startup
Phase 2: Enhanced per-app migration with backup awareness, DB dump copy, auto-cleanup
Phase 3: Full drive migration with decommissioned state, rollback support, wizard UI
Phase 4: Hub report includes decommissioned drive state

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 21:49:14 +01:00

209 lines
6.5 KiB
Go

package monitor
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"strings"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// HealthReport is the outcome of one system health check run.
type HealthReport struct {
	// Status is the overall verdict: "ok", "warn" or "fail".
	Status string
	// Issues lists the critical problems that were found.
	Issues []string
	// Warnings lists the non-critical problems that were found.
	Warnings []string
	// Info lists purely informational items.
	Info []string
	// Timestamp records when the report was created.
	Timestamp time.Time
}
// RunHealthCheck collects system metrics, probes Docker, the protected
// containers and the storage paths, and returns the findings as a report.
//
// cfg must be non-nil; it supplies the thresholds, the list of protected
// containers and the fallback HDD path. When storagePaths is non-empty, the
// path of its first entry is used as the HDD path instead of cfg.Paths.HDDPath.
//
// The returned status is "fail" if at least one issue was recorded, "warn" if
// there are only warnings, and "ok" otherwise.
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath) *HealthReport {
	// Stamp the report before any probing so the time marks the start of the run.
	startedAt := time.Now()

	hddPath := cfg.Paths.HDDPath
	if len(storagePaths) != 0 {
		hddPath = storagePaths[0].Path
	}
	stats := system.GetInfo(hddPath, cpuCollector)

	var issues, warnings, info []string

	// 1. Disk usage (SSD). Readings that are not positive are not reported at all.
	if pct := stats.DiskPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			issues = append(issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", pct))
		case pct >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			warnings = append(warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", pct))
		default:
			info = append(info, fmt.Sprintf("SSD: %.0f%% used", pct))
		}
	}

	// HDD disk usage. Unlike the SSD, a healthy HDD produces no info line.
	if pct := stats.HDDPercent; stats.HDDConfigured && pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			issues = append(issues, fmt.Sprintf("HDD disk usage critical: %.0f%%", pct))
		case pct >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			warnings = append(warnings, fmt.Sprintf("HDD disk usage high: %.0f%%", pct))
		}
	}

	// 2. Memory usage
	if pct := stats.MemPercent; pct > 0 {
		if pct < float64(cfg.Monitoring.Thresholds.MemoryWarnPercent) {
			info = append(info, fmt.Sprintf("Memory: %.0f%% used", pct))
		} else {
			warnings = append(warnings, fmt.Sprintf("Memory usage high: %.0f%%", pct))
		}
	}

	// 3. CPU usage
	if pct := stats.CPUPercent; pct > 0 {
		if pct < float64(cfg.Monitoring.Thresholds.CPUWarnPercent) {
			info = append(info, fmt.Sprintf("CPU: %.0f%%", pct))
		} else {
			warnings = append(warnings, fmt.Sprintf("CPU usage high: %.0f%%", pct))
		}
	}

	// 4. Temperature
	if celsius := stats.TemperatureCelsius; celsius > 0 {
		if celsius < float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius) {
			info = append(info, fmt.Sprintf("Temperature: %.0f°C", celsius))
		} else {
			warnings = append(warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", celsius, stats.TemperatureSource))
		}
	}

	// 5. Docker health
	if err := checkDocker(); err == nil {
		info = append(info, "Docker: reachable")
	} else {
		issues = append(issues, fmt.Sprintf("Docker: %v", err))
	}

	// 6. Protected containers: every one that is not running is a critical issue.
	for _, container := range checkProtectedContainers(cfg.Stacks.Protected) {
		issues = append(issues, fmt.Sprintf("Protected container not running: %s", container))
	}

	// 7. Storage paths
	storageIssues, storageWarnings := checkStoragePaths(storagePaths)
	issues = append(issues, storageIssues...)
	warnings = append(warnings, storageWarnings...)

	// Issues outrank warnings when deriving the overall status.
	status := "ok"
	switch {
	case len(issues) > 0:
		status = "fail"
	case len(warnings) > 0:
		status = "warn"
	}

	return &HealthReport{
		Status:    status,
		Issues:    issues,
		Warnings:  warnings,
		Info:      info,
		Timestamp: startedAt,
	}
}
// FormatMessage renders the report as plain text for use as the body of a
// healthcheck ping.
//
// The output starts with the upper-cased status and the timestamp, followed by
// one bulleted section each for issues, warnings and info. Sections without
// entries are omitted.
func (r *HealthReport) FormatMessage() string {
	var out strings.Builder

	fmt.Fprintf(&out, "Status: %s\n", strings.ToUpper(r.Status))
	fmt.Fprintf(&out, "Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05"))

	// writeSection emits the heading and one bullet per entry, and reports
	// whether it wrote anything.
	writeSection := func(heading string, entries []string) bool {
		if len(entries) == 0 {
			return false
		}
		out.WriteString(heading)
		for _, entry := range entries {
			out.WriteString(" - " + entry + "\n")
		}
		return true
	}

	// Issues and warnings are each followed by a blank separator line; the
	// info section comes last and has none.
	if writeSection("ISSUES:\n", r.Issues) {
		out.WriteString("\n")
	}
	if writeSection("WARNINGS:\n", r.Warnings) {
		out.WriteString("\n")
	}
	writeSection("INFO:\n", r.Info)

	return out.String()
}
// checkDocker verifies that the Docker daemon answers by running
// `docker info` and requiring a non-empty server version in its output.
//
// The command runs under a timeout so that an unresponsive daemon is reported
// as a failure instead of blocking the health check indefinitely. The
// underlying error is wrapped with %w, so callers can inspect it with
// errors.Is / errors.As (for example exec.ErrNotFound when the docker binary
// is not on PATH). It returns nil when Docker is reachable.
func checkDocker() error {
	const timeout = 15 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, "docker", "info", "--format", "{{.ServerVersion}}").Output()
	if err != nil {
		// When the deadline fired the process was killed, and the raw error
		// ("signal: killed") hides the real cause, so name the timeout.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return fmt.Errorf("docker not reachable: timed out after %s: %w", timeout, ctxErr)
		}
		return fmt.Errorf("docker not reachable: %w", err)
	}
	if strings.TrimSpace(string(out)) == "" {
		return fmt.Errorf("docker returned empty version")
	}
	return nil
}
// checkProtectedContainers returns the names from protected whose container
// is not currently running, in input order. The result is nil when every
// container is running or when protected is empty.
//
// A container counts as not running when `docker inspect` fails for any
// reason (unknown container, docker unavailable, timeout) or when it reports
// a running state other than "true". Each inspect call gets its own timeout
// so a single hung call cannot stall the whole health check.
func checkProtectedContainers(protected []string) []string {
	const timeout = 10 * time.Second

	// isRunning is a closure so the context of each call is released as soon
	// as that call finishes, rather than piling up deferred cancels in the loop.
	isRunning := func(name string) bool {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		defer cancel()

		out, err := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Running}}", name).Output()
		return err == nil && strings.TrimSpace(string(out)) == "true"
	}

	var missing []string
	for _, name := range protected {
		if !isRunning(name) {
			missing = append(missing, name)
		}
	}
	return missing
}
// checkStoragePaths inspects the given storage paths and returns the critical
// issues and the non-critical warnings it found, each in path order.
//
// Decommissioned paths are ignored. Disconnected paths and paths that cannot
// be stat'ed produce a warning and are not examined further. For the
// remaining paths a warning is added when the path is not a mount point, and
// disk usage of 95% or more is an issue while 90% or more is a warning.
func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) {
	for _, storage := range paths {
		switch {
		case storage.Decommissioned:
			// No longer in active use, so there is nothing to report.
			continue
		case storage.Disconnected:
			// Only warn here; the original note says the storage watchdog
			// takes care of disconnected drives.
			warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", storage.Label, storage.Path))
			continue
		}

		// An inaccessible path is a warning, and further checks would be meaningless.
		if _, statErr := os.Stat(storage.Path); statErr != nil {
			warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", storage.Path))
			continue
		}

		// Not being a mount point is a warning rather than an issue, to avoid
		// a false FAIL on demo/test environments.
		if mounted := system.IsMountPoint(storage.Path); !mounted {
			warnings = append(warnings, fmt.Sprintf(
				"Az adattároló (%s) nem külön meghajtón van — az adatok a rendszermeghajtóra íródnak", storage.Path))
		}

		// Disk usage is only evaluated when usage data is available.
		usage := system.GetDiskUsage(storage.Path)
		if usage == nil {
			continue
		}
		switch used := usage.UsedPercent; {
		case used >= 95:
			issues = append(issues, fmt.Sprintf("Adattároló majdnem megtelt: %s (%.0f%%)", storage.Path, used))
		case used >= 90:
			warnings = append(warnings, fmt.Sprintf("Adattároló használat magas: %s (%.0f%%)", storage.Path, used))
		}
	}
	return issues, warnings
}