Files
felhom-controller/controller/internal/monitor/healthcheck.go
T
admin bdbe170a54 feat: storage watchdog — USB disconnect detection, auto-stop, safe eject, auto-reconnect (v0.17.0)
New storage watchdog monitors registered storage paths every 5s. On disconnect
(3 consecutive probe failures), auto-stops affected apps, lazy-unmounts stale
VFS entries, fires alerts/notifications/hub report. On reconnect (UUID detected),
auto-remounts via fstab, cleans stale restic locks, offers app restart.

Safe disconnect UI for USB drives: confirmation dialog, stop apps, sync, unmount.
Disconnected state visible across all pages (dashboard, settings, backups, monitoring)
with hatched red bars and badges. Backup guards skip disconnected drives.

22 files changed (1 new: monitor/watchdog.go), ~1500 lines added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 19:42:26 +01:00

204 lines
6.4 KiB
Go

package monitor
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"strings"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// HealthReport is the outcome of a single system health check run.
type HealthReport struct {
	// Status is the overall verdict: "ok", "warn" or "fail".
	Status string
	// Issues lists critical problems.
	Issues []string
	// Warnings lists non-critical problems.
	Warnings []string
	// Info lists purely informational findings.
	Info []string
	// Timestamp records when the report was created.
	Timestamp time.Time
}
// RunHealthCheck collects system metrics, Docker state and storage state and
// condenses them into a HealthReport.
//
// Metrics that read as zero (or below) are skipped entirely. The resulting
// Status is "fail" when at least one issue was recorded, "warn" when there are
// only warnings, and "ok" otherwise.
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath) *HealthReport {
	report := &HealthReport{Status: "ok", Timestamp: time.Now()}

	// Formatting helpers so every check below is one line per outcome.
	issuef := func(format string, args ...interface{}) {
		report.Issues = append(report.Issues, fmt.Sprintf(format, args...))
	}
	warnf := func(format string, args ...interface{}) {
		report.Warnings = append(report.Warnings, fmt.Sprintf(format, args...))
	}
	infof := func(format string, args ...interface{}) {
		report.Info = append(report.Info, fmt.Sprintf(format, args...))
	}

	// The first registered storage path, if any, takes precedence over the
	// HDD path from the configuration.
	hddPath := cfg.Paths.HDDPath
	if len(storagePaths) > 0 {
		hddPath = storagePaths[0].Path
	}
	sysInfo := system.GetInfo(hddPath, cpuCollector)

	// 1. Disk usage (SSD)
	if pct := sysInfo.DiskPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			issuef("SSD disk usage critical: %.0f%%", pct)
		case pct >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			warnf("SSD disk usage high: %.0f%%", pct)
		default:
			infof("SSD: %.0f%% used", pct)
		}
	}

	// HDD disk usage: only threshold violations are reported, there is no
	// informational entry for a healthy HDD.
	if sysInfo.HDDConfigured && sysInfo.HDDPercent > 0 {
		switch pct := sysInfo.HDDPercent; {
		case pct >= float64(cfg.Monitoring.Thresholds.DiskCritPercent):
			issuef("HDD disk usage critical: %.0f%%", pct)
		case pct >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent):
			warnf("HDD disk usage high: %.0f%%", pct)
		}
	}

	// 2. Memory usage
	if pct := sysInfo.MemPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent):
			warnf("Memory usage high: %.0f%%", pct)
		default:
			infof("Memory: %.0f%% used", pct)
		}
	}

	// 3. CPU usage
	if pct := sysInfo.CPUPercent; pct > 0 {
		switch {
		case pct >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent):
			warnf("CPU usage high: %.0f%%", pct)
		default:
			infof("CPU: %.0f%%", pct)
		}
	}

	// 4. Temperature
	if temp := sysInfo.TemperatureCelsius; temp > 0 {
		switch {
		case temp >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius):
			warnf("Temperature high: %.0f°C (%s)", temp, sysInfo.TemperatureSource)
		default:
			infof("Temperature: %.0f°C", temp)
		}
	}

	// 5. Docker health
	if err := checkDocker(); err != nil {
		issuef("Docker: %v", err)
	} else {
		report.Info = append(report.Info, "Docker: reachable")
	}

	// 6. Protected containers: every one that is not running is an issue.
	for _, name := range checkProtectedContainers(cfg.Stacks.Protected) {
		issuef("Protected container not running: %s", name)
	}

	// 7. Storage paths
	storageIssues, storageWarnings := checkStoragePaths(storagePaths)
	report.Issues = append(report.Issues, storageIssues...)
	report.Warnings = append(report.Warnings, storageWarnings...)

	// Overall status: issues outrank warnings; "ok" stays when neither exists.
	switch {
	case len(report.Issues) > 0:
		report.Status = "fail"
	case len(report.Warnings) > 0:
		report.Status = "warn"
	}
	return report
}
// FormatMessage renders the report as plain text for use as a healthcheck
// ping body: a status/time header followed by the ISSUES, WARNINGS and INFO
// sections. Empty sections are omitted.
func (r *HealthReport) FormatMessage() string {
	var out strings.Builder

	fmt.Fprintf(&out, "Status: %s\n", strings.ToUpper(r.Status))
	fmt.Fprintf(&out, "Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05"))

	// writeSection emits one bulleted section. blankAfter adds the separating
	// empty line that follows every section except the last one.
	writeSection := func(header string, entries []string, blankAfter bool) {
		if len(entries) == 0 {
			return
		}
		out.WriteString(header)
		for _, entry := range entries {
			out.WriteString(" - " + entry + "\n")
		}
		if blankAfter {
			out.WriteString("\n")
		}
	}

	writeSection("ISSUES:\n", r.Issues, true)
	writeSection("WARNINGS:\n", r.Warnings, true)
	writeSection("INFO:\n", r.Info, false)

	return out.String()
}
// checkDocker reports whether the Docker daemon answers `docker info` with a
// non-empty server version.
//
// It returns nil on success. On failure the returned error wraps the
// underlying cause, so callers can inspect it with errors.Is / errors.As.
func checkDocker() error {
	// Bound the CLI call: without a deadline an unresponsive daemon would
	// block the whole health check indefinitely.
	const timeout = 10 * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, "docker", "info", "--format", "{{.ServerVersion}}").Output()
	if err != nil {
		// When the deadline fired the process was killed; say so explicitly
		// instead of surfacing an opaque "signal: killed".
		if ctxErr := ctx.Err(); ctxErr != nil {
			return fmt.Errorf("docker not reachable: timed out after %s: %w", timeout, ctxErr)
		}
		return fmt.Errorf("docker not reachable: %w", err)
	}
	if strings.TrimSpace(string(out)) == "" {
		return fmt.Errorf("docker returned empty version")
	}
	return nil
}
// checkProtectedContainers returns the subset of protected container names
// that are not currently running, in input order.
//
// A container counts as not running when `docker inspect` fails for it (for
// example because it does not exist or the call timed out) or when its
// reported running state is anything other than "true". The result is nil
// when every container is running or the input is empty.
func checkProtectedContainers(protected []string) []string {
	// Deadline for each individual inspect call, so one hung CLI invocation
	// cannot stall the health check forever.
	const inspectTimeout = 10 * time.Second

	// isRunning is a closure so the context is cancelled after every
	// container rather than piling up deferred calls until the function ends.
	isRunning := func(name string) bool {
		ctx, cancel := context.WithTimeout(context.Background(), inspectTimeout)
		defer cancel()

		out, err := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Running}}", name).Output()
		if err != nil {
			return false
		}
		return strings.TrimSpace(string(out)) == "true"
	}

	var missing []string
	for _, name := range protected {
		if !isRunning(name) {
			missing = append(missing, name)
		}
	}
	return missing
}
// checkStoragePaths inspects every registered storage path and returns the
// critical findings (issues) and non-critical findings (warnings) separately.
//
// Per path, in order: a path flagged as disconnected yields a warning and is
// not examined further; a path that cannot be stat'ed yields a warning and is
// not examined further; a path that is not a mount point yields a warning;
// finally disk usage is compared against fixed thresholds.
func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) {
	// Usage levels (percent) at which a path is reported.
	const (
		criticalPercent = 95
		highPercent     = 90
	)

	for _, entry := range paths {
		// Disconnected drives are only noted here; the original note says the
		// storage watchdog is responsible for handling them.
		if entry.Disconnected {
			warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", entry.Label, entry.Path))
			continue
		}

		// An inaccessible path cannot be checked any further.
		if _, statErr := os.Stat(entry.Path); statErr != nil {
			warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", entry.Path))
			continue
		}

		// Deliberately a warning rather than an issue, to avoid a false FAIL
		// on demo/test environments.
		if !system.IsMountPoint(entry.Path) {
			warnings = append(warnings, fmt.Sprintf(
				"Az adattároló (%s) nem külön meghajtón van — az adatok a rendszermeghajtóra íródnak", entry.Path))
		}

		// Disk usage; nothing is reported when no usage data is available.
		usage := system.GetDiskUsage(entry.Path)
		if usage == nil {
			continue
		}
		switch {
		case usage.UsedPercent >= criticalPercent:
			issues = append(issues, fmt.Sprintf("Adattároló majdnem megtelt: %s (%.0f%%)", entry.Path, usage.UsedPercent))
		case usage.UsedPercent >= highPercent:
			warnings = append(warnings, fmt.Sprintf("Adattároló használat magas: %s (%.0f%%)", entry.Path, usage.UsedPercent))
		}
	}
	return issues, warnings
}