bdbe170a54
New storage watchdog monitors registered storage paths every 5s. On disconnect (3 consecutive probe failures), auto-stops affected apps, lazy-unmounts stale VFS entries, fires alerts/notifications/hub report. On reconnect (UUID detected), auto-remounts via fstab, cleans stale restic locks, offers app restart. Safe disconnect UI for USB drives: confirmation dialog, stop apps, sync, unmount. Disconnected state visible across all pages (dashboard, settings, backups, monitoring) with hatched red bars and badges. Backup guards skip disconnected drives. 22 files changed (1 new: monitor/watchdog.go), ~1500 lines added. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
204 lines
6.4 KiB
Go
204 lines
6.4 KiB
Go
package monitor
|
|
|
|
import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"strings"
	"time"

	"gitea.dooplex.hu/admin/felhom-controller/internal/config"
	"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
	"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
|
|
|
|
// HealthReport is the outcome of one system health check run.
type HealthReport struct {
	// Status is the overall verdict: "ok", "warn" or "fail".
	Status string
	// Issues lists critical problems.
	Issues []string
	// Warnings lists non-critical problems.
	Warnings []string
	// Info lists purely informational items.
	Info []string
	// Timestamp records when the report was taken.
	Timestamp time.Time
}
|
|
|
|
// RunHealthCheck runs system checks and returns a diagnostic report.
|
|
func RunHealthCheck(cfg *config.Config, cpuCollector *system.CPUCollector, storagePaths []settings.StoragePath) *HealthReport {
|
|
report := &HealthReport{
|
|
Status: "ok",
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
hddPath := cfg.Paths.HDDPath
|
|
if len(storagePaths) > 0 {
|
|
hddPath = storagePaths[0].Path
|
|
}
|
|
sysInfo := system.GetInfo(hddPath, cpuCollector)
|
|
|
|
// 1. Disk usage (SSD)
|
|
if sysInfo.DiskPercent > 0 {
|
|
if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("SSD disk usage critical: %.0f%%", sysInfo.DiskPercent))
|
|
} else if sysInfo.DiskPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("SSD disk usage high: %.0f%%", sysInfo.DiskPercent))
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("SSD: %.0f%% used", sysInfo.DiskPercent))
|
|
}
|
|
}
|
|
|
|
// HDD disk usage
|
|
if sysInfo.HDDConfigured && sysInfo.HDDPercent > 0 {
|
|
if sysInfo.HDDPercent >= float64(cfg.Monitoring.Thresholds.DiskCritPercent) {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("HDD disk usage critical: %.0f%%", sysInfo.HDDPercent))
|
|
} else if sysInfo.HDDPercent >= float64(cfg.Monitoring.Thresholds.DiskWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("HDD disk usage high: %.0f%%", sysInfo.HDDPercent))
|
|
}
|
|
}
|
|
|
|
// 2. Memory usage
|
|
if sysInfo.MemPercent > 0 {
|
|
if sysInfo.MemPercent >= float64(cfg.Monitoring.Thresholds.MemoryWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("Memory usage high: %.0f%%", sysInfo.MemPercent))
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("Memory: %.0f%% used", sysInfo.MemPercent))
|
|
}
|
|
}
|
|
|
|
// 3. CPU usage
|
|
if sysInfo.CPUPercent > 0 {
|
|
if sysInfo.CPUPercent >= float64(cfg.Monitoring.Thresholds.CPUWarnPercent) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("CPU usage high: %.0f%%", sysInfo.CPUPercent))
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("CPU: %.0f%%", sysInfo.CPUPercent))
|
|
}
|
|
}
|
|
|
|
// 4. Temperature
|
|
if sysInfo.TemperatureCelsius > 0 {
|
|
if sysInfo.TemperatureCelsius >= float64(cfg.Monitoring.Thresholds.TemperatureWarnCelsius) {
|
|
report.Warnings = append(report.Warnings, fmt.Sprintf("Temperature high: %.0f°C (%s)", sysInfo.TemperatureCelsius, sysInfo.TemperatureSource))
|
|
} else {
|
|
report.Info = append(report.Info, fmt.Sprintf("Temperature: %.0f°C", sysInfo.TemperatureCelsius))
|
|
}
|
|
}
|
|
|
|
// 5. Docker health
|
|
if err := checkDocker(); err != nil {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("Docker: %v", err))
|
|
} else {
|
|
report.Info = append(report.Info, "Docker: reachable")
|
|
}
|
|
|
|
// 6. Protected containers
|
|
missingProtected := checkProtectedContainers(cfg.Stacks.Protected)
|
|
for _, name := range missingProtected {
|
|
report.Issues = append(report.Issues, fmt.Sprintf("Protected container not running: %s", name))
|
|
}
|
|
|
|
// 7. Storage paths
|
|
storageIssues, storageWarnings := checkStoragePaths(storagePaths)
|
|
report.Issues = append(report.Issues, storageIssues...)
|
|
report.Warnings = append(report.Warnings, storageWarnings...)
|
|
|
|
// Determine status
|
|
if len(report.Issues) > 0 {
|
|
report.Status = "fail"
|
|
} else if len(report.Warnings) > 0 {
|
|
report.Status = "warn"
|
|
}
|
|
|
|
return report
|
|
}
|
|
|
|
// FormatMessage returns a human-readable summary for healthcheck ping body.
|
|
func (r *HealthReport) FormatMessage() string {
|
|
var sb strings.Builder
|
|
|
|
sb.WriteString(fmt.Sprintf("Status: %s\n", strings.ToUpper(r.Status)))
|
|
sb.WriteString(fmt.Sprintf("Time: %s\n\n", r.Timestamp.Format("2006-01-02 15:04:05")))
|
|
|
|
if len(r.Issues) > 0 {
|
|
sb.WriteString("ISSUES:\n")
|
|
for _, issue := range r.Issues {
|
|
sb.WriteString(" - " + issue + "\n")
|
|
}
|
|
sb.WriteString("\n")
|
|
}
|
|
|
|
if len(r.Warnings) > 0 {
|
|
sb.WriteString("WARNINGS:\n")
|
|
for _, w := range r.Warnings {
|
|
sb.WriteString(" - " + w + "\n")
|
|
}
|
|
sb.WriteString("\n")
|
|
}
|
|
|
|
if len(r.Info) > 0 {
|
|
sb.WriteString("INFO:\n")
|
|
for _, info := range r.Info {
|
|
sb.WriteString(" - " + info + "\n")
|
|
}
|
|
}
|
|
|
|
return sb.String()
|
|
}
|
|
|
|
// checkDocker verifies that the Docker daemon answers by asking the docker
// CLI for the server version.
//
// The CLI call is bounded by a timeout so that an unresponsive daemon cannot
// block the caller indefinitely. It returns nil when a non-empty version
// string comes back. Otherwise it returns an error; a failure to run the
// command is wrapped, so the cause stays reachable through errors.Is and
// errors.As.
func checkDocker() error {
	const probeTimeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), probeTimeout)
	defer cancel()

	out, err := exec.CommandContext(ctx, "docker", "info", "--format", "{{.ServerVersion}}").Output()
	if err != nil {
		// When the deadline fired the exec error is only "signal: killed";
		// say what actually happened.
		if ctx.Err() != nil {
			return fmt.Errorf("docker not reachable: no answer within %s: %w", probeTimeout, err)
		}
		return fmt.Errorf("docker not reachable: %w", err)
	}
	if strings.TrimSpace(string(out)) == "" {
		return fmt.Errorf("docker returned empty version")
	}
	return nil
}
|
|
|
|
// checkProtectedContainers returns the subset of protected, in input order,
// that is not currently running.
//
// Each name is queried with `docker inspect`. A container counts as not
// running when the command fails (including when it does not finish within
// the per-call timeout) or when its reported running state is anything other
// than "true". The result is nil when every container is running or when
// protected is empty.
func checkProtectedContainers(protected []string) []string {
	const inspectTimeout = 10 * time.Second

	var missing []string
	for _, name := range protected {
		ctx, cancel := context.WithTimeout(context.Background(), inspectTimeout)
		out, err := exec.CommandContext(ctx, "docker", "inspect", "--format", "{{.State.Running}}", name).Output()
		// Release the timer right away; a defer would accumulate until the
		// whole loop has finished.
		cancel()

		if err != nil || strings.TrimSpace(string(out)) != "true" {
			missing = append(missing, name)
		}
	}
	return missing
}
|
|
|
|
func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) {
|
|
for _, sp := range paths {
|
|
// Skip disconnected paths — handled by the storage watchdog
|
|
if sp.Disconnected {
|
|
warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", sp.Label, sp.Path))
|
|
continue
|
|
}
|
|
|
|
// Path accessible?
|
|
if _, err := os.Stat(sp.Path); err != nil {
|
|
warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", sp.Path))
|
|
continue
|
|
}
|
|
|
|
// Mount point check — warning, not issue (avoids false FAIL on demo/test environments)
|
|
if !system.IsMountPoint(sp.Path) {
|
|
warnings = append(warnings, fmt.Sprintf(
|
|
"Az adattároló (%s) nem külön meghajtón van — az adatok a rendszermeghajtóra íródnak", sp.Path))
|
|
}
|
|
|
|
// Disk usage
|
|
if di := system.GetDiskUsage(sp.Path); di != nil {
|
|
if di.UsedPercent >= 95 {
|
|
issues = append(issues, fmt.Sprintf("Adattároló majdnem megtelt: %s (%.0f%%)", sp.Path, di.UsedPercent))
|
|
} else if di.UsedPercent >= 90 {
|
|
warnings = append(warnings, fmt.Sprintf("Adattároló használat magas: %s (%.0f%%)", sp.Path, di.UsedPercent))
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|