feat: storage watchdog — USB disconnect detection, auto-stop, safe eject, auto-reconnect (v0.17.0)

New storage watchdog monitors registered storage paths every 5s. On disconnect
(3 consecutive probe failures), auto-stops affected apps, lazy-unmounts stale
VFS entries, fires alerts/notifications/hub report. On reconnect (UUID detected),
auto-remounts via fstab, cleans stale restic locks, offers app restart.

Safe disconnect UI for USB drives: confirmation dialog, stop apps, sync, unmount.
Disconnected state visible across all pages (dashboard, settings, backups, monitoring)
with hatched red bars and badges. Backup guards skip disconnected drives.

22 files changed (1 new: monitor/watchdog.go), ~1500 lines added.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 19:42:26 +01:00
parent 276be5a88e
commit bdbe170a54
22 changed files with 1537 additions and 57 deletions
@@ -172,6 +172,12 @@ func checkProtectedContainers(protected []string) []string {
func checkStoragePaths(paths []settings.StoragePath) (issues, warnings []string) {
for _, sp := range paths {
// Skip disconnected paths — handled by the storage watchdog
if sp.Disconnected {
warnings = append(warnings, fmt.Sprintf("Meghajtó leválasztva: %s (%s)", sp.Label, sp.Path))
continue
}
// Path accessible?
if _, err := os.Stat(sp.Path); err != nil {
warnings = append(warnings, fmt.Sprintf("Adattároló nem elérhető: %s", sp.Path))
+612
View File
@@ -0,0 +1,612 @@
package monitor
import (
"context"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"time"
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
)
// Probe cadence and failure tolerance.
const (
	// probeThreshold is how many probes in a row must fail before a drive
	// is declared disconnected.
	probeThreshold = 3

	// defaultProbeInterval is the minimum spacing between probes of a drive
	// that is currently considered connected.
	defaultProbeInterval = 5 * time.Second

	// disconnectedProbeInterval is the (slower) minimum spacing between
	// reconnect checks of a disconnected drive. Those checks look for the
	// drive's UUID reappearing rather than performing I/O on the mount.
	disconnectedProbeInterval = 30 * time.Second
)

// Fixed locations used to reach host resources and the backup repository.
const (
	// hostFstabPath is where the host's fstab is mounted inside the container.
	hostFstabPath = "/host-fstab"

	// hostDevUUIDPath is where the host's /dev/disk/by-uuid is accessible.
	hostDevUUIDPath = "/host-dev/disk/by-uuid"

	// primaryResticSubpath is the primary restic repository location,
	// relative to the root of a storage drive.
	primaryResticSubpath = "backups/primary/restic"
)
// WatchdogStackInfo is the minimal description of a deployed stack that the
// storage watchdog needs. It is the element type returned by
// WatchdogStackProvider.ListDeployedStacks.
type WatchdogStackInfo struct {
// Name identifies the stack; the watchdog passes it back to the provider
// when querying the stack's HDD path or stopping/starting it.
Name string
}
// WatchdogStackProvider provides stack operations needed by the watchdog.
// Defined here to avoid circular imports with the backup package.
type WatchdogStackProvider interface {
// ListDeployedStacks returns the currently deployed stacks.
ListDeployedStacks() []WatchdogStackInfo
// GetStackHDDPath returns the HDD path configured for the named stack.
// The watchdog treats an empty string as "this stack uses no drive".
GetStackHDDPath(name string) string
// StopStack stops the named stack.
StopStack(name string) error
// StartStack starts the named stack.
StartStack(name string) error
}
// pathProbeState tracks in-memory probe state for a single storage path.
// It is not persisted; the persisted connected/disconnected flag lives in
// settings.
type pathProbeState struct {
// consecutiveFailures counts failed probes in a row. It is reset to zero
// on a successful probe and on every connect/disconnect transition.
consecutiveFailures int
// lastStatus is either "connected" or "disconnected".
lastStatus string
// lastProbeTime is when Check last let a probe through for this path;
// it is compared against probeInterval to rate-limit probing.
lastProbeTime time.Time
// probeInterval is the minimum time between probes. It is
// defaultProbeInterval for a new or connected path and
// disconnectedProbeInterval once a disconnect has been handled.
probeInterval time.Duration
}
// StorageWatchdog monitors registered storage paths and reacts to disconnection/reconnection.
//
// Create one with NewStorageWatchdog, then wire the optional callbacks with
// SetAlertRefresh, SetHubReportPusher and SetRepoUnlocker.
type StorageWatchdog struct {
// settings supplies the registered storage paths and persists the
// disconnected flag and the list of auto-stopped stacks.
settings *settings.Settings
// stackProvider lists, stops and starts stacks. stopAffectedStacks and
// filterStoppedStacks tolerate a nil provider.
stackProvider WatchdogStackProvider
// notifier sends disconnect/reconnect notifications.
notifier *notify.Notifier
// cfg is consulted to decide whether a stack is protected from auto-stop.
cfg *config.Config
// logger receives all watchdog log output.
logger *log.Logger
// Callbacks to break import cycles — set via SetXxx methods after construction.
// Each one is optional: call sites check for nil before invoking.
alertRefresh func()
pushHubReport func()
unlockRepo func(ctx context.Context, repoPath string) error
// mu guards the pathState map itself (see getOrCreateState). The fields
// of the *pathProbeState values are modified by callers without holding mu.
mu sync.Mutex
// pathState maps a storage path to its in-memory probe state.
pathState map[string]*pathProbeState
}
// NewStorageWatchdog returns a watchdog wired to the given collaborators and
// with an empty per-path probe-state table.
//
// The optional callbacks (alert refresh, hub report push, repo unlock) are
// left unset; install them afterwards with the SetXxx methods.
func NewStorageWatchdog(
	sett *settings.Settings,
	stackProvider WatchdogStackProvider,
	notifier *notify.Notifier,
	cfg *config.Config,
	logger *log.Logger,
) *StorageWatchdog {
	wd := new(StorageWatchdog)
	wd.settings = sett
	wd.stackProvider = stackProvider
	wd.notifier = notifier
	wd.cfg = cfg
	wd.logger = logger
	wd.pathState = map[string]*pathProbeState{}
	return wd
}
// SetAlertRefresh sets the callback to trigger alert refresh.
// The callback is invoked synchronously after a drive is marked disconnected
// or reconnected. Passing nil disables it, since call sites check for nil.
func (w *StorageWatchdog) SetAlertRefresh(fn func()) {
w.alertRefresh = fn
}
// SetHubReportPusher sets the callback to push an immediate hub report.
// The callback is launched in its own goroutine after a disconnect or
// reconnect has been handled. Passing nil disables it, since call sites
// check for nil.
func (w *StorageWatchdog) SetHubReportPusher(fn func()) {
w.pushHubReport = fn
}
// SetRepoUnlocker sets the callback to unlock a restic repo on reconnect.
// It is called with the repository path when lock files are found after a
// successful remount. Passing nil disables unlocking, since the call site
// checks for nil.
func (w *StorageWatchdog) SetRepoUnlocker(fn func(ctx context.Context, repoPath string) error) {
w.unlockRepo = fn
}
// Check walks every registered storage path and, subject to a per-path rate
// limit, either probes it (when settings say it is connected) or looks for
// signs that it has been plugged back in (when settings say it is
// disconnected).
//
// Called by the scheduler every 5 seconds. It returns ctx.Err() if the
// context is cancelled between paths, and nil otherwise; individual probe
// failures are handled internally and never surface as an error.
//
// NOTE(review): lastProbeTime is stamped after the call has started, so if
// the scheduler period equals probeInterval a tick can land marginally inside
// the window and be skipped — confirm against the scheduler's timing.
func (w *StorageWatchdog) Check(ctx context.Context) error {
	for _, sp := range w.settings.GetStoragePaths() {
		// Stop early between paths once the caller has given up.
		if err := ctx.Err(); err != nil {
			return err
		}

		state := w.getOrCreateState(sp.Path)

		// Per-path rate limit: skip until the path's own interval has elapsed.
		if elapsed := time.Since(state.lastProbeTime); elapsed < state.probeInterval {
			continue
		}
		state.lastProbeTime = time.Now()

		switch {
		case sp.Disconnected:
			w.handleReconnectCheck(ctx, sp)
		default:
			w.handleConnectedProbe(sp, state)
		}
	}
	return nil
}
// getOrCreateState looks up the in-memory probe state for path, inserting a
// fresh entry (status "connected", default probe interval) the first time a
// path is seen. The map access is serialized by w.mu; the returned pointer is
// shared and is used by callers after the lock has been released.
func (w *StorageWatchdog) getOrCreateState(path string) *pathProbeState {
	w.mu.Lock()
	defer w.mu.Unlock()

	state, known := w.pathState[path]
	if !known {
		state = &pathProbeState{
			probeInterval: defaultProbeInterval,
			lastStatus:    "connected",
		}
		w.pathState[path] = state
	}
	return state
}
// handleConnectedProbe probes a drive that settings consider connected.
//
// A successful probe clears the failure counter. A failed probe increments
// it, and once probeThreshold consecutive failures have accumulated the
// drive is handed to handleDisconnect together with the last probe result.
func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *pathProbeState) {
	probe := system.ProbeStoragePath(sp.Path)

	if probe.Status != system.ProbeConnected {
		state.consecutiveFailures++
		w.logger.Printf("[WARN] [STORAGE] Probe failed for %s (%d/%d): %v",
			sp.Path, state.consecutiveFailures, probeThreshold, probe.Err)
		if state.consecutiveFailures >= probeThreshold {
			w.handleDisconnect(sp, state, probe)
		}
		return
	}

	// Healthy again: mention it only if we had been counting failures.
	if prior := state.consecutiveFailures; prior > 0 {
		w.logger.Printf("[DEBUG] [STORAGE] Probe recovered for %s after %d failures", sp.Path, prior)
	}
	state.consecutiveFailures = 0
	state.lastStatus = "connected"
}
// handleDisconnect carries out the reaction to a confirmed disconnection.
//
// In order: stop the stacks living on the drive, persist the disconnected
// flag together with the stacks that were stopped, lazily unmount the mount
// point when the final probe timed out, switch the in-memory state to the
// slower reconnect-check cadence, refresh alerts, send a notification and
// push a hub report in the background. A failure to persist is logged and
// does not abort the remaining steps.
func (w *StorageWatchdog) handleDisconnect(sp settings.StoragePath, state *pathProbeState, probe system.ProbeResult) {
	// Fall back to the path when the drive has no human-readable label.
	label := sp.Path
	if sp.Label != "" {
		label = sp.Label
	}
	w.logger.Printf("[ERROR] [STORAGE] Drive disconnected: %s (%s)", sp.Path, label)

	// Stop dependants first so the persisted record lists exactly what was stopped.
	stoppedStacks := w.stopAffectedStacks(sp.Path)

	// Persist the new status (written to settings.json).
	err := w.settings.SetDisconnected(sp.Path, true, stoppedStacks)
	if err != nil {
		w.logger.Printf("[ERROR] [STORAGE] Failed to mark disconnected: %v", err)
	}

	// A probe timeout suggests the mount is hanging, so detach it lazily.
	timedOut := probe.Status == system.ProbeTimeout
	if timedOut {
		w.lazyUnmount(sp.Path)
	}

	// From here on the path is polled for reconnection at the slower rate.
	state.lastStatus = "disconnected"
	state.probeInterval = disconnectedProbeInterval
	state.consecutiveFailures = 0

	if refresh := w.alertRefresh; refresh != nil {
		refresh()
	}

	w.notifier.NotifyStorageDisconnected(label, stoppedStacks)

	// The hub report runs in the background so it cannot delay the watchdog.
	if w.pushHubReport != nil {
		go w.pushHubReport()
	}
}
// handleReconnectCheck looks for a disconnected drive coming back and, if it
// has, brings it online again.
//
// Detection is by UUID: the fstab entry for the path (or for its raw mount,
// for attach-wizard drives) is resolved to a UUID, and the drive counts as
// present when that UUID exists under hostDevUUIDPath. When there is no UUID
// in fstab, or the device is absent, the function returns silently.
//
// On detection it remounts, verifies with a probe, cleans restic locks,
// clears the disconnected flag (keeping the still-deployed stopped stacks for
// the restart UI), resets the in-memory state, refreshes alerts, notifies and
// pushes a hub report. A failed remount or verification is logged and
// retried on a later cycle.
func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.StoragePath) {
	// For attach-wizard drives, the UUID is on the raw mount, not the bind mount.
	rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, sp.Path)
	fstabTarget := sp.Path
	if isAttachWizard {
		fstabTarget = rawPath
	}

	uuid := system.ParseFstabUUID(hostFstabPath, fstabTarget)
	if uuid == "" {
		// Without a UUID in fstab there is nothing to watch for.
		return
	}

	// The block device node appears under by-uuid once the drive is attached.
	if _, statErr := os.Stat(filepath.Join(hostDevUUIDPath, uuid)); statErr != nil {
		return
	}

	label := sp.Path
	if sp.Label != "" {
		label = sp.Label
	}
	w.logger.Printf("[INFO] [STORAGE] Drive reconnected (UUID found), attempting remount: %s (%s)", sp.Path, label)

	if mountErr := w.remount(sp.Path, rawPath, isAttachWizard); mountErr != nil {
		w.logger.Printf("[ERROR] [STORAGE] Remount failed for %s: %v", sp.Path, mountErr)
		return // retried on the next cycle
	}

	// Only trust the mount once a probe confirms it is usable.
	if verify := system.ProbeStoragePath(sp.Path); verify.Status != system.ProbeConnected {
		w.logger.Printf("[ERROR] [STORAGE] Post-remount probe failed for %s: %v", sp.Path, verify.Err)
		return
	}
	w.logger.Printf("[INFO] [STORAGE] Drive successfully remounted: %s (%s)", sp.Path, label)

	w.cleanResticLocks(ctx, sp.Path)

	// Keep only stacks that are still deployed; they feed the restart UI.
	remaining := w.filterStoppedStacks(sp.StoppedStacks)
	if setErr := w.settings.SetDisconnected(sp.Path, false, remaining); setErr != nil {
		w.logger.Printf("[ERROR] [STORAGE] Failed to clear disconnected: %v", setErr)
	}

	// Back to normal probing cadence.
	st := w.getOrCreateState(sp.Path)
	st.lastStatus = "connected"
	st.probeInterval = defaultProbeInterval
	st.consecutiveFailures = 0

	if refresh := w.alertRefresh; refresh != nil {
		refresh()
	}

	w.notifier.NotifyStorageReconnected(label)

	if w.pushHubReport != nil {
		go w.pushHubReport()
	}
}
// stopAffectedStacks stops every deployed stack whose HDD path is the given
// drive path or lies underneath it, and returns the names of the stacks that
// were stopped successfully.
//
// Stacks without an HDD path, stacks on other drives and protected stacks
// are left alone. A stack whose stop call fails is logged and omitted from
// the result. With no stack provider configured the result is nil.
func (w *StorageWatchdog) stopAffectedStacks(drivePath string) []string {
	if w.stackProvider == nil {
		return nil
	}

	driveRoot := filepath.Clean(drivePath)
	var stopped []string

	for _, stack := range w.stackProvider.ListDeployedStacks() {
		name := stack.Name

		hddPath := w.stackProvider.GetStackHDDPath(name)
		if hddPath == "" {
			continue
		}

		// Match the drive itself or anything beneath it; the trailing slash
		// keeps sibling directories sharing a name prefix from matching.
		stackDir := filepath.Clean(hddPath)
		onDrive := stackDir == driveRoot || strings.HasPrefix(stackDir, driveRoot+"/")
		if !onDrive {
			continue
		}

		if w.cfg.IsProtectedStack(name) {
			w.logger.Printf("[WARN] [STORAGE] Skipping protected stack: %s", name)
			continue
		}

		w.logger.Printf("[INFO] [STORAGE] Stopping stack %s (drive disconnected: %s)", name, drivePath)
		if err := w.stackProvider.StopStack(name); err != nil {
			// Not recorded as stopped, so it will not be offered for restart.
			w.logger.Printf("[ERROR] [STORAGE] Failed to stop stack %s: %v", name, err)
			continue
		}
		stopped = append(stopped, name)
	}

	if n := len(stopped); n > 0 {
		w.logger.Printf("[INFO] [STORAGE] Stopped %d stack(s) due to drive disconnect: %v", n, stopped)
	}
	return stopped
}
// lazyUnmount runs `umount -l` on path and, for attach-wizard drives, on the
// underlying raw mount afterwards (bind first, then raw). Failures are
// logged as warnings and do not stop the second unmount from being tried.
func (w *StorageWatchdog) lazyUnmount(path string) {
	// Resolve the raw mount up front, before touching any mount.
	rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, path)

	out, err := exec.Command("umount", "-l", path).CombinedOutput()
	if err == nil {
		w.logger.Printf("[INFO] [STORAGE] Lazy unmounted: %s", path)
	} else {
		w.logger.Printf("[WARN] [STORAGE] umount -l %s: %v (%s)", path, err, strings.TrimSpace(string(out)))
	}

	if !isAttachWizard || rawPath == "" {
		return
	}

	out, err = exec.Command("umount", "-l", rawPath).CombinedOutput()
	if err == nil {
		w.logger.Printf("[INFO] [STORAGE] Lazy unmounted raw: %s", rawPath)
	} else {
		w.logger.Printf("[WARN] [STORAGE] umount -l %s: %v (%s)", rawPath, err, strings.TrimSpace(string(out)))
	}
}
// remount mounts a storage path again using the entries in hostFstabPath.
//
// Any existing mount at path (and at rawPath for attach-wizard drives) is
// lazily unmounted first on a best-effort basis. Attach-wizard drives are
// then mounted raw-first, bind-second; other drives get a single mount. The
// first mount command that fails aborts the sequence and is returned as an
// error that includes the command's trimmed output.
func (w *StorageWatchdog) remount(path, rawPath string, isAttachWizard bool) error {
	viaRaw := isAttachWizard && rawPath != ""

	// Best effort: errors are expected when nothing is mounted there.
	_ = exec.Command("umount", "-l", path).Run()
	if viaRaw {
		_ = exec.Command("umount", "-l", rawPath).Run()
	}

	if !viaRaw {
		out, err := exec.Command("mount", "-T", hostFstabPath, path).CombinedOutput()
		if err != nil {
			return fmt.Errorf("mount %s: %v (%s)", path, err, strings.TrimSpace(string(out)))
		}
		w.logger.Printf("[INFO] [STORAGE] Mounted: %s", path)
		return nil
	}

	// The bind mount sits on top of the raw mount, so raw has to come first.
	out, err := exec.Command("mount", "-T", hostFstabPath, rawPath).CombinedOutput()
	if err != nil {
		return fmt.Errorf("mount raw %s: %v (%s)", rawPath, err, strings.TrimSpace(string(out)))
	}
	w.logger.Printf("[INFO] [STORAGE] Mounted raw: %s", rawPath)

	out, err = exec.Command("mount", "-T", hostFstabPath, path).CombinedOutput()
	if err != nil {
		return fmt.Errorf("mount bind %s: %v (%s)", path, err, strings.TrimSpace(string(out)))
	}
	w.logger.Printf("[INFO] [STORAGE] Mounted bind: %s", path)
	return nil
}
// cleanResticLocks asks the configured unlock callback to unlock the primary
// restic repository on drivePath when its "locks" directory contains
// entries. It does nothing when that directory is unreadable or empty, and
// only logs when entries exist but no unlock callback has been set. An
// unlock failure is logged as a warning.
func (w *StorageWatchdog) cleanResticLocks(ctx context.Context, drivePath string) {
	repoPath := filepath.Join(drivePath, primaryResticSubpath)

	lockFiles, readErr := os.ReadDir(filepath.Join(repoPath, "locks"))
	if readErr != nil || len(lockFiles) == 0 {
		return // no locks directory, or nothing in it
	}
	w.logger.Printf("[INFO] [STORAGE] Found %d restic lock file(s) in %s, running unlock", len(lockFiles), repoPath)

	if w.unlockRepo == nil {
		return
	}
	if err := w.unlockRepo(ctx, repoPath); err != nil {
		w.logger.Printf("[WARN] [STORAGE] Restic unlock failed for %s: %v", repoPath, err)
	}
}
// filterStoppedStacks returns the subset of stackNames that is still present
// among the deployed stacks, preserving input order. It checks deployment
// only, not whether a stack is currently running. The result is nil when
// the input is empty, when no stack provider is configured, or when nothing
// matches.
func (w *StorageWatchdog) filterStoppedStacks(stackNames []string) []string {
	if len(stackNames) == 0 || w.stackProvider == nil {
		return nil
	}

	// Build a set of deployed names for constant-time membership checks.
	stillDeployed := map[string]struct{}{}
	for _, info := range w.stackProvider.ListDeployedStacks() {
		stillDeployed[info.Name] = struct{}{}
	}

	var kept []string
	for _, candidate := range stackNames {
		if _, ok := stillDeployed[candidate]; !ok {
			continue
		}
		kept = append(kept, candidate)
	}
	return kept
}
// SafeDisconnect prepares a storage path for physical removal: it stops the
// stacks that live on the drive, flushes filesystem buffers, unmounts the
// path (and the raw mount for attach-wizard drives), and records the drive
// as disconnected.
//
// It returns an error when path is not a registered storage path, when the
// drive is already disconnected, or when neither a regular nor a lazy
// unmount of path succeeds. In that last case the stacks that were already
// stopped are still returned and settings are left unchanged. A failed raw
// unmount or a failure to persist the flag is only logged.
//
// ctx is accepted but not currently consulted by this method.
func (w *StorageWatchdog) SafeDisconnect(ctx context.Context, path string) (stoppedStacks []string, err error) {
	sp := w.findStoragePath(path)
	switch {
	case sp == nil:
		return nil, fmt.Errorf("storage path %q not found", path)
	case sp.Disconnected:
		return nil, fmt.Errorf("drive already disconnected")
	}

	label := sp.Path
	if sp.Label != "" {
		label = sp.Label
	}
	w.logger.Printf("[INFO] [STORAGE] Safe disconnect requested: %s (%s)", path, label)

	// Stop dependants before the filesystem goes away underneath them.
	stoppedStacks = w.stopAffectedStacks(path)

	// Flush pending writes; best effort.
	_ = exec.Command("sync").Run()

	rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, path)

	// Bind/main mount: try a regular unmount, fall back to a lazy one.
	if _, firstErr := exec.Command("umount", path).CombinedOutput(); firstErr != nil {
		w.logger.Printf("[WARN] [STORAGE] umount %s failed, trying lazy: %v", path, firstErr)
		lazyOut, lazyErr := exec.Command("umount", "-l", path).CombinedOutput()
		if lazyErr != nil {
			return stoppedStacks, fmt.Errorf("umount %s failed: %v (%s)", path, lazyErr, strings.TrimSpace(string(lazyOut)))
		}
	}

	// Raw mount of an attach-wizard drive: same strategy, but non-fatal.
	if isAttachWizard && rawPath != "" {
		if _, firstErr := exec.Command("umount", rawPath).CombinedOutput(); firstErr != nil {
			lazyOut, lazyErr := exec.Command("umount", "-l", rawPath).CombinedOutput()
			if lazyErr != nil {
				w.logger.Printf("[WARN] [STORAGE] umount raw %s failed: %v (%s)", rawPath, lazyErr, strings.TrimSpace(string(lazyOut)))
			}
		}
	}

	if setErr := w.settings.SetDisconnected(path, true, stoppedStacks); setErr != nil {
		w.logger.Printf("[ERROR] [STORAGE] Failed to mark disconnected: %v", setErr)
	}

	// Switch the watchdog to reconnect polling for this path.
	st := w.getOrCreateState(path)
	st.lastStatus = "disconnected"
	st.probeInterval = disconnectedProbeInterval
	st.consecutiveFailures = 0

	if refresh := w.alertRefresh; refresh != nil {
		refresh()
	}

	message := fmt.Sprintf("Meghajtó biztonságosan leválasztva: %s", label)
	w.notifier.Notify("storage_safe_disconnect", "info", message, "")
	if w.pushHubReport != nil {
		go w.pushHubReport()
	}

	w.logger.Printf("[INFO] [STORAGE] Safe disconnect completed: %s — drive can be removed", path)
	return stoppedStacks, nil
}
// Reconnect remounts a storage path that is currently marked disconnected
// and returns the previously auto-stopped stacks that are still deployed, so
// the caller can offer to restart them.
//
// It fails when path is not registered, when the drive is not marked
// disconnected, when fstab names a UUID that is not present under
// hostDevUUIDPath, when mounting fails, or when the post-mount probe does
// not report the drive as connected. If fstab has no UUID for the path the
// presence check is skipped and the mount is attempted anyway.
//
// After a verified mount it cleans restic locks, clears the disconnected
// flag, resets the in-memory probe state, refreshes alerts, notifies and
// pushes a hub report. ctx is only forwarded to the restic lock cleanup.
func (w *StorageWatchdog) Reconnect(ctx context.Context, path string) (stoppedStacks []string, err error) {
	sp := w.findStoragePath(path)
	switch {
	case sp == nil:
		return nil, fmt.Errorf("storage path %q not found", path)
	case !sp.Disconnected:
		return nil, fmt.Errorf("drive is not disconnected")
	}

	label := sp.Path
	if sp.Label != "" {
		label = sp.Label
	}

	// Attach-wizard drives carry their UUID on the raw mount's fstab entry.
	rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, sp.Path)
	fstabTarget := sp.Path
	if isAttachWizard {
		fstabTarget = rawPath
	}

	// Fail fast with a helpful message when the device is plainly absent.
	if uuid := system.ParseFstabUUID(hostFstabPath, fstabTarget); uuid != "" {
		if _, statErr := os.Stat(filepath.Join(hostDevUUIDPath, uuid)); statErr != nil {
			return nil, fmt.Errorf("drive not detected (UUID %s not found) — ensure the drive is physically connected", uuid)
		}
	}

	if mountErr := w.remount(path, rawPath, isAttachWizard); mountErr != nil {
		return nil, fmt.Errorf("mount failed: %w", mountErr)
	}

	// A successful mount command is not enough; require a working probe.
	if verify := system.ProbeStoragePath(path); verify.Status != system.ProbeConnected {
		return nil, fmt.Errorf("mount appeared to succeed but probe failed: %v", verify.Err)
	}

	w.cleanResticLocks(ctx, path)

	// Keep the still-deployed stopped stacks around for the restart UI.
	remaining := w.filterStoppedStacks(sp.StoppedStacks)
	if setErr := w.settings.SetDisconnected(path, false, remaining); setErr != nil {
		w.logger.Printf("[ERROR] [STORAGE] Failed to clear disconnected: %v", setErr)
	}

	st := w.getOrCreateState(path)
	st.lastStatus = "connected"
	st.probeInterval = defaultProbeInterval
	st.consecutiveFailures = 0

	if refresh := w.alertRefresh; refresh != nil {
		refresh()
	}

	w.notifier.NotifyStorageReconnected(label)
	if w.pushHubReport != nil {
		go w.pushHubReport()
	}

	w.logger.Printf("[INFO] [STORAGE] Reconnect completed: %s", path)
	return remaining, nil
}
// RestartStoppedApps restarts apps that were auto-stopped due to a drive disconnect.
//
// It returns the names of the stacks that started and of those that failed
// to start. Both results are nil when path is not a registered storage path,
// when the drive is still marked disconnected, or when no stopped stacks are
// recorded for it.
//
// When no stack provider is configured nothing can be started: every
// recorded stack is reported as failed and the persisted list is left in
// place so a later attempt can retry. Otherwise the persisted list is
// cleared after the attempt, regardless of individual start failures.
func (w *StorageWatchdog) RestartStoppedApps(path string) (started, failed []string) {
	sp := w.findStoragePath(path)
	if sp == nil || sp.Disconnected {
		return nil, nil
	}
	stacks := w.settings.GetStoppedStacks(path)
	if len(stacks) == 0 {
		return nil, nil
	}

	// Guard against a nil provider, as stopAffectedStacks and
	// filterStoppedStacks already do: stopped stacks can be present in
	// persisted settings even when this instance has no provider, and calling
	// a method on a nil interface value would panic.
	if w.stackProvider == nil {
		w.logger.Printf("[ERROR] [STORAGE] Cannot restart stacks for %s: no stack provider configured", path)
		return nil, append(failed, stacks...)
	}

	for _, name := range stacks {
		w.logger.Printf("[INFO] [STORAGE] Starting stack %s (drive reconnected: %s)", name, path)
		if err := w.stackProvider.StartStack(name); err != nil {
			w.logger.Printf("[ERROR] [STORAGE] Failed to start stack %s: %v", name, err)
			failed = append(failed, name)
			continue
		}
		started = append(started, name)
	}

	// Clear stopped stacks list
	if err := w.settings.ClearStoppedStacks(path); err != nil {
		w.logger.Printf("[ERROR] [STORAGE] Failed to clear stopped stacks: %v", err)
	}
	return started, failed
}
// findStoragePath looks up the registered storage path whose Path equals
// path exactly (no cleaning or normalization is applied). It returns a
// pointer to a copy of the matching entry, or nil when none matches.
func (w *StorageWatchdog) findStoragePath(path string) *settings.StoragePath {
	registered := w.settings.GetStoragePaths()
	for i := range registered {
		if registered[i].Path != path {
			continue
		}
		// Hand out a copy so the caller never aliases the settings slice.
		match := registered[i]
		return &match
	}
	return nil
}