slice 8C Phase B.2 + C.1/C.2: retire disk subsystem + rewire disk mgmt to agent
Retired (~12.3k LOC): internal/storage/* (scan/format/attach/migrate/safety), backup restic/crossdrive/restore_drives/disk_layout/local_infra/restore_scan/paths + restore_app, report/infra_backup*/infra_pull, setup/scanner, monitor/watchdog+pinger, web/storage_handlers+handler_restore. Surgically split backup.Manager to app-data only (DB dumps + volume tars + app restore; dropped restic + cross-drive + snapshot history). Fixed router/main/web wiring. Added agent-backed disk API (web/agent_disk_handlers.go): /api/disks list/assign/eject/format proxying agentapi; data-bearing format refusal -> HTTP 409 'operator authorization required'. report/config_pull.go keeps the setup fresh-install config download. go build + go test green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,120 +0,0 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
)
|
||||
|
||||
// Pinger reports job status to a Healthchecks.io-compatible ping endpoint.
type Pinger struct {
	// enabled gates all outgoing requests; send returns immediately when false.
	enabled bool
	// debug turns on verbose [DEBUG] logging of every ping.
	debug bool

	// baseURL is the server root that "/ping/<uuid>" is appended to.
	baseURL string
	// httpClient performs the ping requests.
	httpClient *http.Client
	// logger receives debug and warning output.
	logger *log.Logger
}
|
||||
|
||||
// NewPinger creates a new Pinger from monitoring config.
|
||||
func NewPinger(cfg *config.MonitoringConfig, logger *log.Logger) *Pinger {
|
||||
return &Pinger{
|
||||
baseURL: strings.TrimRight(cfg.HealthchecksBase, "/"),
|
||||
httpClient: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
logger: logger,
|
||||
enabled: cfg.Enabled,
|
||||
}
|
||||
}
|
||||
|
||||
// SetDebug enables or disables debug logging for the pinger.
|
||||
func (p *Pinger) SetDebug(debug bool) {
|
||||
p.debug = debug
|
||||
}
|
||||
|
||||
// Ping sends a success signal with optional diagnostic body.
|
||||
func (p *Pinger) Ping(uuid string, body string) error {
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] Ping uuid=%s body_len=%d", uuid, len(body))
|
||||
}
|
||||
return p.send(uuid, "", body)
|
||||
}
|
||||
|
||||
// Fail sends a failure signal with diagnostic body.
|
||||
func (p *Pinger) Fail(uuid string, body string) error {
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] Fail uuid=%s body=%q", uuid, body)
|
||||
}
|
||||
return p.send(uuid, "/fail", body)
|
||||
}
|
||||
|
||||
// Start sends a "job started" signal (for duration tracking).
|
||||
func (p *Pinger) Start(uuid string) error {
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] Start uuid=%s", uuid)
|
||||
}
|
||||
return p.send(uuid, "/start", "")
|
||||
}
|
||||
|
||||
func (p *Pinger) send(uuid, suffix, body string) error {
|
||||
if !p.enabled {
|
||||
return nil
|
||||
}
|
||||
|
||||
if uuid == "" || strings.HasPrefix(uuid, "CHANGEME") {
|
||||
return nil
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/ping/%s%s", p.baseURL, uuid, suffix)
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] send url=%s", url)
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for attempt := 0; attempt < 3; attempt++ {
|
||||
if attempt > 0 {
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] retry attempt=%d uuid=%s", attempt+1, uuid)
|
||||
}
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
|
||||
var bodyReader io.Reader
|
||||
if body != "" {
|
||||
bodyReader = strings.NewReader(body)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(http.MethodPost, url, bodyReader)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := p.httpClient.Do(req)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] response status=%d uuid=%s", resp.StatusCode, uuid)
|
||||
}
|
||||
|
||||
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
|
||||
if p.debug {
|
||||
p.logger.Printf("[DEBUG] [pinger] success uuid=%s", uuid)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
lastErr = fmt.Errorf("HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
p.logger.Printf("[WARN] [monitor] Health ping failed after 3 attempts (%s): %v", uuid, lastErr)
|
||||
return nil // Never let ping failures affect the caller
|
||||
}
|
||||
@@ -1,902 +0,0 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/config"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/notify"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/settings"
|
||||
"gitea.dooplex.hu/admin/felhom-controller/internal/system"
|
||||
)
|
||||
|
||||
const (
	// probeThreshold: how many probes in a row must fail before a drive is
	// declared disconnected.
	probeThreshold = 3

	// defaultProbeInterval: time between probes of a connected drive.
	defaultProbeInterval = 5 * time.Second

	// disconnectedProbeInterval: time between checks of a disconnected drive.
	// These checks look for the UUID to reappear rather than doing I/O, so
	// they run less often.
	disconnectedProbeInterval = 30 * time.Second

	// hostFstabPath: location of the host's fstab as mounted inside the
	// container.
	hostFstabPath = "/host-fstab"

	// hostDevUUIDPath: location where the host's /dev/disk/by-uuid is
	// accessible.
	hostDevUUIDPath = "/host-dev/disk/by-uuid"

	// primaryResticSubpath: path of the primary restic repo, relative to a
	// drive's root.
	primaryResticSubpath = "backups/primary/restic"
)
|
||||
|
||||
// WatchdogStackInfo is the minimal description of a stack that the watchdog
// needs: just enough to identify it when stopping or starting it.
type WatchdogStackInfo struct {
	// Name identifies the stack in WatchdogStackProvider calls.
	Name string
}
|
||||
|
||||
// WatchdogStackProvider provides stack operations needed by the watchdog.
|
||||
// Defined here to avoid circular imports with the backup package.
|
||||
type WatchdogStackProvider interface {
|
||||
ListDeployedStacks() []WatchdogStackInfo
|
||||
GetStackHDDPath(name string) string
|
||||
StopStack(name string) error
|
||||
StartStack(name string) error
|
||||
}
|
||||
|
||||
// pathProbeState is the in-memory probe bookkeeping kept for one storage path.
type pathProbeState struct {
	// mu guards every field below.
	mu sync.Mutex

	// consecutiveFailures counts failed probes since the last success.
	consecutiveFailures int
	// lastStatus is either "connected" or "disconnected".
	lastStatus string
	// lastProbeTime and probeInterval together rate-limit probing of the path.
	lastProbeTime time.Time
	probeInterval time.Duration

	// Counters below are only maintained in debug mode and feed the periodic
	// summary log line.
	probeCount      int
	probeOKCount    int
	lastSummaryTime time.Time
	totalLatency    time.Duration
}
|
||||
|
||||
// StorageWatchdog monitors registered storage paths and reacts to disconnection/reconnection.
|
||||
type StorageWatchdog struct {
|
||||
settings *settings.Settings
|
||||
stackProvider WatchdogStackProvider
|
||||
notifier *notify.Notifier
|
||||
cfg *config.Config
|
||||
logger *log.Logger
|
||||
|
||||
// Callbacks to break import cycles — set via SetXxx methods after construction
|
||||
alertRefresh func()
|
||||
pushHubReport func()
|
||||
unlockRepo func(ctx context.Context, repoPath string) error
|
||||
|
||||
mu sync.Mutex
|
||||
pathState map[string]*pathProbeState
|
||||
|
||||
// Debug simulation state
|
||||
simulatedMu sync.RWMutex
|
||||
simulatedPaths map[string]bool
|
||||
}
|
||||
|
||||
// NewStorageWatchdog creates a new storage watchdog.
|
||||
func NewStorageWatchdog(
|
||||
sett *settings.Settings,
|
||||
stackProvider WatchdogStackProvider,
|
||||
notifier *notify.Notifier,
|
||||
cfg *config.Config,
|
||||
logger *log.Logger,
|
||||
) *StorageWatchdog {
|
||||
return &StorageWatchdog{
|
||||
settings: sett,
|
||||
stackProvider: stackProvider,
|
||||
notifier: notifier,
|
||||
cfg: cfg,
|
||||
logger: logger,
|
||||
pathState: make(map[string]*pathProbeState),
|
||||
simulatedPaths: make(map[string]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// isDebug returns true if the logging level is set to "debug".
|
||||
func (w *StorageWatchdog) isDebug() bool { return w.cfg.Logging.Level == "debug" }
|
||||
|
||||
// SetAlertRefresh sets the callback to trigger alert refresh.
|
||||
func (w *StorageWatchdog) SetAlertRefresh(fn func()) {
|
||||
w.alertRefresh = fn
|
||||
}
|
||||
|
||||
// SetHubReportPusher sets the callback to push an immediate hub report.
|
||||
func (w *StorageWatchdog) SetHubReportPusher(fn func()) {
|
||||
w.pushHubReport = fn
|
||||
}
|
||||
|
||||
// SetRepoUnlocker sets the callback to unlock a restic repo on reconnect.
|
||||
func (w *StorageWatchdog) SetRepoUnlocker(fn func(ctx context.Context, repoPath string) error) {
|
||||
w.unlockRepo = fn
|
||||
}
|
||||
|
||||
// Check probes all registered storage paths and reacts to state changes.
|
||||
// Called by the scheduler every 5 seconds.
|
||||
func (w *StorageWatchdog) Check(ctx context.Context) error {
|
||||
paths := w.settings.GetStoragePaths()
|
||||
if len(paths) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, sp := range paths {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
default:
|
||||
}
|
||||
|
||||
state := w.getOrCreateState(sp.Path)
|
||||
|
||||
// Rate-limit per-path probes
|
||||
state.mu.Lock()
|
||||
if time.Since(state.lastProbeTime) < state.probeInterval {
|
||||
state.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
state.lastProbeTime = time.Now()
|
||||
state.mu.Unlock()
|
||||
|
||||
// Skip decommissioned drives entirely — no apps reference them
|
||||
if sp.Decommissioned {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip simulated-disconnected paths (handled by debug UI)
|
||||
if w.isSimulated(sp.Path) {
|
||||
continue
|
||||
}
|
||||
|
||||
if sp.Disconnected {
|
||||
w.handleReconnectCheck(ctx, sp)
|
||||
} else {
|
||||
w.handleConnectedProbe(sp, state)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// getOrCreateState returns the in-memory probe state for a path, creating if needed.
|
||||
func (w *StorageWatchdog) getOrCreateState(path string) *pathProbeState {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
if s, ok := w.pathState[path]; ok {
|
||||
return s
|
||||
}
|
||||
s := &pathProbeState{
|
||||
lastStatus: "connected",
|
||||
probeInterval: defaultProbeInterval,
|
||||
}
|
||||
w.pathState[path] = s
|
||||
return s
|
||||
}
|
||||
|
||||
// handleConnectedProbe probes a connected drive and triggers disconnect if needed.
|
||||
func (w *StorageWatchdog) handleConnectedProbe(sp settings.StoragePath, state *pathProbeState) {
|
||||
probeStart := time.Now()
|
||||
result := system.ProbeStoragePath(sp.Path)
|
||||
probeLatency := time.Since(probeStart)
|
||||
|
||||
state.mu.Lock()
|
||||
defer state.mu.Unlock()
|
||||
|
||||
if w.isDebug() {
|
||||
state.probeCount++
|
||||
state.totalLatency += probeLatency
|
||||
}
|
||||
|
||||
if result.Status == system.ProbeConnected {
|
||||
if state.consecutiveFailures > 0 {
|
||||
w.logger.Printf("[DEBUG] [storage] Probe recovered for %s after %d failures", sp.Path, state.consecutiveFailures)
|
||||
}
|
||||
state.consecutiveFailures = 0
|
||||
state.lastStatus = "connected"
|
||||
if w.isDebug() {
|
||||
state.probeOKCount++
|
||||
// Every 60 probes (~5 minutes at 5s interval): emit summary
|
||||
if state.probeCount >= 60 {
|
||||
avgLatency := state.totalLatency / time.Duration(state.probeCount)
|
||||
w.logger.Printf("[DEBUG] [storage] Storage watchdog: %s — %d/%d probes OK (last 5m, avg %dms)",
|
||||
sp.Path, state.probeOKCount, state.probeCount, avgLatency.Milliseconds())
|
||||
state.probeCount = 0
|
||||
state.probeOKCount = 0
|
||||
state.totalLatency = 0
|
||||
state.lastSummaryTime = time.Now()
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
state.consecutiveFailures++
|
||||
|
||||
// Debug: log immediately on unexpected failure (was connected, now failing)
|
||||
if w.isDebug() && state.lastStatus == "connected" {
|
||||
w.logger.Printf("[DEBUG] [storage] Storage probe failed for %s (%d/%d before disconnect): %v",
|
||||
sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
|
||||
}
|
||||
|
||||
w.logger.Printf("[WARN] [storage] Probe failed for %s (%d/%d): %v",
|
||||
sp.Path, state.consecutiveFailures, probeThreshold, result.Err)
|
||||
|
||||
if state.consecutiveFailures >= probeThreshold {
|
||||
// Release state.mu before calling handleDisconnect (which re-acquires it
|
||||
// internally). Re-acquire afterwards so the deferred Unlock stays balanced.
|
||||
// Wrap in a func to guarantee re-lock even if handleDisconnect panics.
|
||||
func() {
|
||||
state.mu.Unlock()
|
||||
defer state.mu.Lock()
|
||||
w.handleDisconnect(sp, state, result)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// handleDisconnect reacts to a confirmed drive disconnection.
|
||||
func (w *StorageWatchdog) handleDisconnect(sp settings.StoragePath, state *pathProbeState, probe system.ProbeResult) {
|
||||
label := sp.Label
|
||||
if label == "" {
|
||||
label = sp.Path
|
||||
}
|
||||
w.logger.Printf("[ERROR] [storage] Drive disconnected: %s (%s)", sp.Path, label)
|
||||
|
||||
// 1. Find and stop affected stacks
|
||||
stoppedStacks := w.stopAffectedStacks(sp.Path)
|
||||
|
||||
// 2. Mark disconnected in settings (persists to settings.json)
|
||||
if err := w.settings.SetDisconnected(sp.Path, true, stoppedStacks); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to mark disconnected: %v", err)
|
||||
}
|
||||
|
||||
// 3. Lazy unmount stale mount (if probe timed out — mount is likely hanging)
|
||||
if probe.Status == system.ProbeTimeout {
|
||||
w.lazyUnmount(sp.Path)
|
||||
}
|
||||
|
||||
// 4. Update in-memory state
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "disconnected"
|
||||
state.probeInterval = disconnectedProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// 5. Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
w.alertRefresh()
|
||||
}
|
||||
|
||||
// 6. Send notification
|
||||
w.notifier.NotifyStorageDisconnected(label, stoppedStacks)
|
||||
|
||||
// 7. Push immediate hub report
|
||||
if w.pushHubReport != nil {
|
||||
go w.pushHubReport()
|
||||
}
|
||||
}
|
||||
|
||||
// handleReconnectCheck checks if a disconnected drive has been reconnected.
|
||||
func (w *StorageWatchdog) handleReconnectCheck(ctx context.Context, sp settings.StoragePath) {
|
||||
// Find the UUID for this path from fstab
|
||||
// For attach-wizard drives, the UUID is on the raw mount, not the bind mount
|
||||
mountPath := sp.Path
|
||||
rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, sp.Path)
|
||||
if isAttachWizard {
|
||||
mountPath = rawPath
|
||||
}
|
||||
|
||||
uuid := system.ParseFstabUUID(hostFstabPath, mountPath)
|
||||
if uuid == "" {
|
||||
// No UUID in fstab — can't detect reconnection automatically
|
||||
return
|
||||
}
|
||||
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [storage] Reconnect check for %s: UUID=%s, mountPath=%s, isAttachWizard=%v",
|
||||
sp.Path, uuid, mountPath, isAttachWizard)
|
||||
}
|
||||
|
||||
// Check if the UUID block device is present
|
||||
uuidPath := filepath.Join(hostDevUUIDPath, uuid)
|
||||
if _, err := os.Stat(uuidPath); err != nil {
|
||||
return // Drive not reconnected yet
|
||||
}
|
||||
|
||||
label := sp.Label
|
||||
if label == "" {
|
||||
label = sp.Path
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] Drive reconnected (UUID found), attempting remount: %s (%s)", sp.Path, label)
|
||||
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [storage] UUID %s found at %s, mounting %s (raw=%s, attachWizard=%v)",
|
||||
uuid, uuidPath, sp.Path, rawPath, isAttachWizard)
|
||||
}
|
||||
|
||||
// Attempt remount
|
||||
if err := w.remount(sp.Path, rawPath, isAttachWizard); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Remount failed for %s: %v", sp.Path, err)
|
||||
return // Try again next cycle
|
||||
}
|
||||
|
||||
// Verify with a probe
|
||||
verifyResult := system.ProbeStoragePath(sp.Path)
|
||||
if verifyResult.Status != system.ProbeConnected {
|
||||
w.logger.Printf("[ERROR] [storage] Post-remount probe failed for %s: %v", sp.Path, verifyResult.Err)
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [storage] Post-mount verification failed for %s: status=%v, err=%v",
|
||||
sp.Path, verifyResult.Status, verifyResult.Err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if w.isDebug() {
|
||||
w.logger.Printf("[DEBUG] [storage] Post-mount verification succeeded for %s", sp.Path)
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] Drive successfully remounted: %s (%s)", sp.Path, label)
|
||||
|
||||
// Clean stale restic locks
|
||||
w.cleanResticLocks(ctx, sp.Path)
|
||||
|
||||
// Validate stopped stacks — filter to only actually stopped ones
|
||||
filteredStacks := w.filterStoppedStacks(sp.StoppedStacks)
|
||||
|
||||
// Clear disconnected but preserve StoppedStacks for the restart UI
|
||||
if err := w.settings.SetDisconnected(sp.Path, false, filteredStacks); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to clear disconnected: %v", err)
|
||||
}
|
||||
|
||||
// Update in-memory state
|
||||
state := w.getOrCreateState(sp.Path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "connected"
|
||||
state.probeInterval = defaultProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
w.alertRefresh()
|
||||
}
|
||||
|
||||
// Send notification
|
||||
w.notifier.NotifyStorageReconnected(label)
|
||||
|
||||
// Push immediate hub report
|
||||
if w.pushHubReport != nil {
|
||||
go w.pushHubReport()
|
||||
}
|
||||
}
|
||||
|
||||
// stopAffectedStacks stops all deployed stacks whose HDD_PATH matches the disconnected drive.
|
||||
func (w *StorageWatchdog) stopAffectedStacks(drivePath string) []string {
|
||||
if w.stackProvider == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var stopped []string
|
||||
cleanDrive := filepath.Clean(drivePath)
|
||||
|
||||
for _, stack := range w.stackProvider.ListDeployedStacks() {
|
||||
hddPath := w.stackProvider.GetStackHDDPath(stack.Name)
|
||||
if hddPath == "" {
|
||||
continue
|
||||
}
|
||||
cleanHDD := filepath.Clean(hddPath)
|
||||
if cleanHDD != cleanDrive && !strings.HasPrefix(cleanHDD, cleanDrive+"/") {
|
||||
continue
|
||||
}
|
||||
|
||||
// Don't stop protected stacks
|
||||
if w.cfg.IsProtectedStack(stack.Name) {
|
||||
w.logger.Printf("[WARN] [storage] Skipping protected stack: %s", stack.Name)
|
||||
continue
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] Stopping stack %s (drive disconnected: %s)", stack.Name, drivePath)
|
||||
if err := w.stackProvider.StopStack(stack.Name); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to stop stack %s: %v", stack.Name, err)
|
||||
continue // Don't add to stopped list if stop failed
|
||||
}
|
||||
stopped = append(stopped, stack.Name)
|
||||
}
|
||||
|
||||
if len(stopped) > 0 {
|
||||
w.logger.Printf("[INFO] [storage] Stopped %d stack(s) due to drive disconnect: %v", len(stopped), stopped)
|
||||
}
|
||||
return stopped
|
||||
}
|
||||
|
||||
// lazyUnmount performs a lazy unmount of a path and its raw mount (if attach-wizard).
|
||||
func (w *StorageWatchdog) lazyUnmount(path string) {
|
||||
// For attach-wizard, unmount bind first, then raw
|
||||
rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, path)
|
||||
|
||||
// Unmount the bind/main path
|
||||
cmd := exec.Command("umount", "-l", path)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
w.logger.Printf("[WARN] [storage] umount -l %s: %v (%s)", path, err, strings.TrimSpace(string(out)))
|
||||
} else {
|
||||
w.logger.Printf("[INFO] [storage] Lazy unmounted: %s", path)
|
||||
}
|
||||
|
||||
// Then unmount the raw path if it's an attach-wizard drive
|
||||
if isAttachWizard && rawPath != "" {
|
||||
cmd = exec.Command("umount", "-l", rawPath)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
w.logger.Printf("[WARN] [storage] umount -l %s: %v (%s)", rawPath, err, strings.TrimSpace(string(out)))
|
||||
} else {
|
||||
w.logger.Printf("[INFO] [storage] Lazy unmounted raw: %s", rawPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// remount attempts to remount a storage path using fstab entries.
|
||||
func (w *StorageWatchdog) remount(path, rawPath string, isAttachWizard bool) error {
|
||||
// Clean any stale mount entries first
|
||||
exec.Command("umount", "-l", path).Run()
|
||||
if isAttachWizard && rawPath != "" {
|
||||
exec.Command("umount", "-l", rawPath).Run()
|
||||
}
|
||||
|
||||
if isAttachWizard && rawPath != "" {
|
||||
// Mount raw first, then bind
|
||||
cmd := exec.Command("mount", "-T", hostFstabPath, rawPath)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("mount raw %s: %v (%s)", rawPath, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] Mounted raw: %s", rawPath)
|
||||
|
||||
cmd = exec.Command("mount", "-T", hostFstabPath, path)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("mount bind %s: %v (%s)", path, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] Mounted bind: %s", path)
|
||||
} else {
|
||||
cmd := exec.Command("mount", "-T", hostFstabPath, path)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("mount %s: %v (%s)", path, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] Mounted: %s", path)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// cleanResticLocks runs restic unlock on the primary repo for a drive path.
|
||||
func (w *StorageWatchdog) cleanResticLocks(ctx context.Context, drivePath string) {
|
||||
repoPath := filepath.Join(drivePath, primaryResticSubpath)
|
||||
locksDir := filepath.Join(repoPath, "locks")
|
||||
entries, err := os.ReadDir(locksDir)
|
||||
if err != nil || len(entries) == 0 {
|
||||
return // No locks dir or no lock files
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] Found %d restic lock file(s) in %s, running unlock", len(entries), repoPath)
|
||||
|
||||
if w.unlockRepo != nil {
|
||||
if err := w.unlockRepo(ctx, repoPath); err != nil {
|
||||
w.logger.Printf("[WARN] [storage] Restic unlock failed for %s: %v", repoPath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// filterStoppedStacks validates that stacks in the list still exist as deployed stacks.
|
||||
func (w *StorageWatchdog) filterStoppedStacks(stackNames []string) []string {
|
||||
if w.stackProvider == nil || len(stackNames) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
deployed := make(map[string]bool)
|
||||
for _, s := range w.stackProvider.ListDeployedStacks() {
|
||||
deployed[s.Name] = true
|
||||
}
|
||||
|
||||
var result []string
|
||||
for _, name := range stackNames {
|
||||
if deployed[name] {
|
||||
result = append(result, name)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// SafeDisconnect performs a safe disconnect of a storage path.
|
||||
// Stops affected apps, syncs filesystem, and unmounts the drive.
|
||||
func (w *StorageWatchdog) SafeDisconnect(ctx context.Context, path string) (stoppedStacks []string, err error) {
|
||||
sp := w.findStoragePath(path)
|
||||
if sp == nil {
|
||||
return nil, fmt.Errorf("storage path %q not found", path)
|
||||
}
|
||||
if sp.Disconnected {
|
||||
return nil, fmt.Errorf("drive already disconnected")
|
||||
}
|
||||
if sp.Decommissioned {
|
||||
return nil, fmt.Errorf("drive is decommissioned — no apps to stop")
|
||||
}
|
||||
|
||||
label := sp.Label
|
||||
if label == "" {
|
||||
label = sp.Path
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] Safe disconnect requested: %s (%s)", path, label)
|
||||
|
||||
// 1. Stop affected stacks
|
||||
stoppedStacks = w.stopAffectedStacks(path)
|
||||
|
||||
// 2. Sync filesystem
|
||||
exec.Command("sync").Run()
|
||||
|
||||
// 3. Unmount
|
||||
rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, path)
|
||||
|
||||
// Unmount bind/main
|
||||
cmd := exec.Command("umount", path)
|
||||
if out, umountErr := cmd.CombinedOutput(); umountErr != nil {
|
||||
// Try lazy unmount as fallback
|
||||
w.logger.Printf("[WARN] [storage] umount %s failed, trying lazy: %v", path, umountErr)
|
||||
cmd = exec.Command("umount", "-l", path)
|
||||
if out, umountErr = cmd.CombinedOutput(); umountErr != nil {
|
||||
return stoppedStacks, fmt.Errorf("umount %s failed: %v (%s)", path, umountErr, strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
|
||||
// Unmount raw if attach-wizard
|
||||
if isAttachWizard && rawPath != "" {
|
||||
cmd = exec.Command("umount", rawPath)
|
||||
if out, umountErr := cmd.CombinedOutput(); umountErr != nil {
|
||||
cmd = exec.Command("umount", "-l", rawPath)
|
||||
if out, umountErr = cmd.CombinedOutput(); umountErr != nil {
|
||||
w.logger.Printf("[WARN] [storage] umount raw %s failed: %v (%s)", rawPath, umountErr, strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Mark disconnected
|
||||
if setErr := w.settings.SetDisconnected(path, true, stoppedStacks); setErr != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to mark disconnected: %v", setErr)
|
||||
}
|
||||
|
||||
// 5. Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "disconnected"
|
||||
state.probeInterval = disconnectedProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// 6. Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
w.alertRefresh()
|
||||
}
|
||||
|
||||
// 7. Notify and push hub report
|
||||
w.notifier.Notify("storage_safe_disconnect", "info",
|
||||
fmt.Sprintf("Meghajtó biztonságosan leválasztva: %s", label), "")
|
||||
if w.pushHubReport != nil {
|
||||
go w.pushHubReport()
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] Safe disconnect completed: %s — drive can be removed", path)
|
||||
return stoppedStacks, nil
|
||||
}
|
||||
|
||||
// Reconnect attempts to remount a disconnected storage path.
|
||||
func (w *StorageWatchdog) Reconnect(ctx context.Context, path string) (stoppedStacks []string, err error) {
|
||||
sp := w.findStoragePath(path)
|
||||
if sp == nil {
|
||||
return nil, fmt.Errorf("storage path %q not found", path)
|
||||
}
|
||||
if !sp.Disconnected {
|
||||
return nil, fmt.Errorf("drive is not disconnected")
|
||||
}
|
||||
|
||||
label := sp.Label
|
||||
if label == "" {
|
||||
label = sp.Path
|
||||
}
|
||||
|
||||
// Check UUID availability
|
||||
mountPath := sp.Path
|
||||
rawPath, isAttachWizard := system.HasFelhomRawMount(hostFstabPath, sp.Path)
|
||||
if isAttachWizard {
|
||||
mountPath = rawPath
|
||||
}
|
||||
uuid := system.ParseFstabUUID(hostFstabPath, mountPath)
|
||||
if uuid != "" {
|
||||
uuidPath := filepath.Join(hostDevUUIDPath, uuid)
|
||||
if _, statErr := os.Stat(uuidPath); statErr != nil {
|
||||
return nil, fmt.Errorf("drive not detected (UUID %s not found) — ensure the drive is physically connected", uuid)
|
||||
}
|
||||
}
|
||||
|
||||
// Attempt remount
|
||||
if mountErr := w.remount(path, rawPath, isAttachWizard); mountErr != nil {
|
||||
return nil, fmt.Errorf("mount failed: %w", mountErr)
|
||||
}
|
||||
|
||||
// Verify
|
||||
verifyResult := system.ProbeStoragePath(path)
|
||||
if verifyResult.Status != system.ProbeConnected {
|
||||
return nil, fmt.Errorf("mount appeared to succeed but probe failed: %v", verifyResult.Err)
|
||||
}
|
||||
|
||||
// Clean restic locks
|
||||
w.cleanResticLocks(ctx, path)
|
||||
|
||||
// Validate stopped stacks
|
||||
filteredStacks := w.filterStoppedStacks(sp.StoppedStacks)
|
||||
|
||||
// Clear disconnected, preserve stopped stacks for restart UI
|
||||
if setErr := w.settings.SetDisconnected(path, false, filteredStacks); setErr != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to clear disconnected: %v", setErr)
|
||||
}
|
||||
|
||||
// Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "connected"
|
||||
state.probeInterval = defaultProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
w.alertRefresh()
|
||||
}
|
||||
|
||||
// Notify
|
||||
w.notifier.NotifyStorageReconnected(label)
|
||||
if w.pushHubReport != nil {
|
||||
go w.pushHubReport()
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] Reconnect completed: %s", path)
|
||||
return filteredStacks, nil
|
||||
}
|
||||
|
||||
// RestartStoppedApps restarts apps that were auto-stopped due to a drive disconnect.
|
||||
func (w *StorageWatchdog) RestartStoppedApps(path string) (started, failed []string) {
|
||||
sp := w.findStoragePath(path)
|
||||
if sp == nil || sp.Disconnected {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
stacks := w.settings.GetStoppedStacks(path)
|
||||
if len(stacks) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for _, name := range stacks {
|
||||
w.logger.Printf("[INFO] [storage] Starting stack %s (drive reconnected: %s)", name, path)
|
||||
if err := w.stackProvider.StartStack(name); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to start stack %s: %v", name, err)
|
||||
failed = append(failed, name)
|
||||
} else {
|
||||
started = append(started, name)
|
||||
}
|
||||
}
|
||||
|
||||
// Clear stopped stacks list
|
||||
if err := w.settings.ClearStoppedStacks(path); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] Failed to clear stopped stacks: %v", err)
|
||||
}
|
||||
|
||||
return started, failed
|
||||
}
|
||||
|
||||
// ── Debug simulation methods ─────────────────────────────────────────
|
||||
|
||||
// isSimulated returns true if the path is in simulated-disconnect state.
|
||||
func (w *StorageWatchdog) isSimulated(path string) bool {
|
||||
w.simulatedMu.RLock()
|
||||
defer w.simulatedMu.RUnlock()
|
||||
return w.simulatedPaths[path]
|
||||
}
|
||||
|
||||
// SimulateDisconnect simulates a drive disconnection without actually unmounting.
|
||||
// Runs disconnect steps 1,2,4,5,6,7 (skips step 3: lazyUnmount).
|
||||
// Returns the list of stopped stacks.
|
||||
func (w *StorageWatchdog) SimulateDisconnect(ctx context.Context, path string) ([]string, error) {
|
||||
sp := w.findStoragePath(path)
|
||||
if sp == nil {
|
||||
return nil, fmt.Errorf("storage path %q not found", path)
|
||||
}
|
||||
if sp.Disconnected {
|
||||
return nil, fmt.Errorf("drive already disconnected")
|
||||
}
|
||||
if sp.Decommissioned {
|
||||
return nil, fmt.Errorf("drive is decommissioned")
|
||||
}
|
||||
|
||||
label := sp.Label
|
||||
if label == "" {
|
||||
label = sp.Path
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] (simulation) Simulating disconnect: %s (%s)", path, label)
|
||||
|
||||
// Mark as simulated so the watchdog skips probing this path
|
||||
w.simulatedMu.Lock()
|
||||
w.simulatedPaths[path] = true
|
||||
w.simulatedMu.Unlock()
|
||||
|
||||
// Step 1: Stop affected stacks
|
||||
stoppedStacks := w.stopAffectedStacks(path)
|
||||
|
||||
// Step 2: Mark disconnected in settings
|
||||
if err := w.settings.SetDisconnected(path, true, stoppedStacks); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] (simulation) Failed to mark disconnected: %v", err)
|
||||
}
|
||||
|
||||
// Step 3: SKIPPED (no lazyUnmount — drive stays physically mounted)
|
||||
|
||||
// Step 4: Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "disconnected"
|
||||
state.probeInterval = disconnectedProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Step 5: Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
w.alertRefresh()
|
||||
}
|
||||
|
||||
// Step 6: Send notification
|
||||
w.notifier.NotifyStorageDisconnected(label, stoppedStacks)
|
||||
|
||||
// Step 7: Push hub report
|
||||
if w.pushHubReport != nil {
|
||||
go w.pushHubReport()
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] (simulation) Disconnect simulated: %s — %d stack(s) stopped", path, len(stoppedStacks))
|
||||
return stoppedStacks, nil
|
||||
}
|
||||
|
||||
// SimulateReconnect undoes a simulated disconnection.
|
||||
func (w *StorageWatchdog) SimulateReconnect(ctx context.Context, path string) error {
|
||||
if !w.isSimulated(path) {
|
||||
return fmt.Errorf("path %q is not in simulated-disconnect state", path)
|
||||
}
|
||||
|
||||
sp := w.findStoragePath(path)
|
||||
if sp == nil {
|
||||
return fmt.Errorf("storage path %q not found", path)
|
||||
}
|
||||
|
||||
label := sp.Label
|
||||
if label == "" {
|
||||
label = sp.Path
|
||||
}
|
||||
w.logger.Printf("[INFO] [storage] (simulation) Simulating reconnect: %s (%s)", path, label)
|
||||
|
||||
// Remove from simulated set
|
||||
w.simulatedMu.Lock()
|
||||
delete(w.simulatedPaths, path)
|
||||
w.simulatedMu.Unlock()
|
||||
|
||||
// Verify drive is actually still mounted (it should be since we never unmounted)
|
||||
verifyResult := system.ProbeStoragePath(path)
|
||||
if verifyResult.Status != system.ProbeConnected {
|
||||
return fmt.Errorf("drive probe failed after simulation clear: %v", verifyResult.Err)
|
||||
}
|
||||
|
||||
// Clean restic locks
|
||||
w.cleanResticLocks(ctx, path)
|
||||
|
||||
// Validate stopped stacks
|
||||
filteredStacks := w.filterStoppedStacks(sp.StoppedStacks)
|
||||
|
||||
// Clear disconnected, preserve stopped stacks for restart UI
|
||||
if err := w.settings.SetDisconnected(path, false, filteredStacks); err != nil {
|
||||
w.logger.Printf("[ERROR] [storage] (simulation) Failed to clear disconnected: %v", err)
|
||||
}
|
||||
|
||||
// Update in-memory state
|
||||
state := w.getOrCreateState(path)
|
||||
state.mu.Lock()
|
||||
state.lastStatus = "connected"
|
||||
state.probeInterval = defaultProbeInterval
|
||||
state.consecutiveFailures = 0
|
||||
state.mu.Unlock()
|
||||
|
||||
// Trigger alert refresh
|
||||
if w.alertRefresh != nil {
|
||||
w.alertRefresh()
|
||||
}
|
||||
|
||||
// Send notification
|
||||
w.notifier.NotifyStorageReconnected(label)
|
||||
if w.pushHubReport != nil {
|
||||
go w.pushHubReport()
|
||||
}
|
||||
|
||||
w.logger.Printf("[INFO] [storage] (simulation) Reconnect simulated: %s", path)
|
||||
return nil
|
||||
}
|
||||
|
||||
// PathDebugStatus is the per-path snapshot of probe state shown on the debug
// page. Field order and JSON tags define the serialized shape.
type PathDebugStatus struct {
	// Identity of the storage path, copied from settings.
	Path  string `json:"path"`
	Label string `json:"label"`

	// Status is "disconnected" or "connected", derived from the settings flag.
	Status string `json:"status"`
	// Simulated is true while the path is in the simulated-disconnect set.
	Simulated bool `json:"simulated"`
	// ProbeOK is true when the tracked state's last status is "connected".
	ProbeOK bool `json:"probe_ok"`

	// DebounceCount is the current run of consecutive probe failures;
	// DebounceMax is the probe threshold it is measured against.
	DebounceCount int `json:"debounce_count"`
	DebounceMax   int `json:"debounce_max"`

	// LastProbe is the time of the most recent probe (zero if untracked).
	LastProbe time.Time `json:"last_probe"`
	// AvgLatencyMs is total probe latency in milliseconds divided by ProbeCount.
	AvgLatencyMs float64 `json:"avg_latency_ms"`

	// ProbeCount and ProbeOKCount are the probe counters of the tracked state.
	ProbeCount   int `json:"probe_count"`
	ProbeOKCount int `json:"probe_ok_count"`
}
|
||||
|
||||
// GetDebugStatus returns per-path probe state for the debug page.
|
||||
func (w *StorageWatchdog) GetDebugStatus() []PathDebugStatus {
|
||||
paths := w.settings.GetStoragePaths()
|
||||
result := make([]PathDebugStatus, 0, len(paths))
|
||||
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
|
||||
for _, sp := range paths {
|
||||
if sp.Decommissioned {
|
||||
continue
|
||||
}
|
||||
ds := PathDebugStatus{
|
||||
Path: sp.Path,
|
||||
Label: sp.Label,
|
||||
DebounceMax: probeThreshold,
|
||||
}
|
||||
if sp.Disconnected {
|
||||
ds.Status = "disconnected"
|
||||
} else {
|
||||
ds.Status = "connected"
|
||||
}
|
||||
ds.Simulated = w.isSimulatedLocked(sp.Path)
|
||||
|
||||
if state, ok := w.pathState[sp.Path]; ok {
|
||||
state.mu.Lock()
|
||||
ds.DebounceCount = state.consecutiveFailures
|
||||
ds.LastProbe = state.lastProbeTime
|
||||
ds.ProbeOK = state.lastStatus == "connected"
|
||||
ds.ProbeCount = state.probeCount
|
||||
ds.ProbeOKCount = state.probeOKCount
|
||||
if state.probeCount > 0 {
|
||||
ds.AvgLatencyMs = float64(state.totalLatency.Milliseconds()) / float64(state.probeCount)
|
||||
}
|
||||
state.mu.Unlock()
|
||||
}
|
||||
result = append(result, ds)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// isSimulatedLocked checks simulation state without acquiring simulatedMu
|
||||
// (caller must hold w.mu or be ok with a racy read for debug display).
|
||||
func (w *StorageWatchdog) isSimulatedLocked(path string) bool {
|
||||
w.simulatedMu.RLock()
|
||||
defer w.simulatedMu.RUnlock()
|
||||
return w.simulatedPaths[path]
|
||||
}
|
||||
|
||||
// findStoragePath returns the storage path entry for a given path, or nil.
|
||||
func (w *StorageWatchdog) findStoragePath(path string) *settings.StoragePath {
|
||||
for _, sp := range w.settings.GetStoragePaths() {
|
||||
if sp.Path == path {
|
||||
return &sp
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user