From 2e9634e50ffd72ce5dbd4af7d5397af76c8c8273 Mon Sep 17 00:00:00 2001
From: kisfenyo
Date: Wed, 25 Feb 2026 14:59:25 +0100
Subject: [PATCH] health-probes: clear stale results on start/restart, fast 10s probing until healthy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Clear HealthProbe on StartStack/RestartStack so stale unhealthy state isn't re-applied by RefreshStatus
- Use 10s probe interval for unhealthy/new stacks (nil HealthProbe probes immediately on next tick), switch to normal 5m interval once healthy
- Scheduler frequency 1m → 10s to support fast probing

Co-Authored-By: Claude Opus 4.6
---
 controller/cmd/controller/main.go         |  2 +-
 controller/internal/stacks/healthprobe.go | 14 +++++++++++---
 controller/internal/stacks/manager.go     | 17 +++++++++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/controller/cmd/controller/main.go b/controller/cmd/controller/main.go
index efe032f..97d2ade 100644
--- a/controller/cmd/controller/main.go
+++ b/controller/cmd/controller/main.go
@@ -222,7 +222,7 @@ func main() {
 	sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
 		return stackMgr.ScanStacks()
 	})
-	sched.Every("health-probes", 1*time.Minute, func(ctx context.Context) error {
+	sched.Every("health-probes", 10*time.Second, func(ctx context.Context) error {
 		return stackMgr.RunHealthProbes()
 	})
 
diff --git a/controller/internal/stacks/healthprobe.go b/controller/internal/stacks/healthprobe.go
index d6cf6dd..97a8218 100644
--- a/controller/internal/stacks/healthprobe.go
+++ b/controller/internal/stacks/healthprobe.go
@@ -33,10 +33,18 @@ func (m *Manager) RunHealthProbes() error {
 			continue
 		}
 
-		// Check if interval has elapsed since last probe
+		// Check if interval has elapsed since last probe.
+		// Fast 10s probes until healthy, then normal interval (default 5m).
+		// When HealthProbe is nil (just started/restarted), probe immediately.
 		interval := parseInterval(hc.Interval)
-		if stack.HealthProbe != nil && time.Since(stack.HealthProbe.LastCheck) < interval {
-			continue
+		if stack.HealthProbe != nil {
+			effectiveInterval := interval
+			if !stack.HealthProbe.Healthy {
+				effectiveInterval = 10 * time.Second
+			}
+			if time.Since(stack.HealthProbe.LastCheck) < effectiveInterval {
+				continue
+			}
 		}
 
 		// Find the main container to probe (matching stack name)
diff --git a/controller/internal/stacks/manager.go b/controller/internal/stacks/manager.go
index ba6cbf6..c52b54b 100644
--- a/controller/internal/stacks/manager.go
+++ b/controller/internal/stacks/manager.go
@@ -576,6 +576,15 @@ func (m *Manager) StartStack(name string) error {
 
 	m.logger.Printf("[INFO] Stack %s started successfully (took %.1fs)", name, time.Since(start).Seconds())
 	m.logPostStartStatus(name, dir, env)
+
+	// Clear stale health probe so refreshStatus won't re-apply an old unhealthy override.
+	// The next health-probes tick (≤10s) will run a fresh probe.
+	m.mu.Lock()
+	if s, ok := m.stacks[name]; ok {
+		s.HealthProbe = nil
+	}
+	m.mu.Unlock()
+
 	return m.RefreshStatus()
 }
 
@@ -624,6 +633,14 @@ func (m *Manager) RestartStack(name string) error {
 
 	m.logger.Printf("[INFO] Stack %s restarted successfully (took %.1fs)", name, time.Since(start).Seconds())
 	m.logPostStartStatus(name, dir, env)
+
+	// Clear stale health probe so refreshStatus won't re-apply an old unhealthy override.
+	m.mu.Lock()
+	if s, ok := m.stacks[name]; ok {
+		s.HealthProbe = nil
+	}
+	m.mu.Unlock()
+
 	return m.RefreshStatus()
 }
 