health-probes: clear stale results on start/restart, fast 10s probing until healthy
- Clear HealthProbe on StartStack/RestartStack so stale unhealthy state isn't re-applied by RefreshStatus
- Use a 10s probe interval for unhealthy/new stacks (a nil HealthProbe is probed immediately on the next tick), then switch to the normal configured interval (default 5m) once healthy
- Scheduler frequency 1m → 10s to support fast probing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -222,7 +222,7 @@ func main() {
|
|||||||
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
|
sched.Every("stack-scan", 2*time.Minute, func(ctx context.Context) error {
|
||||||
return stackMgr.ScanStacks()
|
return stackMgr.ScanStacks()
|
||||||
})
|
})
|
||||||
sched.Every("health-probes", 1*time.Minute, func(ctx context.Context) error {
|
sched.Every("health-probes", 10*time.Second, func(ctx context.Context) error {
|
||||||
return stackMgr.RunHealthProbes()
|
return stackMgr.RunHealthProbes()
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
@@ -33,10 +33,18 @@ func (m *Manager) RunHealthProbes() error {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if interval has elapsed since last probe
|
// Check if interval has elapsed since last probe.
|
||||||
|
// Fast 10s probes until healthy, then normal interval (default 5m).
|
||||||
|
// When HealthProbe is nil (just started/restarted), probe immediately.
|
||||||
interval := parseInterval(hc.Interval)
|
interval := parseInterval(hc.Interval)
|
||||||
if stack.HealthProbe != nil && time.Since(stack.HealthProbe.LastCheck) < interval {
|
if stack.HealthProbe != nil {
|
||||||
continue
|
effectiveInterval := interval
|
||||||
|
if !stack.HealthProbe.Healthy {
|
||||||
|
effectiveInterval = 10 * time.Second
|
||||||
|
}
|
||||||
|
if time.Since(stack.HealthProbe.LastCheck) < effectiveInterval {
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the main container to probe (matching stack name)
|
// Find the main container to probe (matching stack name)
|
||||||
|
|||||||
@@ -576,6 +576,15 @@ func (m *Manager) StartStack(name string) error {
|
|||||||
|
|
||||||
m.logger.Printf("[INFO] Stack %s started successfully (took %.1fs)", name, time.Since(start).Seconds())
|
m.logger.Printf("[INFO] Stack %s started successfully (took %.1fs)", name, time.Since(start).Seconds())
|
||||||
m.logPostStartStatus(name, dir, env)
|
m.logPostStartStatus(name, dir, env)
|
||||||
|
|
||||||
|
// Clear stale health probe so refreshStatus won't re-apply an old unhealthy override.
|
||||||
|
// The next health-probes tick (≤10s) will run a fresh probe.
|
||||||
|
m.mu.Lock()
|
||||||
|
if s, ok := m.stacks[name]; ok {
|
||||||
|
s.HealthProbe = nil
|
||||||
|
}
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
return m.RefreshStatus()
|
return m.RefreshStatus()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -624,6 +633,14 @@ func (m *Manager) RestartStack(name string) error {
|
|||||||
|
|
||||||
m.logger.Printf("[INFO] Stack %s restarted successfully (took %.1fs)", name, time.Since(start).Seconds())
|
m.logger.Printf("[INFO] Stack %s restarted successfully (took %.1fs)", name, time.Since(start).Seconds())
|
||||||
m.logPostStartStatus(name, dir, env)
|
m.logPostStartStatus(name, dir, env)
|
||||||
|
|
||||||
|
// Clear stale health probe so refreshStatus won't re-apply an old unhealthy override.
|
||||||
|
m.mu.Lock()
|
||||||
|
if s, ok := m.stacks[name]; ok {
|
||||||
|
s.HealthProbe = nil
|
||||||
|
}
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
return m.RefreshStatus()
|
return m.RefreshStatus()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user