refactor(idlewatcher): improve container readiness handling and health check logic

- Simplified the wakeFromHTTP and wakeFromStream methods by removing unnecessary loops and integrating direct checks for container readiness.
- Introduced a waitForReady method to streamline the waiting process for container readiness notifications.
- Enhanced the checkUpdateState method to include timeout detection for container startup.
- Added health check retries and logging for better monitoring of container state transitions.
This commit is contained in:
yusing
2025-09-05 14:36:38 +08:00
parent b43274e9e6
commit 577169d03c
5 changed files with 164 additions and 85 deletions

View File

@@ -3,7 +3,7 @@ package idlewatcher
import (
"context"
"errors"
"maps"
"math"
"strings"
"sync"
"time"
@@ -37,9 +37,11 @@ type (
}
containerState struct {
status idlewatcher.ContainerStatus
ready bool
err error
status idlewatcher.ContainerStatus
ready bool
err error
startedAt time.Time // when container started (for timeout detection)
healthTries int // number of failed health check attempts
}
Watcher struct {
@@ -55,8 +57,10 @@ type (
state atomic.Value[*containerState]
lastReset atomic.Value[time.Time]
idleTicker *time.Ticker
task *task.Task
idleTicker *time.Ticker
healthTicker *time.Ticker
readyNotifyCh chan struct{} // notifies when container becomes ready
task *task.Task
dependsOn []*dependency
}
@@ -78,15 +82,10 @@ var (
)
const (
idleWakerCheckInterval = 100 * time.Millisecond
idleWakerCheckInterval = 200 * time.Millisecond
idleWakerCheckTimeout = time.Second
)
var dummyHealthCheckConfig = &types.HealthCheckConfig{
Interval: idleWakerCheckInterval,
Timeout: idleWakerCheckTimeout,
}
var (
causeReload = gperr.New("reloaded") //nolint:errname
causeContainerDestroy = gperr.New("container destroyed") //nolint:errname
@@ -116,8 +115,10 @@ func NewWatcher(parent task.Parent, r types.Route, cfg *types.IdlewatcherConfig)
w.resetIdleTimer()
} else {
w = &Watcher{
idleTicker: time.NewTicker(cfg.IdleTimeout),
cfg: cfg,
idleTicker: time.NewTicker(cfg.IdleTimeout),
healthTicker: time.NewTicker(idleWakerCheckInterval),
readyNotifyCh: make(chan struct{}, 1), // buffered to avoid blocking
cfg: cfg,
routeHelper: routeHelper{
hc: monitor.NewMonitor(r),
},
@@ -304,6 +305,8 @@ func NewWatcher(parent task.Parent, r types.Route, cfg *types.IdlewatcherConfig)
}
w.idleTicker.Stop()
w.healthTicker.Stop()
close(w.readyNotifyCh)
w.provider.Close()
w.task.Finish(cause)
}()
@@ -373,17 +376,25 @@ func (w *Watcher) wakeDependencies(ctx context.Context) error {
return err
}
if dep.waitHealthy {
// initial health check before starting the ticker
if h, err := dep.hc.CheckHealth(); err != nil {
return err
} else if h.Healthy {
return nil
}
tick := time.NewTicker(idleWakerCheckInterval)
defer tick.Stop()
for {
select {
case <-ctx.Done():
return w.newDepError("wait_healthy", dep, context.Cause(ctx))
default:
case <-tick.C:
if h, err := dep.hc.CheckHealth(); err != nil {
return err
} else if h.Healthy {
return nil
}
time.Sleep(idleWakerCheckInterval)
}
}
}
@@ -447,7 +458,7 @@ func (w *Watcher) stopByMethod() error {
case types.ContainerStopMethodPause:
err = w.provider.ContainerPause(ctx)
case types.ContainerStopMethodStop:
err = w.provider.ContainerStop(ctx, cfg.StopSignal, int(cfg.StopTimeout.Seconds()))
err = w.provider.ContainerStop(ctx, cfg.StopSignal, int(math.Ceil(cfg.StopTimeout.Seconds())))
case types.ContainerStopMethodKill:
err = w.provider.ContainerKill(ctx, cfg.StopSignal)
default:
@@ -511,16 +522,39 @@ func (w *Watcher) watchUntilDestroy() (returnCause error) {
switch {
case e.Action.IsContainerStart(): // create / start / unpause
w.setStarting()
w.healthTicker.Reset(idleWakerCheckInterval) // start health checking
w.l.Info().Msg("awaken")
case e.Action.IsContainerStop(): // stop / kill / die
w.setNapping(idlewatcher.ContainerStatusStopped)
w.idleTicker.Stop()
w.healthTicker.Stop() // stop health checking
case e.Action.IsContainerPause(): // pause
w.setNapping(idlewatcher.ContainerStatusPaused)
w.idleTicker.Stop()
w.healthTicker.Stop() // stop health checking
default:
w.l.Debug().Stringer("action", e.Action).Msg("unexpected container action")
}
case <-w.healthTicker.C:
// Only check health if container is starting (not ready yet)
if w.running() && !w.ready() {
ready, err := w.checkUpdateState()
if err != nil {
// Health check failed with error, stop health checking
w.healthTicker.Stop()
continue
}
if ready {
// Container is now ready, notify waiting handlers
w.healthTicker.Stop()
select {
case w.readyNotifyCh <- struct{}{}:
default: // channel full, notification already pending
}
w.resetIdleTimer()
}
// If not ready yet, keep checking on next tick
}
case <-w.idleTicker.C:
w.idleTicker.Stop()
if w.running() {