perf(healthcheck): stop docker client from hogging resources in health checks

This commit is contained in:
yusing
2025-10-18 19:35:32 +08:00
parent f8716d990e
commit 1ca4b4939e
4 changed files with 131 additions and 18 deletions

2
go.mod
View File

@@ -117,7 +117,7 @@ require (
github.com/sony/gobreaker v1.0.0 // indirect github.com/sony/gobreaker v1.0.0 // indirect
github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0
go.opentelemetry.io/otel v1.38.0 // indirect go.opentelemetry.io/otel v1.38.0 // indirect
go.opentelemetry.io/otel/metric v1.38.0 // indirect go.opentelemetry.io/otel/metric v1.38.0 // indirect
go.opentelemetry.io/otel/trace v1.38.0 // indirect go.opentelemetry.io/otel/trace v1.38.0 // indirect

Submodule goutils updated: d25032447f...7fe25e088e

View File

@@ -7,16 +7,20 @@ import (
"maps" "maps"
"net" "net"
"net/http" "net/http"
"reflect"
"sync" "sync"
"sync/atomic" "sync/atomic"
"time" "time"
"unsafe"
"github.com/docker/cli/cli/connhelper" "github.com/docker/cli/cli/connhelper"
"github.com/docker/docker/client" "github.com/docker/docker/client"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
"github.com/yusing/godoxy/agent/pkg/agent" "github.com/yusing/godoxy/agent/pkg/agent"
"github.com/yusing/godoxy/internal/common" "github.com/yusing/godoxy/internal/common"
httputils "github.com/yusing/goutils/http"
"github.com/yusing/goutils/task" "github.com/yusing/goutils/task"
"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
) )
// TODO: implement reconnect here. // TODO: implement reconnect here.
@@ -30,6 +34,8 @@ type (
key string key string
addr string addr string
dial func(ctx context.Context) (net.Conn, error) dial func(ctx context.Context) (net.Conn, error)
unique bool
} }
) )
@@ -114,16 +120,23 @@ func Clients() map[string]*SharedClient {
// Returns: // Returns:
// - Client: the Docker client connection. // - Client: the Docker client connection.
// - error: an error if the connection failed. // - error: an error if the connection failed.
func NewClient(host string) (*SharedClient, error) { func NewClient(host string, unique ...bool) (*SharedClient, error) {
initClientCleanerOnce.Do(initClientCleaner) initClientCleanerOnce.Do(initClientCleaner)
clientMapMu.Lock() u := false
defer clientMapMu.Unlock() if len(unique) > 0 {
u = unique[0]
}
if client, ok := clientMap[host]; ok { if !u {
client.closedOn.Store(0) clientMapMu.Lock()
client.refCount.Add(1) defer clientMapMu.Unlock()
return client, nil
if client, ok := clientMap[host]; ok {
client.closedOn.Store(0)
client.refCount.Add(1)
return client, nil
}
} }
// create client // create client
@@ -188,7 +201,9 @@ func NewClient(host string) (*SharedClient, error) {
addr: addr, addr: addr,
key: host, key: host,
dial: dial, dial: dial,
unique: u,
} }
c.unotel()
c.refCount.Store(1) c.refCount.Store(1)
// non-agent client // non-agent client
@@ -201,10 +216,28 @@ func NewClient(host string) (*SharedClient, error) {
defer log.Debug().Str("host", host).Msg("docker client initialized") defer log.Debug().Str("host", host).Msg("docker client initialized")
clientMap[c.Key()] = c if !u {
clientMap[c.Key()] = c
}
return c, nil return c, nil
} }
// GetHTTPClient returns a pointer to the unexported `client` field of the
// embedded *client.Client, located via the precomputed struct offset.
//
// NOTE(review): this reaches into the docker SDK's internals with unsafe;
// clientClientOffset panics at init if the field is renamed, but a type
// change of the field would go undetected — verify on SDK upgrades.
func (c *SharedClient) GetHTTPClient() **http.Client {
	// unsafe.Add is the vet-clean idiom for pointer arithmetic; the
	// uintptr round-trip in a single expression was legal but fragile.
	return (**http.Client)(unsafe.Add(unsafe.Pointer(c.Client), clientClientOffset))
}
// InterceptHTTPClient wraps the docker client's underlying HTTP transport
// so that intercept is invoked for every response it receives.
func (c *SharedClient) InterceptHTTPClient(intercept httputils.InterceptFunc) {
	hc := *c.GetHTTPClient()
	hc.Transport = httputils.NewInterceptedTransport(hc.Transport, intercept)
}
// CloneUnique returns a new, non-shared client for the same docker host.
// Unlike shared clients, the clone is not registered in the client map
// and its Close() really closes the underlying connection.
func (c *SharedClient) CloneUnique() *SharedClient {
	// The original reassigned the receiver (`c, _ = NewClient(...)`),
	// which compiles but reads as if it mutated the caller's client;
	// a fresh local makes the intent unambiguous.
	//
	// The error is deliberately ignored: NewClient already succeeded
	// once for this exact host string, so it cannot fail here.
	clone, _ := NewClient(c.key, true)
	return clone
}
func (c *SharedClient) Key() string { func (c *SharedClient) Key() string {
return c.key return c.key
} }
@@ -222,8 +255,41 @@ func (c *SharedClient) CheckConnection(ctx context.Context) error {
return nil return nil
} }
// if the client is still referenced, this is no-op. // for shared clients, if the client is still referenced, this is no-op.
func (c *SharedClient) Close() { func (c *SharedClient) Close() {
if c.unique {
c.Client.Close()
return
}
c.closedOn.Store(time.Now().Unix()) c.closedOn.Store(time.Now().Unix())
c.refCount.Add(-1) c.refCount.Add(-1)
} }
// clientClientOffset is the byte offset of client.Client's unexported
// `client` (*http.Client) field, resolved once at package init.
// Panics at startup if the docker SDK renames the field.
var clientClientOffset = func() uintptr {
	f, found := reflect.TypeFor[client.Client]().FieldByName("client")
	if !found {
		panic("client.Client has no client field")
	}
	return f.Offset
}()
// otelRtOffset is the byte offset of otelhttp.Transport's unexported
// `rt` (http.RoundTripper) field, resolved once at package init.
// Panics at startup if the otelhttp package renames the field.
var otelRtOffset = func() uintptr {
	f, found := reflect.TypeFor[otelhttp.Transport]().FieldByName("rt")
	if !found {
		panic("otelhttp.Transport has no rt field")
	}
	return f.Offset
}()
// unotel strips the otelhttp instrumentation layer from the client's HTTP
// transport, replacing it with the RoundTripper it wraps. Per-request otel
// spans/metrics are unwanted overhead on this client's request path.
// No-op (with a debug log) when the transport is not an otelhttp.Transport.
func (c *SharedClient) unotel() {
	// we don't need and don't want otelhttp.Transport here.
	httpClient := *c.GetHTTPClient()
	otelTransport, ok := httpClient.Transport.(*otelhttp.Transport)
	if !ok {
		log.Debug().Str("host", c.DaemonHost()).Msgf("docker client transport is not an otelhttp.Transport: %T", httpClient.Transport)
		return
	}
	// Read the unexported `rt` field via its precomputed offset.
	// unsafe.Add is the vet-clean idiom; the uintptr round-trip was
	// legal (single expression) but fragile.
	inner := *(*http.RoundTripper)(unsafe.Add(unsafe.Pointer(otelTransport), otelRtOffset))
	httpClient.Transport = inner
}

View File

@@ -1,9 +1,16 @@
package monitor package monitor
import ( import (
"net/http"
"github.com/bytedance/sonic"
"github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/container"
"github.com/rs/zerolog/log"
"github.com/yusing/godoxy/internal/docker" "github.com/yusing/godoxy/internal/docker"
"github.com/yusing/godoxy/internal/types" "github.com/yusing/godoxy/internal/types"
gperr "github.com/yusing/goutils/errs"
httputils "github.com/yusing/goutils/http"
"github.com/yusing/goutils/task"
) )
type DockerHealthMonitor struct { type DockerHealthMonitor struct {
@@ -27,6 +34,41 @@ func NewDockerHealthMonitor(client *docker.SharedClient, containerID, alias stri
return mon return mon
} }
// Start clones the shared docker client into a unique (non-shared) one,
// starts the underlying monitor, then installs the inspect-response
// interceptor on the clone. The clone is closed when the monitor task
// finishes.
func (mon *DockerHealthMonitor) Start(parent task.Parent) gperr.Error {
	mon.client = mon.client.CloneUnique()
	if err := mon.monitor.Start(parent); err != nil {
		// the clone is owned by this monitor alone; without this Close
		// its connection would leak on a failed start.
		mon.client.Close()
		return err
	}
	mon.client.InterceptHTTPClient(mon.interceptInspectResponse)
	mon.monitor.task.OnFinished("close docker client", mon.client.Close)
	return nil
}
// inspectState mirrors only the piece of docker's container-inspect JSON
// this monitor needs, so the interceptor can decode State without paying
// for the full inspect payload.
type inspectState struct {
	State *container.State
}
// interceptInspectResponse runs on every HTTP response of this monitor's
// docker client. For a 200 response it decodes just the container State and
// short-circuits the request by returning a RequestInterceptedError carrying
// that state, so the docker SDK never parses the full inspect payload.
// Non-200 responses pass through untouched ((false, nil)).
func (mon *DockerHealthMonitor) interceptInspectResponse(resp *http.Response) (intercepted bool, err error) {
	if resp.StatusCode != http.StatusOK {
		// not intercepted; let the SDK handle error responses itself.
		return false, nil
	}
	// NOTE(review): ReadAllBody returns a buffer with a release func —
	// presumably pooled; confirm sonic.Unmarshal copies any bytes it
	// retains before release(body) recycles the buffer.
	body, release, err := httputils.ReadAllBody(resp)
	resp.Body.Close()
	if err != nil {
		return false, err
	}
	var state inspectState
	err = sonic.Unmarshal(body, &state)
	release(body)
	if err != nil {
		return false, err
	}
	// intercepted: CheckHealth unwraps this error to recover the state.
	return true, httputils.NewRequestInterceptedError(resp, state)
}
func (mon *DockerHealthMonitor) CheckHealth() (types.HealthCheckResult, error) { func (mon *DockerHealthMonitor) CheckHealth() (types.HealthCheckResult, error) {
// if docker health check failed too many times, use fallback forever // if docker health check failed too many times, use fallback forever
if mon.numDockerFailures > dockerFailuresThreshold { if mon.numDockerFailures > dockerFailuresThreshold {
@@ -36,13 +78,18 @@ func (mon *DockerHealthMonitor) CheckHealth() (types.HealthCheckResult, error) {
ctx, cancel := mon.ContextWithTimeout("docker health check timed out") ctx, cancel := mon.ContextWithTimeout("docker health check timed out")
defer cancel() defer cancel()
cont, err := mon.client.ContainerInspect(ctx, mon.containerID) // the actual inspect response is intercepted and returned as RequestInterceptedError
if err != nil { _, err := mon.client.ContainerInspect(ctx, mon.containerID)
var interceptedErr *httputils.RequestInterceptedError
if err != nil && !httputils.AsRequestInterceptedError(err, &interceptedErr) {
mon.numDockerFailures++ mon.numDockerFailures++
log.Debug().Err(err).Str("container_id", mon.containerID).Msg("docker health check failed, using fallback")
return mon.fallback.CheckHealth() return mon.fallback.CheckHealth()
} }
status := cont.State.Status state := interceptedErr.Data.(inspectState).State
status := state.Status
switch status { switch status {
case "dead", "exited", "paused", "restarting", "removing": case "dead", "exited", "paused", "restarting", "removing":
mon.numDockerFailures = 0 mon.numDockerFailures = 0
@@ -57,17 +104,17 @@ func (mon *DockerHealthMonitor) CheckHealth() (types.HealthCheckResult, error) {
Detail: "container is not started", Detail: "container is not started",
}, nil }, nil
} }
if cont.State.Health == nil { // no health check from docker, directly use fallback starting from next check if state.Health == nil { // no health check from docker, directly use fallback starting from next check
mon.numDockerFailures = dockerFailuresThreshold + 1 mon.numDockerFailures = dockerFailuresThreshold + 1
return mon.fallback.CheckHealth() return mon.fallback.CheckHealth()
} }
mon.numDockerFailures = 0 mon.numDockerFailures = 0
result := types.HealthCheckResult{ result := types.HealthCheckResult{
Healthy: cont.State.Health.Status == container.Healthy, Healthy: state.Health.Status == container.Healthy,
} }
if len(cont.State.Health.Log) > 0 { if len(state.Health.Log) > 0 {
lastLog := cont.State.Health.Log[len(cont.State.Health.Log)-1] lastLog := state.Health.Log[len(state.Health.Log)-1]
result.Detail = lastLog.Output result.Detail = lastLog.Output
result.Latency = lastLog.End.Sub(lastLog.Start) result.Latency = lastLog.End.Sub(lastLog.Start)
} }