Files
godoxy-yusing/internal/health/monitor/monitor.go
yusing 322bb70f02 feat(monitor): add display name support for health monitor logging
Add a DisplayNameKey struct to pass display names from routes through the task
parent hierarchy to the health monitor. This allows the health monitor to use
more descriptive names for logging instead of internal task names.

BREAKING CHANGE: The monitor.DisplayNameKey struct is now part of the public API
2026-02-13 22:12:19 +08:00

308 lines
7.8 KiB
Go

package monitor
import (
"context"
"errors"
"fmt"
"math/rand"
"net/url"
"sync/atomic"
"time"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
config "github.com/yusing/godoxy/internal/config/types"
"github.com/yusing/godoxy/internal/notif"
"github.com/yusing/godoxy/internal/types"
"github.com/yusing/goutils/events"
strutils "github.com/yusing/goutils/strings"
"github.com/yusing/goutils/synk"
"github.com/yusing/goutils/task"
)
type (
// DisplayNameKey is the key type Start passes to parent.GetValue to look up
// an optional display name (a string) for the monitored service.
DisplayNameKey struct{}
// HealthCheckFunc performs a single health check against url and returns
// the result, or an error if the check itself could not be completed.
HealthCheckFunc func(url *url.URL) (result types.HealthCheckResult, err error)
// monitor periodically runs a HealthCheckFunc against a URL, tracks the
// latest status/result and sends notifications on up/down transitions.
monitor struct {
// service is the name used in logs, notifications and JSON output; it is
// set in Start from the parent task name or its display name value.
service string
// config is the health check configuration after defaults were applied in init.
config types.HealthCheckConfig
// url is the current health check target; replaced by UpdateURL.
url synk.Value[*url.URL]
// onUpdateURL, when non-nil, is called by UpdateURL after the new URL is stored.
onUpdateURL func(url *url.URL)
// status is the status derived from the most recent check.
status synk.Value[types.HealthStatus]
// lastResult is the result of the most recent check.
lastResult synk.Value[types.HealthCheckResult]
// checkHealth is the function that performs the actual check.
checkHealth HealthCheckFunc
// startTime is when init ran; Uptime is measured from it.
startTime time.Time
// notifyFunc delivers up/down notifications; init sets it to notif.Notify.
notifyFunc notif.NotifyFunc
// numConsecFailures counts consecutive unhealthy results while config.Retries >= 0.
numConsecFailures atomic.Int64
// downNotificationSent records that a "down" notification has already been
// sent, so it is not repeated until the service recovers.
downNotificationSent atomic.Bool
// task is the subtask created in Start; it is nil before Start is called.
task *task.Task
}
)
var (
	// ErrNegativeInterval is returned by (*monitor).Start when the configured
	// health check interval is zero or negative.
	ErrNegativeInterval = errors.New("negative interval")
)
// init prepares the monitor before it is started.
//
// It applies defaults to cfg (taken from the loaded working config state, or
// from a zero HealthCheckConfig when no state is loaded), records the check
// function, the start time and the notifier, seeds the status and last result
// as healthy, and stores the target URL. A nil u is stored as an empty URL so
// that later loads never yield nil.
func (mon *monitor) init(u *url.URL, cfg types.HealthCheckConfig, healthCheckFunc HealthCheckFunc) {
	state := config.WorkingState.Load()
	if state == nil {
		cfg.ApplyDefaults(types.HealthCheckConfig{}) // use defaults from constants
	} else {
		cfg.ApplyDefaults(state.Value().Defaults.HealthCheck)
	}

	mon.config = cfg
	mon.checkHealth = healthCheckFunc
	mon.startTime = time.Now()
	mon.notifyFunc = notif.Notify

	mon.status.Store(types.StatusHealthy)
	mon.lastResult.Store(types.HealthCheckResult{Healthy: true, Detail: "started"})

	target := u
	if target == nil {
		target = &url.URL{}
	}
	mon.url.Store(target)
}
// Context returns the context health checks should run under: the configured
// BaseContext if one is set, otherwise the monitor task's context once the
// monitor has been started, otherwise context.Background().
func (mon *monitor) Context() context.Context {
	switch {
	case mon.config.BaseContext != nil:
		return mon.config.BaseContext()
	case mon.task != nil:
		return mon.task.Context()
	default:
		return context.Background()
	}
}
// CheckHealth runs the health check function once against the current URL and
// returns its result. It does not update the monitor's stored status.
func (mon *monitor) CheckHealth() (types.HealthCheckResult, error) {
	target := mon.url.Load()
	return mon.checkHealth(target)
}
// Start implements task.TaskStarter.
//
// It returns ErrNegativeInterval when the configured interval is zero or
// negative. Otherwise it creates the "health_monitor" subtask under parent,
// derives the service name (the DisplayNameKey value of parent if it is a
// string, else parent.Name()) and launches the polling goroutine.
//
// The goroutine performs one check immediately, waits a random jitter, then
// checks once per interval until the task context is cancelled or five
// consecutive checks return an error. On exit the status is set to
// StatusUnhealthy (unless it is already StatusError) and the task is finished.
func (mon *monitor) Start(parent task.Parent) error {
	if mon.config.Interval <= 0 {
		return ErrNegativeInterval
	}

	mon.task = parent.Subtask("health_monitor", true)
	mon.service = parent.Name()
	if displayName, ok := parent.GetValue(DisplayNameKey{}).(string); ok {
		mon.service = displayName
	}

	go func() {
		logger := log.With().Str("name", mon.service).Logger()

		defer func() {
			if mon.status.Load() != types.StatusError {
				mon.status.Store(types.StatusUnhealthy)
			}
			mon.task.Finish(nil)
		}()

		// failures counts consecutive checks that returned an error (not merely
		// an unhealthy result); the initial check below counts as well.
		failures := 0

		if err := mon.checkUpdateHealth(); err != nil {
			logger.Err(err).Msg("healthchecker error")
			failures++
		}

		// Wait a random 0-9 whole seconds to avoid a thundering herd. The wait
		// races against task cancellation so a cancelled monitor stops promptly
		// instead of sleeping through the jitter first.
		jitter := time.NewTimer(time.Duration(rand.Intn(10)) * time.Second)
		select {
		case <-mon.task.Context().Done():
			jitter.Stop()
			return
		case <-jitter.C:
		}

		ticker := time.NewTicker(mon.config.Interval)
		defer ticker.Stop()

		for {
			select {
			case <-mon.task.Context().Done():
				return
			case <-ticker.C:
				err := mon.checkUpdateHealth()
				if err != nil {
					logger.Err(err).Msg("healthchecker error")
					failures++
				} else {
					failures = 0
				}
				if failures >= 5 {
					// err is non-nil here: failures is reset on every success.
					mon.status.Store(types.StatusError)
					mon.task.Finish(err)
					logger.Error().Msg("healthchecker stopped after 5 trials")
					return
				}
			}
		}
	}()
	return nil
}
// Task implements task.TaskStarter.
//
// It returns the monitor's subtask, which is nil until Start has been called.
func (mon *monitor) Task() *task.Task {
	t := mon.task
	return t
}
// Finish implements task.TaskFinisher.
//
// It finishes the monitor's subtask with reason; it is a no-op when the
// monitor has not been started.
func (mon *monitor) Finish(reason any) {
	if mon.task == nil {
		return
	}
	mon.task.Finish(reason)
}
// UpdateURL implements HealthChecker.
//
// It replaces the health check target and then invokes the onUpdateURL hook
// if one is set. A nil url is rejected with a warning and leaves the current
// target untouched.
func (mon *monitor) UpdateURL(url *url.URL) {
	if url == nil {
		log.Warn().Msg("attempting to update health monitor URL with nil")
		return
	}

	mon.url.Store(url)

	if hook := mon.onUpdateURL; hook != nil {
		hook(url)
	}
}
// URL implements HealthChecker.
//
// It returns the currently stored health check target.
func (mon *monitor) URL() *url.URL {
	current := mon.url.Load()
	return current
}
// Config implements HealthChecker.
//
// It returns a pointer to the monitor's own config, not a copy.
func (mon *monitor) Config() *types.HealthCheckConfig {
	cfg := &mon.config
	return cfg
}
// Status implements HealthMonitor.
//
// It returns the status recorded by the most recent check.
func (mon *monitor) Status() types.HealthStatus {
	current := mon.status.Load()
	return current
}
// Uptime implements HealthMonitor.
//
// It returns the time elapsed since the monitor was initialized.
func (mon *monitor) Uptime() time.Duration {
	elapsed := time.Since(mon.startTime)
	return elapsed
}
// Latency implements HealthMonitor.
//
// It returns the latency reported by the most recent check result.
func (mon *monitor) Latency() time.Duration {
	return mon.lastResult.Load().Latency
}
// Detail implements HealthMonitor.
//
// It returns the detail message of the most recent check result.
func (mon *monitor) Detail() string {
	return mon.lastResult.Load().Detail
}
// Name implements HealthMonitor.
//
// It returns the last '/'-separated segment of the service name.
func (mon *monitor) Name() string {
	segments := strutils.SplitRune(mon.service, '/')
	last := len(segments) - 1
	return segments[last]
}
// String implements fmt.Stringer of HealthMonitor.
//
// It is equivalent to Name.
func (mon *monitor) String() string {
	name := mon.Name()
	return name
}
// MarshalJSON implements health.HealthMonitor.
//
// It snapshots the monitor's current state (name, config, status, timings,
// last result and URL) into a types.HealthJSONRepr and delegates the encoding
// to that type's MarshalJSON.
func (mon *monitor) MarshalJSON() ([]byte, error) {
	last := mon.lastResult.Load()

	repr := &types.HealthJSONRepr{
		Name:     mon.service,
		Config:   &mon.config,
		Status:   mon.status.Load(),
		Started:  mon.startTime,
		Uptime:   mon.Uptime(),
		Latency:  last.Latency,
		LastSeen: GetLastSeen(mon.service),
		Detail:   last.Detail,
		URL:      mon.url.Load(),
	}
	return repr.MarshalJSON()
}
// checkUpdateHealth runs one health check against the current URL, records the
// outcome in mon.status and mon.lastResult, and sends up/down notifications
// when appropriate. It returns the error from the check function unchanged.
//
// Notification rules, as implemented below:
//   - up: sent when the result is healthy and the previous status was not.
//   - down: with config.Retries < 0, sent as soon as the status flips away
//     from healthy; with config.Retries >= 0, sent once the consecutive
//     failure counter reaches Retries, and only once until the service
//     recovers.
func (mon *monitor) checkUpdateHealth() error {
logger := log.With().Str("name", mon.Name()).Logger()
result, err := mon.checkHealth(mon.url.Load())
// lastStatus is the status that was stored before this check; Swap installs
// the new status and hands back the previous one in a single step.
var lastStatus types.HealthStatus
switch {
case err != nil:
// The check itself failed: replace the result with an unhealthy one that
// carries the error text as its detail.
result = types.HealthCheckResult{Healthy: false, Detail: err.Error()}
lastStatus = mon.status.Swap(types.StatusError)
case result.Healthy:
lastStatus = mon.status.Swap(types.StatusHealthy)
UpdateLastSeen(mon.service)
default:
lastStatus = mon.status.Swap(types.StatusUnhealthy)
}
mon.lastResult.Store(result)
// Healthy/not-healthy changed compared to the previous status.
if result.Healthy != (lastStatus == types.StatusHealthy) {
if result.Healthy {
mon.notifyServiceUp(&logger, &result)
mon.numConsecFailures.Store(0)
mon.downNotificationSent.Store(false) // Reset notification state when service comes back up
} else if mon.config.Retries < 0 {
// Retries < 0: notify immediately on the healthy -> not-healthy transition.
mon.notifyServiceDown(&logger, &result)
mon.downNotificationSent.Store(true)
}
}
// Retries >= 0: count every unhealthy result and notify once the count
// reaches Retries, but only if no down notification has been sent since the
// last recovery.
if !result.Healthy && mon.config.Retries >= 0 {
failureCount := mon.numConsecFailures.Add(1)
if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() {
mon.notifyServiceDown(&logger, &result)
mon.downNotificationSent.Store(true)
}
}
return err
}
// notifyServiceUp reports a recovery: it logs "service is up", sends an
// info-level notification whose body includes the measured latency, and adds a
// "service_up" event to events.Global.
func (mon *monitor) notifyServiceUp(logger *zerolog.Logger, result *types.HealthCheckResult) {
	logger.Info().Msg("service is up")

	body := mon.buildNotificationExtras(result)
	ping := fmt.Sprintf("%d ms", result.Latency.Milliseconds())
	body.Add("Ping", ping)

	msg := &notif.LogMessage{
		Level: zerolog.InfoLevel,
		Title: "✅ Service is up ✅",
		Body:  body,
		Color: notif.ColorSuccess,
	}
	mon.notifyFunc(msg)

	events.Global.Add(events.NewEvent(events.LevelInfo, "health", "service_up", mon))
}
// notifyServiceDown reports an outage: it logs "service went down" with the
// result detail, sends a warn-level notification whose body includes when the
// service was last seen, and adds a "service_down" event to events.Global.
func (mon *monitor) notifyServiceDown(logger *zerolog.Logger, result *types.HealthCheckResult) {
	logger.Warn().Str("detail", result.Detail).Msg("service went down")

	body := mon.buildNotificationExtras(result)
	lastSeen := strutils.FormatLastSeen(GetLastSeen(mon.service))
	body.Add("Last Seen", lastSeen)

	msg := &notif.LogMessage{
		Level: zerolog.WarnLevel,
		Title: "❌ Service went down ❌",
		Body:  body,
		Color: notif.ColorError,
	}
	mon.notifyFunc(msg)

	events.Global.Add(events.NewEvent(events.LevelWarn, "health", "service_down", mon))
}
// buildNotificationExtras assembles the fields shared by up and down
// notifications: the service name, the current time, the service URL (when one
// is stored) and the result detail (when non-empty).
func (mon *monitor) buildNotificationExtras(result *types.HealthCheckResult) notif.FieldsBody {
	extras := notif.FieldsBody{
		{Name: "Service Name", Value: mon.service},
		{Name: "Time", Value: strutils.FormatTime(time.Now())},
	}
	// Load the URL once so the nil check and the formatted value refer to the
	// same snapshot even if UpdateURL runs concurrently.
	if u := mon.url.Load(); u != nil {
		extras.Add("Service URL", u.String())
	}
	if result.Detail != "" {
		extras.Add("Detail", result.Detail)
	}
	return extras
}