mirror of
https://github.com/yusing/godoxy.git
synced 2026-03-25 18:41:10 +01:00
Add a DisplayNameKey struct to pass display names from routes through the task parent hierarchy to the health monitor. This allows the health monitor to use more descriptive names for logging instead of internal task names. BREAKING CHANGE: The monitor.DisplayNameKey struct is now part of the public API
308 lines
7.8 KiB
Go
308 lines
7.8 KiB
Go
package monitor
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"math/rand"
|
|
"net/url"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
config "github.com/yusing/godoxy/internal/config/types"
|
|
"github.com/yusing/godoxy/internal/notif"
|
|
"github.com/yusing/godoxy/internal/types"
|
|
"github.com/yusing/goutils/events"
|
|
strutils "github.com/yusing/goutils/strings"
|
|
"github.com/yusing/goutils/synk"
|
|
"github.com/yusing/goutils/task"
|
|
)
|
|
|
|
type (
|
|
DisplayNameKey struct{}
|
|
|
|
HealthCheckFunc func(url *url.URL) (result types.HealthCheckResult, err error)
|
|
monitor struct {
|
|
service string
|
|
config types.HealthCheckConfig
|
|
url synk.Value[*url.URL]
|
|
|
|
onUpdateURL func(url *url.URL)
|
|
|
|
status synk.Value[types.HealthStatus]
|
|
lastResult synk.Value[types.HealthCheckResult]
|
|
|
|
checkHealth HealthCheckFunc
|
|
startTime time.Time
|
|
|
|
notifyFunc notif.NotifyFunc
|
|
numConsecFailures atomic.Int64
|
|
downNotificationSent atomic.Bool
|
|
|
|
task *task.Task
|
|
}
|
|
)
|
|
|
|
var ErrNegativeInterval = errors.New("negative interval")
|
|
|
|
func (mon *monitor) init(u *url.URL, cfg types.HealthCheckConfig, healthCheckFunc HealthCheckFunc) {
|
|
if state := config.WorkingState.Load(); state != nil {
|
|
cfg.ApplyDefaults(state.Value().Defaults.HealthCheck)
|
|
} else {
|
|
cfg.ApplyDefaults(types.HealthCheckConfig{}) // use defaults from constants
|
|
}
|
|
mon.config = cfg
|
|
mon.checkHealth = healthCheckFunc
|
|
mon.startTime = time.Now()
|
|
mon.notifyFunc = notif.Notify
|
|
mon.status.Store(types.StatusHealthy)
|
|
mon.lastResult.Store(types.HealthCheckResult{Healthy: true, Detail: "started"})
|
|
|
|
if u == nil {
|
|
mon.url.Store(&url.URL{})
|
|
} else {
|
|
mon.url.Store(u)
|
|
}
|
|
}
|
|
|
|
func (mon *monitor) Context() context.Context {
|
|
if mon.config.BaseContext != nil {
|
|
return mon.config.BaseContext()
|
|
}
|
|
if mon.task != nil {
|
|
return mon.task.Context()
|
|
}
|
|
return context.Background()
|
|
}
|
|
|
|
func (mon *monitor) CheckHealth() (types.HealthCheckResult, error) {
|
|
return mon.checkHealth(mon.url.Load())
|
|
}
|
|
|
|
// Start implements task.TaskStarter.
|
|
func (mon *monitor) Start(parent task.Parent) error {
|
|
if mon.config.Interval <= 0 {
|
|
return ErrNegativeInterval
|
|
}
|
|
|
|
mon.task = parent.Subtask("health_monitor", true)
|
|
|
|
mon.service = parent.Name()
|
|
if displayName, ok := parent.GetValue(DisplayNameKey{}).(string); ok {
|
|
mon.service = displayName
|
|
}
|
|
|
|
go func() {
|
|
logger := log.With().Str("name", mon.service).Logger()
|
|
|
|
defer func() {
|
|
if mon.status.Load() != types.StatusError {
|
|
mon.status.Store(types.StatusUnhealthy)
|
|
}
|
|
mon.task.Finish(nil)
|
|
}()
|
|
|
|
failures := 0
|
|
|
|
if err := mon.checkUpdateHealth(); err != nil {
|
|
logger.Err(err).Msg("healthchecker error")
|
|
failures++
|
|
}
|
|
|
|
// add a random delay between 0 and 10 seconds to avoid thundering herd
|
|
time.Sleep(time.Duration(rand.Intn(10)) * time.Second)
|
|
|
|
ticker := time.NewTicker(mon.config.Interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-mon.task.Context().Done():
|
|
return
|
|
case <-ticker.C:
|
|
err := mon.checkUpdateHealth()
|
|
if err != nil {
|
|
logger.Err(err).Msg("healthchecker error")
|
|
failures++
|
|
} else {
|
|
failures = 0
|
|
}
|
|
if failures >= 5 {
|
|
mon.status.Store(types.StatusError)
|
|
mon.task.Finish(err)
|
|
logger.Error().Msg("healthchecker stopped after 5 trials")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// Task implements task.TaskStarter.
|
|
func (mon *monitor) Task() *task.Task {
|
|
return mon.task
|
|
}
|
|
|
|
// Finish implements task.TaskFinisher.
|
|
func (mon *monitor) Finish(reason any) {
|
|
if mon.task != nil {
|
|
mon.task.Finish(reason)
|
|
}
|
|
}
|
|
|
|
// UpdateURL implements HealthChecker.
|
|
func (mon *monitor) UpdateURL(url *url.URL) {
|
|
if url == nil {
|
|
log.Warn().Msg("attempting to update health monitor URL with nil")
|
|
return
|
|
}
|
|
mon.url.Store(url)
|
|
if mon.onUpdateURL != nil {
|
|
mon.onUpdateURL(url)
|
|
}
|
|
}
|
|
|
|
// URL implements HealthChecker.
|
|
func (mon *monitor) URL() *url.URL {
|
|
return mon.url.Load()
|
|
}
|
|
|
|
// Config implements HealthChecker.
|
|
func (mon *monitor) Config() *types.HealthCheckConfig {
|
|
return &mon.config
|
|
}
|
|
|
|
// Status implements HealthMonitor.
|
|
func (mon *monitor) Status() types.HealthStatus {
|
|
return mon.status.Load()
|
|
}
|
|
|
|
// Uptime implements HealthMonitor.
|
|
func (mon *monitor) Uptime() time.Duration {
|
|
return time.Since(mon.startTime)
|
|
}
|
|
|
|
// Latency implements HealthMonitor.
|
|
func (mon *monitor) Latency() time.Duration {
|
|
res := mon.lastResult.Load()
|
|
return res.Latency
|
|
}
|
|
|
|
// Detail implements HealthMonitor.
|
|
func (mon *monitor) Detail() string {
|
|
res := mon.lastResult.Load()
|
|
return res.Detail
|
|
}
|
|
|
|
// Name implements HealthMonitor.
|
|
func (mon *monitor) Name() string {
|
|
parts := strutils.SplitRune(mon.service, '/')
|
|
return parts[len(parts)-1]
|
|
}
|
|
|
|
// String implements fmt.Stringer of HealthMonitor.
|
|
func (mon *monitor) String() string {
|
|
return mon.Name()
|
|
}
|
|
|
|
// MarshalJSON implements health.HealthMonitor.
|
|
func (mon *monitor) MarshalJSON() ([]byte, error) {
|
|
res := mon.lastResult.Load()
|
|
return (&types.HealthJSONRepr{
|
|
Name: mon.service,
|
|
Config: &mon.config,
|
|
Status: mon.status.Load(),
|
|
Started: mon.startTime,
|
|
Uptime: mon.Uptime(),
|
|
Latency: res.Latency,
|
|
LastSeen: GetLastSeen(mon.service),
|
|
Detail: res.Detail,
|
|
URL: mon.url.Load(),
|
|
}).MarshalJSON()
|
|
}
|
|
|
|
func (mon *monitor) checkUpdateHealth() error {
|
|
logger := log.With().Str("name", mon.Name()).Logger()
|
|
result, err := mon.checkHealth(mon.url.Load())
|
|
|
|
var lastStatus types.HealthStatus
|
|
switch {
|
|
case err != nil:
|
|
result = types.HealthCheckResult{Healthy: false, Detail: err.Error()}
|
|
lastStatus = mon.status.Swap(types.StatusError)
|
|
case result.Healthy:
|
|
lastStatus = mon.status.Swap(types.StatusHealthy)
|
|
UpdateLastSeen(mon.service)
|
|
default:
|
|
lastStatus = mon.status.Swap(types.StatusUnhealthy)
|
|
}
|
|
mon.lastResult.Store(result)
|
|
|
|
// change of status
|
|
if result.Healthy != (lastStatus == types.StatusHealthy) {
|
|
if result.Healthy {
|
|
mon.notifyServiceUp(&logger, &result)
|
|
mon.numConsecFailures.Store(0)
|
|
mon.downNotificationSent.Store(false) // Reset notification state when service comes back up
|
|
} else if mon.config.Retries < 0 {
|
|
// immediate notification when retries < 0
|
|
mon.notifyServiceDown(&logger, &result)
|
|
mon.downNotificationSent.Store(true)
|
|
}
|
|
}
|
|
|
|
// if threshold >= 0, notify after threshold consecutive failures (but only once)
|
|
if !result.Healthy && mon.config.Retries >= 0 {
|
|
failureCount := mon.numConsecFailures.Add(1)
|
|
if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() {
|
|
mon.notifyServiceDown(&logger, &result)
|
|
mon.downNotificationSent.Store(true)
|
|
}
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (mon *monitor) notifyServiceUp(logger *zerolog.Logger, result *types.HealthCheckResult) {
|
|
logger.Info().Msg("service is up")
|
|
extras := mon.buildNotificationExtras(result)
|
|
extras.Add("Ping", fmt.Sprintf("%d ms", result.Latency.Milliseconds()))
|
|
mon.notifyFunc(¬if.LogMessage{
|
|
Level: zerolog.InfoLevel,
|
|
Title: "✅ Service is up ✅",
|
|
Body: extras,
|
|
Color: notif.ColorSuccess,
|
|
})
|
|
events.Global.Add(events.NewEvent(events.LevelInfo, "health", "service_up", mon))
|
|
}
|
|
|
|
func (mon *monitor) notifyServiceDown(logger *zerolog.Logger, result *types.HealthCheckResult) {
|
|
logger.Warn().Str("detail", result.Detail).Msg("service went down")
|
|
extras := mon.buildNotificationExtras(result)
|
|
extras.Add("Last Seen", strutils.FormatLastSeen(GetLastSeen(mon.service)))
|
|
mon.notifyFunc(¬if.LogMessage{
|
|
Level: zerolog.WarnLevel,
|
|
Title: "❌ Service went down ❌",
|
|
Body: extras,
|
|
Color: notif.ColorError,
|
|
})
|
|
events.Global.Add(events.NewEvent(events.LevelWarn, "health", "service_down", mon))
|
|
}
|
|
|
|
func (mon *monitor) buildNotificationExtras(result *types.HealthCheckResult) notif.FieldsBody {
|
|
extras := notif.FieldsBody{
|
|
{Name: "Service Name", Value: mon.service},
|
|
{Name: "Time", Value: strutils.FormatTime(time.Now())},
|
|
}
|
|
if mon.url.Load() != nil {
|
|
extras.Add("Service URL", mon.url.Load().String())
|
|
}
|
|
if result.Detail != "" {
|
|
extras.Add("Detail", result.Detail)
|
|
}
|
|
return extras
|
|
}
|