mirror of
https://github.com/yusing/godoxy.git
synced 2026-03-25 10:31:30 +01:00
Previously, up notifications were sent whenever a service recovered, even if no down notification had been sent (e.g., when recovering before the failure threshold was met). This could confuse users who would receive "service is up" notifications without ever being notified of a problem. Now, recovery notifications are only sent when a prior down notification exists, ensuring notification pairs are always complete.
311 lines
7.9 KiB
Go
311 lines
7.9 KiB
Go
package monitor
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"math/rand"
|
|
"net/url"
|
|
"strings"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
config "github.com/yusing/godoxy/internal/config/types"
|
|
"github.com/yusing/godoxy/internal/notif"
|
|
"github.com/yusing/godoxy/internal/types"
|
|
"github.com/yusing/goutils/events"
|
|
strutils "github.com/yusing/goutils/strings"
|
|
"github.com/yusing/goutils/synk"
|
|
"github.com/yusing/goutils/task"
|
|
)
|
|
|
|
type (
|
|
DisplayNameKey struct{}
|
|
|
|
HealthCheckFunc func(url *url.URL) (result types.HealthCheckResult, err error)
|
|
monitor struct {
|
|
service string
|
|
config types.HealthCheckConfig
|
|
url synk.Value[*url.URL]
|
|
|
|
onUpdateURL func(url *url.URL)
|
|
|
|
status synk.Value[types.HealthStatus]
|
|
lastResult synk.Value[types.HealthCheckResult]
|
|
|
|
checkHealth HealthCheckFunc
|
|
startTime time.Time
|
|
|
|
notifyFunc notif.NotifyFunc
|
|
numConsecFailures atomic.Int64
|
|
downNotificationSent atomic.Bool
|
|
|
|
task *task.Task
|
|
}
|
|
)
|
|
|
|
var ErrNegativeInterval = errors.New("negative interval")
|
|
|
|
func (mon *monitor) init(u *url.URL, cfg types.HealthCheckConfig, healthCheckFunc HealthCheckFunc) {
|
|
if state := config.WorkingState.Load(); state != nil {
|
|
cfg.ApplyDefaults(state.Value().Defaults.HealthCheck)
|
|
} else {
|
|
cfg.ApplyDefaults(types.HealthCheckConfig{}) // use defaults from constants
|
|
}
|
|
mon.config = cfg
|
|
mon.checkHealth = healthCheckFunc
|
|
mon.startTime = time.Now()
|
|
mon.notifyFunc = notif.Notify
|
|
mon.status.Store(types.StatusHealthy)
|
|
mon.lastResult.Store(types.HealthCheckResult{Healthy: true, Detail: "started"})
|
|
|
|
if u == nil {
|
|
mon.url.Store(&url.URL{})
|
|
} else {
|
|
mon.url.Store(u)
|
|
}
|
|
}
|
|
|
|
func (mon *monitor) Context() context.Context {
|
|
if mon.config.BaseContext != nil {
|
|
return mon.config.BaseContext()
|
|
}
|
|
if mon.task != nil {
|
|
return mon.task.Context()
|
|
}
|
|
return context.Background()
|
|
}
|
|
|
|
func (mon *monitor) CheckHealth() (types.HealthCheckResult, error) {
|
|
return mon.checkHealth(mon.url.Load())
|
|
}
|
|
|
|
// Start implements task.TaskStarter.
|
|
func (mon *monitor) Start(parent task.Parent) error {
|
|
if mon.config.Interval <= 0 {
|
|
return ErrNegativeInterval
|
|
}
|
|
|
|
mon.task = parent.Subtask("health_monitor", true)
|
|
|
|
mon.service = parent.Name()
|
|
if displayName, ok := parent.GetValue(DisplayNameKey{}).(string); ok {
|
|
mon.service = displayName
|
|
}
|
|
|
|
go func() {
|
|
logger := log.With().Str("name", mon.service).Logger()
|
|
|
|
defer func() {
|
|
if mon.status.Load() != types.StatusError {
|
|
mon.status.Store(types.StatusUnhealthy)
|
|
}
|
|
mon.task.Finish(nil)
|
|
}()
|
|
|
|
failures := 0
|
|
|
|
if err := mon.checkUpdateHealth(); err != nil {
|
|
logger.Err(err).Msg("healthchecker error")
|
|
failures++
|
|
}
|
|
|
|
// add a random delay between 0 and 10 seconds to avoid thundering herd
|
|
time.Sleep(time.Duration(rand.Intn(10)) * time.Second)
|
|
|
|
ticker := time.NewTicker(mon.config.Interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-mon.task.Context().Done():
|
|
return
|
|
case <-ticker.C:
|
|
err := mon.checkUpdateHealth()
|
|
if err != nil {
|
|
logger.Err(err).Msg("healthchecker error")
|
|
failures++
|
|
} else {
|
|
failures = 0
|
|
}
|
|
if failures >= 5 {
|
|
mon.status.Store(types.StatusError)
|
|
mon.task.Finish(err)
|
|
logger.Error().Msg("healthchecker stopped after 5 trials")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// Task implements task.TaskStarter.
|
|
func (mon *monitor) Task() *task.Task {
|
|
return mon.task
|
|
}
|
|
|
|
// Finish implements task.TaskFinisher.
|
|
func (mon *monitor) Finish(reason any) {
|
|
if mon.task != nil {
|
|
mon.task.Finish(reason)
|
|
}
|
|
}
|
|
|
|
// UpdateURL implements HealthChecker.
|
|
func (mon *monitor) UpdateURL(url *url.URL) {
|
|
if url == nil {
|
|
log.Warn().Msg("attempting to update health monitor URL with nil")
|
|
return
|
|
}
|
|
mon.url.Store(url)
|
|
if mon.onUpdateURL != nil {
|
|
mon.onUpdateURL(url)
|
|
}
|
|
}
|
|
|
|
// URL implements HealthChecker.
|
|
func (mon *monitor) URL() *url.URL {
|
|
return mon.url.Load()
|
|
}
|
|
|
|
// Config implements HealthChecker.
|
|
func (mon *monitor) Config() *types.HealthCheckConfig {
|
|
return &mon.config
|
|
}
|
|
|
|
// Status implements HealthMonitor.
|
|
func (mon *monitor) Status() types.HealthStatus {
|
|
return mon.status.Load()
|
|
}
|
|
|
|
// Uptime implements HealthMonitor.
|
|
func (mon *monitor) Uptime() time.Duration {
|
|
return time.Since(mon.startTime)
|
|
}
|
|
|
|
// Latency implements HealthMonitor.
|
|
func (mon *monitor) Latency() time.Duration {
|
|
res := mon.lastResult.Load()
|
|
return res.Latency
|
|
}
|
|
|
|
// Detail implements HealthMonitor.
|
|
func (mon *monitor) Detail() string {
|
|
res := mon.lastResult.Load()
|
|
return res.Detail
|
|
}
|
|
|
|
// Name implements HealthMonitor.
|
|
func (mon *monitor) Name() string {
|
|
parts := strings.Split(mon.service, "/")
|
|
return parts[len(parts)-1]
|
|
}
|
|
|
|
// String implements fmt.Stringer of HealthMonitor.
|
|
func (mon *monitor) String() string {
|
|
return mon.Name()
|
|
}
|
|
|
|
// MarshalJSON implements health.HealthMonitor.
|
|
func (mon *monitor) MarshalJSON() ([]byte, error) {
|
|
res := mon.lastResult.Load()
|
|
return (&types.HealthJSONRepr{
|
|
Name: mon.service,
|
|
Config: &mon.config,
|
|
Status: mon.status.Load(),
|
|
Started: mon.startTime,
|
|
Uptime: mon.Uptime(),
|
|
Latency: res.Latency,
|
|
LastSeen: GetLastSeen(mon.service),
|
|
Detail: res.Detail,
|
|
URL: mon.url.Load(),
|
|
}).MarshalJSON()
|
|
}
|
|
|
|
func (mon *monitor) checkUpdateHealth() error {
|
|
logger := log.With().Str("name", mon.Name()).Logger()
|
|
result, err := mon.checkHealth(mon.url.Load())
|
|
|
|
var lastStatus types.HealthStatus
|
|
switch {
|
|
case err != nil:
|
|
result = types.HealthCheckResult{Healthy: false, Detail: err.Error()}
|
|
lastStatus = mon.status.Swap(types.StatusError)
|
|
case result.Healthy:
|
|
lastStatus = mon.status.Swap(types.StatusHealthy)
|
|
UpdateLastSeen(mon.service)
|
|
default:
|
|
lastStatus = mon.status.Swap(types.StatusUnhealthy)
|
|
}
|
|
mon.lastResult.Store(result)
|
|
|
|
// change of status
|
|
if result.Healthy != (lastStatus == types.StatusHealthy) {
|
|
if result.Healthy {
|
|
mon.numConsecFailures.Store(0)
|
|
if mon.downNotificationSent.Load() { // Only notify if the service down has been notified
|
|
mon.notifyServiceUp(&logger, &result)
|
|
mon.downNotificationSent.Store(false) // Reset notification state when service comes back up
|
|
}
|
|
} else if mon.config.Retries < 0 {
|
|
// immediate notification when retries < 0
|
|
mon.notifyServiceDown(&logger, &result)
|
|
mon.downNotificationSent.Store(true)
|
|
}
|
|
}
|
|
|
|
// if threshold >= 0, notify after threshold consecutive failures (but only once)
|
|
if !result.Healthy && mon.config.Retries >= 0 {
|
|
failureCount := mon.numConsecFailures.Add(1)
|
|
if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() {
|
|
mon.notifyServiceDown(&logger, &result)
|
|
mon.downNotificationSent.Store(true)
|
|
}
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (mon *monitor) notifyServiceUp(logger *zerolog.Logger, result *types.HealthCheckResult) {
|
|
logger.Info().Msg("service is up")
|
|
extras := mon.buildNotificationExtras(result)
|
|
extras.Add("Ping", fmt.Sprintf("%d ms", result.Latency.Milliseconds()))
|
|
mon.notifyFunc(¬if.LogMessage{
|
|
Level: zerolog.InfoLevel,
|
|
Title: "✅ Service is up ✅",
|
|
Body: extras,
|
|
Color: notif.ColorSuccess,
|
|
})
|
|
events.Global.Add(events.NewEvent(events.LevelInfo, "health", "service_up", mon))
|
|
}
|
|
|
|
func (mon *monitor) notifyServiceDown(logger *zerolog.Logger, result *types.HealthCheckResult) {
|
|
logger.Warn().Str("detail", result.Detail).Msg("service went down")
|
|
extras := mon.buildNotificationExtras(result)
|
|
extras.Add("Last Seen", strutils.FormatLastSeen(GetLastSeen(mon.service)))
|
|
mon.notifyFunc(¬if.LogMessage{
|
|
Level: zerolog.WarnLevel,
|
|
Title: "❌ Service went down ❌",
|
|
Body: extras,
|
|
Color: notif.ColorError,
|
|
})
|
|
events.Global.Add(events.NewEvent(events.LevelWarn, "health", "service_down", mon))
|
|
}
|
|
|
|
func (mon *monitor) buildNotificationExtras(result *types.HealthCheckResult) notif.FieldsBody {
|
|
extras := notif.FieldsBody{
|
|
{Name: "Service Name", Value: mon.service},
|
|
{Name: "Time", Value: strutils.FormatTime(time.Now())},
|
|
}
|
|
if mon.url.Load() != nil {
|
|
extras.Add("Service URL", mon.url.Load().String())
|
|
}
|
|
if result.Detail != "" {
|
|
extras.Add("Detail", result.Detail)
|
|
}
|
|
return extras
|
|
}
|