Files
godoxy-yusing/internal/health/monitor/monitor.go
yusing 3001417a37 fix(health): only send recovery notification after down notification
Previously, up notifications were sent whenever a service recovered,
even if no down notification had been sent (e.g., when recovering
before the failure threshold was met). This could confuse users who
would receive "service is up" notifications without ever being
notified of a problem.

Now, recovery notifications are only sent when a prior down
notification exists, ensuring notification pairs are always complete.
2026-02-23 11:05:19 +08:00

311 lines
7.9 KiB
Go

package monitor
import (
"context"
"errors"
"fmt"
"math/rand"
"net/url"
"strings"
"sync/atomic"
"time"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
config "github.com/yusing/godoxy/internal/config/types"
"github.com/yusing/godoxy/internal/notif"
"github.com/yusing/godoxy/internal/types"
"github.com/yusing/goutils/events"
strutils "github.com/yusing/goutils/strings"
"github.com/yusing/goutils/synk"
"github.com/yusing/goutils/task"
)
type (
// DisplayNameKey is the key type Start passes to parent.GetValue to look up
// an optional display name (a string) for the monitored service.
DisplayNameKey struct{}
// HealthCheckFunc probes the given URL and reports the outcome. A non-nil
// err is recorded by checkUpdateHealth as types.StatusError.
HealthCheckFunc func(url *url.URL) (result types.HealthCheckResult, err error)
// monitor holds the state of one periodic health check: what to probe, the
// most recent outcome, and the bookkeeping for up/down notifications.
monitor struct {
// service is the name used in logs, notifications and JSON output; Start
// sets it from the parent task name or the DisplayNameKey value.
service string
// config is the health check configuration after init applied defaults.
config types.HealthCheckConfig
// url is the current probe target; init never leaves it nil.
url synk.Value[*url.URL]
// onUpdateURL, when non-nil, is called by UpdateURL after the new URL is stored.
onUpdateURL func(url *url.URL)
// status is the status computed by the most recent check.
status synk.Value[types.HealthStatus]
// lastResult is the result recorded by the most recent check.
lastResult synk.Value[types.HealthCheckResult]
// checkHealth is the probe function supplied to init.
checkHealth HealthCheckFunc
// startTime is set in init and is the basis for Uptime.
startTime time.Time
// notifyFunc delivers up/down notifications; init sets it to notif.Notify.
notifyFunc notif.NotifyFunc
// numConsecFailures counts consecutive unhealthy results while
// config.Retries >= 0; it is reset to zero on recovery.
numConsecFailures atomic.Int64
// downNotificationSent is true once a down notification has been sent and
// not yet paired with an up notification.
downNotificationSent atomic.Bool
// task is the subtask created by Start; it is nil before Start is called.
task *task.Task
}
)
// ErrNegativeInterval is returned by Start when the configured check interval
// is zero or negative.
var ErrNegativeInterval = errors.New("negative interval")
// init prepares the monitor for use: it fills unset fields of cfg with
// defaults, records the probe function and start time, and seeds the monitor
// with a healthy status and a "started" result.
//
// Defaults come from the active working state when one is loaded; otherwise
// an empty config is passed so ApplyDefaults falls back to its constants.
// A nil u is replaced by an empty URL so the stored URL is never nil.
func (mon *monitor) init(u *url.URL, cfg types.HealthCheckConfig, healthCheckFunc HealthCheckFunc) {
	state := config.WorkingState.Load()
	if state == nil {
		cfg.ApplyDefaults(types.HealthCheckConfig{}) // use defaults from constants
	} else {
		cfg.ApplyDefaults(state.Value().Defaults.HealthCheck)
	}

	target := u
	if target == nil {
		target = &url.URL{}
	}

	mon.config = cfg
	mon.checkHealth = healthCheckFunc
	mon.startTime = time.Now()
	mon.notifyFunc = notif.Notify
	mon.status.Store(types.StatusHealthy)
	mon.lastResult.Store(types.HealthCheckResult{Healthy: true, Detail: "started"})
	mon.url.Store(target)
}
// Context returns the context health checks should run under. Precedence is:
// the configured BaseContext, then the monitor task's context, then
// context.Background when neither is available.
func (mon *monitor) Context() context.Context {
	switch {
	case mon.config.BaseContext != nil:
		return mon.config.BaseContext()
	case mon.task != nil:
		return mon.task.Context()
	default:
		return context.Background()
	}
}
// CheckHealth runs the probe function once against the current URL and
// returns its result as-is. It does not update the stored status or result.
func (mon *monitor) CheckHealth() (types.HealthCheckResult, error) {
	target := mon.url.Load()
	return mon.checkHealth(target)
}
// Start implements task.TaskStarter.
//
// It returns ErrNegativeInterval when the configured interval is zero or
// negative. Otherwise it creates a "health_monitor" subtask of parent, picks
// the service name (the DisplayNameKey value if present, else parent.Name()),
// and launches a goroutine that:
//
//   - runs one check immediately,
//   - waits a random start-up jitter,
//   - then runs a check on every tick of the configured interval.
//
// The goroutine exits when the task context is done, or after 5 consecutive
// check errors, in which case the status becomes types.StatusError and the
// task is finished with the last error. On any other exit the status is set
// to types.StatusUnhealthy.
func (mon *monitor) Start(parent task.Parent) error {
	if mon.config.Interval <= 0 {
		return ErrNegativeInterval
	}

	mon.task = parent.Subtask("health_monitor", true)
	mon.service = parent.Name()
	if displayName, ok := parent.GetValue(DisplayNameKey{}).(string); ok {
		mon.service = displayName
	}

	go func() {
		logger := log.With().Str("name", mon.service).Logger()

		defer func() {
			// Keep StatusError if the loop gave up; otherwise a stopped
			// monitor reports unhealthy.
			if mon.status.Load() != types.StatusError {
				mon.status.Store(types.StatusUnhealthy)
			}
			mon.task.Finish(nil)
		}()

		failures := 0

		if err := mon.checkUpdateHealth(); err != nil {
			logger.Err(err).Msg("healthchecker error")
			failures++
		}

		// Add a random delay of 0-9 whole seconds to avoid a thundering herd.
		// Wait on a timer instead of time.Sleep so that cancelling the task
		// during the jitter stops the goroutine immediately.
		jitter := time.NewTimer(time.Duration(rand.Intn(10)) * time.Second)
		select {
		case <-mon.task.Context().Done():
			jitter.Stop()
			return
		case <-jitter.C:
		}

		ticker := time.NewTicker(mon.config.Interval)
		defer ticker.Stop()

		for {
			select {
			case <-mon.task.Context().Done():
				return
			case <-ticker.C:
				err := mon.checkUpdateHealth()
				if err != nil {
					logger.Err(err).Msg("healthchecker error")
					failures++
				} else {
					failures = 0
				}
				if failures >= 5 {
					mon.status.Store(types.StatusError)
					mon.task.Finish(err)
					logger.Error().Msg("healthchecker stopped after 5 trials")
					return
				}
			}
		}
	}()
	return nil
}
// Task implements task.TaskStarter.
//
// It returns the subtask created by Start, or nil if Start has not run.
func (mon *monitor) Task() *task.Task {
	current := mon.task
	return current
}
// Finish implements task.TaskFinisher.
//
// It finishes the monitor's task with the given reason and is a no-op when
// the monitor has no task yet.
func (mon *monitor) Finish(reason any) {
	if mon.task == nil {
		return
	}
	mon.task.Finish(reason)
}
// UpdateURL implements HealthChecker.
//
// It replaces the probe target and then calls the onUpdateURL hook if one is
// set. A nil url is rejected with a warning and leaves the target unchanged.
func (mon *monitor) UpdateURL(url *url.URL) {
	if url == nil {
		log.Warn().Msg("attempting to update health monitor URL with nil")
		return
	}

	mon.url.Store(url)

	if hook := mon.onUpdateURL; hook != nil {
		hook(url)
	}
}
// URL implements HealthChecker.
//
// It returns the URL currently being probed.
func (mon *monitor) URL() *url.URL {
	target := mon.url.Load()
	return target
}
// Config implements HealthChecker.
//
// The returned pointer refers to the monitor's own config, not a copy.
func (mon *monitor) Config() *types.HealthCheckConfig {
	cfg := &mon.config
	return cfg
}
// Status implements HealthMonitor.
//
// It returns the status recorded by the most recent check.
func (mon *monitor) Status() types.HealthStatus {
	current := mon.status.Load()
	return current
}
// Uptime implements HealthMonitor.
//
// It reports the time elapsed since the monitor's start time.
func (mon *monitor) Uptime() time.Duration {
	elapsed := time.Since(mon.startTime)
	return elapsed
}
// Latency implements HealthMonitor.
//
// It returns the latency recorded in the most recent check result.
func (mon *monitor) Latency() time.Duration {
	return mon.lastResult.Load().Latency
}
// Detail implements HealthMonitor.
//
// It returns the detail text recorded in the most recent check result.
func (mon *monitor) Detail() string {
	return mon.lastResult.Load().Detail
}
// Name implements HealthMonitor.
//
// It returns the part of the service name after the last "/", or the whole
// service name when it contains no "/".
func (mon *monitor) Name() string {
	full := mon.service
	if idx := strings.LastIndex(full, "/"); idx >= 0 {
		return full[idx+1:]
	}
	return full
}
// String implements fmt.Stringer of HealthMonitor.
//
// The string form is the same short name that Name returns.
func (mon *monitor) String() string {
	short := mon.Name()
	return short
}
// MarshalJSON implements health.HealthMonitor.
//
// It fills a types.HealthJSONRepr from the monitor's current state and
// delegates the encoding to that type. Latency and Detail both come from the
// same loaded result.
func (mon *monitor) MarshalJSON() ([]byte, error) {
	last := mon.lastResult.Load()

	repr := &types.HealthJSONRepr{
		Name:     mon.service,
		Config:   &mon.config,
		Status:   mon.status.Load(),
		Started:  mon.startTime,
		Uptime:   mon.Uptime(),
		Latency:  last.Latency,
		LastSeen: GetLastSeen(mon.service),
		Detail:   last.Detail,
		URL:      mon.url.Load(),
	}
	return repr.MarshalJSON()
}
// checkUpdateHealth runs one health check against the current URL, records
// the new status and result, and sends up/down notifications when due.
//
// Status mapping: a check error becomes types.StatusError (with the error
// text as the result detail), a healthy result becomes types.StatusHealthy
// and refreshes the service's last-seen time, anything else becomes
// types.StatusUnhealthy.
//
// Notification rules:
//   - Recovery (healthy now, not healthy before) resets the failure counter
//     and sends an "up" notification only if a "down" one was sent earlier.
//   - With config.Retries < 0, a "down" notification is sent immediately on
//     the healthy-to-unhealthy transition.
//   - With config.Retries >= 0, each unhealthy result increments the counter,
//     and a single "down" notification is sent once it reaches Retries.
//
// The error from the check function is returned unchanged.
func (mon *monitor) checkUpdateHealth() error {
	logger := log.With().Str("name", mon.Name()).Logger()

	result, err := mon.checkHealth(mon.url.Load())

	var previous types.HealthStatus
	if err != nil {
		result = types.HealthCheckResult{Healthy: false, Detail: err.Error()}
		previous = mon.status.Swap(types.StatusError)
	} else if result.Healthy {
		previous = mon.status.Swap(types.StatusHealthy)
		UpdateLastSeen(mon.service)
	} else {
		previous = mon.status.Swap(types.StatusUnhealthy)
	}
	mon.lastResult.Store(result)

	wasHealthy := previous == types.StatusHealthy
	retries := mon.config.Retries

	switch {
	case result.Healthy && !wasHealthy:
		// Recovered: pair the up notification with an earlier down one.
		mon.numConsecFailures.Store(0)
		if mon.downNotificationSent.Load() {
			mon.notifyServiceUp(&logger, &result)
			mon.downNotificationSent.Store(false)
		}
	case !result.Healthy && wasHealthy && retries < 0:
		// No retry threshold: notify on the first unhealthy result.
		mon.notifyServiceDown(&logger, &result)
		mon.downNotificationSent.Store(true)
	}

	if result.Healthy || retries < 0 {
		return err
	}

	// Threshold mode: notify once after enough consecutive failures.
	if mon.numConsecFailures.Add(1) >= retries && !mon.downNotificationSent.Load() {
		mon.notifyServiceDown(&logger, &result)
		mon.downNotificationSent.Store(true)
	}
	return err
}
// notifyServiceUp announces a recovery: it logs at info level, sends an
// "up" notification whose body is the common fields plus the measured ping,
// and records a "service_up" event.
func (mon *monitor) notifyServiceUp(logger *zerolog.Logger, result *types.HealthCheckResult) {
	logger.Info().Msg("service is up")

	body := mon.buildNotificationExtras(result)
	body.Add("Ping", fmt.Sprintf("%d ms", result.Latency.Milliseconds()))

	msg := &notif.LogMessage{
		Level: zerolog.InfoLevel,
		Title: "✅ Service is up ✅",
		Body:  body,
		Color: notif.ColorSuccess,
	}
	mon.notifyFunc(msg)

	events.Global.Add(events.NewEvent(events.LevelInfo, "health", "service_up", mon))
}
// notifyServiceDown announces an outage: it logs a warning with the result
// detail, sends a "down" notification whose body is the common fields plus
// the service's last-seen time, and records a "service_down" event.
func (mon *monitor) notifyServiceDown(logger *zerolog.Logger, result *types.HealthCheckResult) {
	logger.Warn().Str("detail", result.Detail).Msg("service went down")

	body := mon.buildNotificationExtras(result)
	body.Add("Last Seen", strutils.FormatLastSeen(GetLastSeen(mon.service)))

	msg := &notif.LogMessage{
		Level: zerolog.WarnLevel,
		Title: "❌ Service went down ❌",
		Body:  body,
		Color: notif.ColorError,
	}
	mon.notifyFunc(msg)

	events.Global.Add(events.NewEvent(events.LevelWarn, "health", "service_down", mon))
}
// buildNotificationExtras assembles the fields shared by up and down
// notifications: the service name and current time, then the service URL
// when one is set, then the result detail when it is non-empty.
func (mon *monitor) buildNotificationExtras(result *types.HealthCheckResult) notif.FieldsBody {
	fields := notif.FieldsBody{
		{Name: "Service Name", Value: mon.service},
		{Name: "Time", Value: strutils.FormatTime(time.Now())},
	}

	if target := mon.url.Load(); target != nil {
		fields.Add("Service URL", target.String())
	}

	if detail := result.Detail; detail != "" {
		fields.Add("Detail", detail)
	}
	return fields
}