mirror of
https://github.com/yusing/godoxy.git
synced 2026-03-31 14:13:09 +02:00
This is a large-scale refactoring across the codebase that replaces the custom `gperr.Error` type with Go's standard `error` interface. The changes include:
- Replacing `gperr.Error` return types with `error` in function signatures
- Using `errors.New()` and `fmt.Errorf()` instead of `gperr.New()` and `gperr.Errorf()`
- Using the `%w` format verb for error wrapping instead of the `.With()` method
- Replacing `gperr.Subject()` calls with `gperr.PrependSubject()`
- Converting error logging from `gperr.Log*()` functions to zerolog's `.Err().Msg()` pattern
- Updating `NewLogger` to handle multiline error messages
- Updating the `goutils` submodule to the latest commit

This refactoring aligns with Go idioms and removes the dependency on custom error handling abstractions in favor of standard library patterns.
299 lines
7.5 KiB
Go
299 lines
7.5 KiB
Go
package monitor
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"math/rand"
|
|
"net/url"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
config "github.com/yusing/godoxy/internal/config/types"
|
|
"github.com/yusing/godoxy/internal/notif"
|
|
"github.com/yusing/godoxy/internal/types"
|
|
strutils "github.com/yusing/goutils/strings"
|
|
"github.com/yusing/goutils/synk"
|
|
"github.com/yusing/goutils/task"
|
|
)
|
|
|
|
type (
|
|
HealthCheckFunc func(url *url.URL) (result types.HealthCheckResult, err error)
|
|
monitor struct {
|
|
service string
|
|
config types.HealthCheckConfig
|
|
url synk.Value[*url.URL]
|
|
|
|
onUpdateURL func(url *url.URL)
|
|
|
|
status synk.Value[types.HealthStatus]
|
|
lastResult synk.Value[types.HealthCheckResult]
|
|
|
|
checkHealth HealthCheckFunc
|
|
startTime time.Time
|
|
|
|
notifyFunc notif.NotifyFunc
|
|
numConsecFailures atomic.Int64
|
|
downNotificationSent atomic.Bool
|
|
|
|
task *task.Task
|
|
}
|
|
)
|
|
|
|
// ErrNegativeInterval is returned by Start when the configured health check
// interval is zero or negative.
var ErrNegativeInterval = errors.New("negative interval")
|
|
|
|
func (mon *monitor) init(u *url.URL, cfg types.HealthCheckConfig, healthCheckFunc HealthCheckFunc) {
|
|
if state := config.WorkingState.Load(); state != nil {
|
|
cfg.ApplyDefaults(state.Value().Defaults.HealthCheck)
|
|
} else {
|
|
cfg.ApplyDefaults(types.HealthCheckConfig{}) // use defaults from constants
|
|
}
|
|
mon.config = cfg
|
|
mon.checkHealth = healthCheckFunc
|
|
mon.startTime = time.Now()
|
|
mon.notifyFunc = notif.Notify
|
|
mon.status.Store(types.StatusHealthy)
|
|
mon.lastResult.Store(types.HealthCheckResult{Healthy: true, Detail: "started"})
|
|
|
|
if u == nil {
|
|
mon.url.Store(&url.URL{})
|
|
} else {
|
|
mon.url.Store(u)
|
|
}
|
|
}
|
|
|
|
func (mon *monitor) Context() context.Context {
|
|
if mon.config.BaseContext != nil {
|
|
return mon.config.BaseContext()
|
|
}
|
|
if mon.task != nil {
|
|
return mon.task.Context()
|
|
}
|
|
return context.Background()
|
|
}
|
|
|
|
func (mon *monitor) CheckHealth() (types.HealthCheckResult, error) {
|
|
return mon.checkHealth(mon.url.Load())
|
|
}
|
|
|
|
// Start implements task.TaskStarter.
|
|
func (mon *monitor) Start(parent task.Parent) error {
|
|
if mon.config.Interval <= 0 {
|
|
return ErrNegativeInterval
|
|
}
|
|
|
|
mon.service = parent.Name()
|
|
mon.task = parent.Subtask("health_monitor", true)
|
|
|
|
go func() {
|
|
logger := log.With().Str("name", mon.service).Logger()
|
|
|
|
defer func() {
|
|
if mon.status.Load() != types.StatusError {
|
|
mon.status.Store(types.StatusUnhealthy)
|
|
}
|
|
mon.task.Finish(nil)
|
|
}()
|
|
|
|
failures := 0
|
|
|
|
if err := mon.checkUpdateHealth(); err != nil {
|
|
logger.Err(err).Msg("healthchecker error")
|
|
failures++
|
|
}
|
|
|
|
// add a random delay between 0 and 10 seconds to avoid thundering herd
|
|
time.Sleep(time.Duration(rand.Intn(10)) * time.Second)
|
|
|
|
ticker := time.NewTicker(mon.config.Interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-mon.task.Context().Done():
|
|
return
|
|
case <-ticker.C:
|
|
err := mon.checkUpdateHealth()
|
|
if err != nil {
|
|
logger.Err(err).Msg("healthchecker error")
|
|
failures++
|
|
} else {
|
|
failures = 0
|
|
}
|
|
if failures >= 5 {
|
|
mon.status.Store(types.StatusError)
|
|
mon.task.Finish(err)
|
|
logger.Error().Msg("healthchecker stopped after 5 trials")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// Task implements task.TaskStarter.
|
|
func (mon *monitor) Task() *task.Task {
|
|
return mon.task
|
|
}
|
|
|
|
// Finish implements task.TaskFinisher.
|
|
func (mon *monitor) Finish(reason any) {
|
|
if mon.task != nil {
|
|
mon.task.Finish(reason)
|
|
}
|
|
}
|
|
|
|
// UpdateURL implements HealthChecker.
|
|
func (mon *monitor) UpdateURL(url *url.URL) {
|
|
if url == nil {
|
|
log.Warn().Msg("attempting to update health monitor URL with nil")
|
|
return
|
|
}
|
|
mon.url.Store(url)
|
|
if mon.onUpdateURL != nil {
|
|
mon.onUpdateURL(url)
|
|
}
|
|
}
|
|
|
|
// URL implements HealthChecker.
|
|
func (mon *monitor) URL() *url.URL {
|
|
return mon.url.Load()
|
|
}
|
|
|
|
// Config implements HealthChecker.
|
|
func (mon *monitor) Config() *types.HealthCheckConfig {
|
|
return &mon.config
|
|
}
|
|
|
|
// Status implements HealthMonitor.
|
|
func (mon *monitor) Status() types.HealthStatus {
|
|
return mon.status.Load()
|
|
}
|
|
|
|
// Uptime implements HealthMonitor.
|
|
func (mon *monitor) Uptime() time.Duration {
|
|
return time.Since(mon.startTime)
|
|
}
|
|
|
|
// Latency implements HealthMonitor.
|
|
func (mon *monitor) Latency() time.Duration {
|
|
res := mon.lastResult.Load()
|
|
return res.Latency
|
|
}
|
|
|
|
// Detail implements HealthMonitor.
|
|
func (mon *monitor) Detail() string {
|
|
res := mon.lastResult.Load()
|
|
return res.Detail
|
|
}
|
|
|
|
// Name implements HealthMonitor.
|
|
func (mon *monitor) Name() string {
|
|
parts := strutils.SplitRune(mon.service, '/')
|
|
return parts[len(parts)-1]
|
|
}
|
|
|
|
// String implements fmt.Stringer of HealthMonitor.
|
|
func (mon *monitor) String() string {
|
|
return mon.Name()
|
|
}
|
|
|
|
// MarshalJSON implements health.HealthMonitor.
|
|
func (mon *monitor) MarshalJSON() ([]byte, error) {
|
|
res := mon.lastResult.Load()
|
|
return (&types.HealthJSONRepr{
|
|
Name: mon.service,
|
|
Config: &mon.config,
|
|
Status: mon.status.Load(),
|
|
Started: mon.startTime,
|
|
Uptime: mon.Uptime(),
|
|
Latency: res.Latency,
|
|
LastSeen: GetLastSeen(mon.service),
|
|
Detail: res.Detail,
|
|
URL: mon.url.Load(),
|
|
}).MarshalJSON()
|
|
}
|
|
|
|
func (mon *monitor) checkUpdateHealth() error {
|
|
logger := log.With().Str("name", mon.Name()).Logger()
|
|
result, err := mon.checkHealth(mon.url.Load())
|
|
|
|
var lastStatus types.HealthStatus
|
|
switch {
|
|
case err != nil:
|
|
result = types.HealthCheckResult{Healthy: false, Detail: err.Error()}
|
|
lastStatus = mon.status.Swap(types.StatusError)
|
|
case result.Healthy:
|
|
lastStatus = mon.status.Swap(types.StatusHealthy)
|
|
UpdateLastSeen(mon.service)
|
|
default:
|
|
lastStatus = mon.status.Swap(types.StatusUnhealthy)
|
|
}
|
|
mon.lastResult.Store(result)
|
|
|
|
// change of status
|
|
if result.Healthy != (lastStatus == types.StatusHealthy) {
|
|
if result.Healthy {
|
|
mon.notifyServiceUp(&logger, &result)
|
|
mon.numConsecFailures.Store(0)
|
|
mon.downNotificationSent.Store(false) // Reset notification state when service comes back up
|
|
} else if mon.config.Retries < 0 {
|
|
// immediate notification when retries < 0
|
|
mon.notifyServiceDown(&logger, &result)
|
|
mon.downNotificationSent.Store(true)
|
|
}
|
|
}
|
|
|
|
// if threshold >= 0, notify after threshold consecutive failures (but only once)
|
|
if !result.Healthy && mon.config.Retries >= 0 {
|
|
failureCount := mon.numConsecFailures.Add(1)
|
|
if failureCount >= mon.config.Retries && !mon.downNotificationSent.Load() {
|
|
mon.notifyServiceDown(&logger, &result)
|
|
mon.downNotificationSent.Store(true)
|
|
}
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
func (mon *monitor) notifyServiceUp(logger *zerolog.Logger, result *types.HealthCheckResult) {
|
|
logger.Info().Msg("service is up")
|
|
extras := mon.buildNotificationExtras(result)
|
|
extras.Add("Ping", fmt.Sprintf("%d ms", result.Latency.Milliseconds()))
|
|
mon.notifyFunc(¬if.LogMessage{
|
|
Level: zerolog.InfoLevel,
|
|
Title: "✅ Service is up ✅",
|
|
Body: extras,
|
|
Color: notif.ColorSuccess,
|
|
})
|
|
}
|
|
|
|
func (mon *monitor) notifyServiceDown(logger *zerolog.Logger, result *types.HealthCheckResult) {
|
|
logger.Warn().Str("detail", result.Detail).Msg("service went down")
|
|
extras := mon.buildNotificationExtras(result)
|
|
extras.Add("Last Seen", strutils.FormatLastSeen(GetLastSeen(mon.service)))
|
|
mon.notifyFunc(¬if.LogMessage{
|
|
Level: zerolog.WarnLevel,
|
|
Title: "❌ Service went down ❌",
|
|
Body: extras,
|
|
Color: notif.ColorError,
|
|
})
|
|
}
|
|
|
|
func (mon *monitor) buildNotificationExtras(result *types.HealthCheckResult) notif.FieldsBody {
|
|
extras := notif.FieldsBody{
|
|
{Name: "Service Name", Value: mon.service},
|
|
{Name: "Time", Value: strutils.FormatTime(time.Now())},
|
|
}
|
|
if mon.url.Load() != nil {
|
|
extras.Add("Service URL", mon.url.Load().String())
|
|
}
|
|
if result.Detail != "" {
|
|
extras.Add("Detail", result.Detail)
|
|
}
|
|
return extras
|
|
}
|