Files
godoxy-yusing/internal/health/monitor/monitor_test.go
yusing 3001417a37 fix(health): only send recovery notification after down notification
Previously, up notifications were sent whenever a service recovered,
even if no down notification had been sent (e.g., when recovering
before the failure threshold was met). This could confuse users who
would receive "service is up" notifications without ever being
notified of a problem.

Now, recovery notifications are only sent when a prior down
notification exists, ensuring notification pairs are always complete.
2026-02-23 11:05:19 +08:00

316 lines
8.7 KiB
Go

package monitor
import (
"net/url"
"sync"
"testing"
"time"
"github.com/rs/zerolog"
"github.com/stretchr/testify/require"
"github.com/yusing/godoxy/internal/notif"
"github.com/yusing/godoxy/internal/types"
"github.com/yusing/goutils/task"
)
// testNotificationTracker counts the notifications observed during a test and
// remembers the kind of the most recent one.
type testNotificationTracker struct {
	// mu is read-locked by getStats while it snapshots the fields below.
	mu sync.RWMutex

	upNotifications   int    // number of recorded "up" notifications
	downNotifications int    // number of recorded "down" notifications
	lastNotification  string // kind of the latest notification; empty if none yet
}

// getStats returns a snapshot of the up count, the down count and the kind of
// the last notification, taken under the tracker's read lock.
func (t *testNotificationTracker) getStats() (up, down int, last string) {
	t.mu.RLock()
	up, down, last = t.upNotifications, t.downNotifications, t.lastNotification
	t.mu.RUnlock()
	return up, down, last
}
// createTestMonitor builds a monitor pointed at http://localhost:8080 that uses
// checkFunc as its health checker, and returns it together with a tracker that
// records notifications instead of delivering them.
//
// Info-level messages are counted as "up" and warn-level messages as "down";
// any other level panics so that an unexpected notification fails the test.
func createTestMonitor(config types.HealthCheckConfig, checkFunc HealthCheckFunc) (*monitor, *testNotificationTracker) {
	// The URL is a fixed, well-formed literal, so the parse error is deliberately ignored.
	target, _ := url.Parse("http://localhost:8080")

	mon := new(monitor)
	mon.init(target, config, checkFunc)

	// Replace the notifier only after init so the override is what stays in place.
	tracker := new(testNotificationTracker)
	mon.notifyFunc = func(msg *notif.LogMessage) {
		tracker.mu.Lock()
		defer tracker.mu.Unlock()

		level := msg.Level
		if level == zerolog.InfoLevel {
			tracker.upNotifications++
			tracker.lastNotification = "up"
			return
		}
		if level == zerolog.WarnLevel {
			tracker.downNotifications++
			tracker.lastNotification = "down"
			return
		}
		panic("unexpected log level: " + level.String())
	}

	return mon, tracker
}
// TestNotification_ImmediateNotifyAfterZero checks that, with Retries set to
// -1, the very first failing health check marks the monitor unhealthy and
// produces exactly one down notification.
func TestNotification_ImmediateNotifyAfterZero(t *testing.T) {
	healthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: true}, nil
	}
	unhealthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: false}, nil
	}

	mon, tracker := createTestMonitor(types.HealthCheckConfig{
		Interval: 100 * time.Millisecond,
		Timeout:  50 * time.Millisecond,
		Retries:  -1, // notify on the first failure
	}, healthy)

	// Sanity check: the installed checker reports a healthy service.
	res, err := mon.checkHealth(nil)
	require.NoError(t, err)
	require.True(t, res.Healthy)

	// Flip the checker to failing and run one update cycle.
	mon.checkHealth = unhealthy
	err = mon.checkUpdateHealth()
	require.NoError(t, err)

	// The status change must be visible right away.
	require.Equal(t, types.StatusUnhealthy, mon.Status())

	// Exactly one down notification and nothing else.
	upCount, downCount, lastKind := tracker.getStats()
	require.Equal(t, 1, downCount)
	require.Equal(t, 0, upCount)
	require.Equal(t, "down", lastKind)
}
// TestNotification_WithNotifyAfterThreshold checks that, with Retries set to 2,
// the first failing check stays silent and the second consecutive failure
// produces exactly one down notification.
func TestNotification_WithNotifyAfterThreshold(t *testing.T) {
	healthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: true}, nil
	}
	unhealthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: false}, nil
	}

	mon, tracker := createTestMonitor(types.HealthCheckConfig{
		Interval: 50 * time.Millisecond,
		Timeout:  50 * time.Millisecond,
		Retries:  2, // notify after 2 consecutive failures
	}, healthy)

	// Begin from a known-healthy state, then make every check fail.
	mon.status.Store(types.StatusHealthy)
	mon.checkHealth = unhealthy

	// Failure #1: below the threshold, so nothing is reported.
	err := mon.checkUpdateHealth()
	require.NoError(t, err)

	upCount, downCount, _ := tracker.getStats()
	require.Equal(t, 0, downCount)
	require.Equal(t, 0, upCount)

	// Failure #2: the threshold is reached and the down notification fires.
	err = mon.checkUpdateHealth()
	require.NoError(t, err)

	upCount, downCount, lastKind := tracker.getStats()
	require.Equal(t, 1, downCount)
	require.Equal(t, 0, upCount)
	require.Equal(t, "down", lastKind)
}
// TestNotification_ServiceRecoversBeforeThreshold checks that a service which
// fails twice and then recovers, with Retries set to 3, produces no
// notifications at all: no down notification because the threshold was never
// reached, and no up notification because no down notification preceded it.
func TestNotification_ServiceRecoversBeforeThreshold(t *testing.T) {
	healthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: true}, nil
	}
	unhealthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: false}, nil
	}

	mon, tracker := createTestMonitor(types.HealthCheckConfig{
		Interval: 100 * time.Millisecond,
		Timeout:  50 * time.Millisecond,
		Retries:  3, // notify after 3 consecutive failures
	}, healthy)

	// Begin from a known-healthy state, then make every check fail.
	mon.status.Store(types.StatusHealthy)
	mon.checkHealth = unhealthy

	// Failure #1.
	err := mon.checkUpdateHealth()
	require.NoError(t, err)

	// Failure #2.
	err = mon.checkUpdateHealth()
	require.NoError(t, err)

	// Still below the threshold: nothing has been reported.
	upCount, downCount, _ := tracker.getStats()
	require.Equal(t, 0, downCount)
	require.Equal(t, 0, upCount)

	// The service comes back before a third failure can occur.
	mon.checkHealth = healthy
	err = mon.checkUpdateHealth()
	require.NoError(t, err)

	// Recovery without a prior down notification must stay silent.
	upCount, downCount, lastKind := tracker.getStats()
	require.Equal(t, 0, downCount)
	require.Equal(t, 0, upCount)
	require.Empty(t, lastKind)
}
// TestNotification_ConsecutiveFailureReset checks that a successful check in
// between failures restarts the consecutive-failure count: with Retries set to
// 2, the sequence fail, recover, fail stays silent, and only the next failure
// produces the single down notification.
func TestNotification_ConsecutiveFailureReset(t *testing.T) {
	healthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: true}, nil
	}
	unhealthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: false}, nil
	}

	mon, tracker := createTestMonitor(types.HealthCheckConfig{
		Interval: 100 * time.Millisecond,
		Timeout:  50 * time.Millisecond,
		Retries:  2, // notify after 2 consecutive failures
	}, healthy)

	// requireSilent asserts that no notification of either kind was recorded.
	requireSilent := func() {
		upCount, downCount, _ := tracker.getStats()
		require.Equal(t, 0, downCount)
		require.Equal(t, 0, upCount)
	}

	// Begin from a known-healthy state.
	mon.status.Store(types.StatusHealthy)

	// One isolated failure.
	mon.checkHealth = unhealthy
	err := mon.checkUpdateHealth()
	require.NoError(t, err)

	// Brief recovery: no down notification was sent, so no up notification either.
	mon.checkHealth = healthy
	err = mon.checkUpdateHealth()
	require.NoError(t, err)
	requireSilent()

	// Down again: this counts as the first failure of a new streak.
	mon.checkHealth = unhealthy
	err = mon.checkUpdateHealth()
	require.NoError(t, err)
	requireSilent()

	// Second consecutive failure of the new streak reaches the threshold.
	err = mon.checkUpdateHealth()
	require.NoError(t, err)

	upCount, downCount, lastKind := tracker.getStats()
	require.Equal(t, 1, downCount)
	require.Equal(t, 0, upCount)
	require.Equal(t, "down", lastKind)
}
// TestNotification_ContextCancellation checks that finishing the root task
// after a down notification was recorded leaves the recorded notification
// counts unchanged.
func TestNotification_ContextCancellation(t *testing.T) {
	healthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: true}, nil
	}
	unhealthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: false}, nil
	}

	mon, tracker := createTestMonitor(types.HealthCheckConfig{
		Interval: 100 * time.Millisecond,
		Timeout:  50 * time.Millisecond,
		Retries:  1,
	}, healthy)

	// Attach the monitor to a task tree that this test can finish on demand.
	root := task.RootTask("test", true)
	mon.task = root.Subtask("monitor", true)

	// Healthy -> unhealthy transition.
	mon.status.Store(types.StatusHealthy)
	mon.checkHealth = unhealthy

	// A single failing check is expected to produce the down notification.
	err := mon.checkUpdateHealth()
	require.NoError(t, err)

	upCount, downCount, _ := tracker.getStats()
	require.Equal(t, 1, downCount)
	require.Equal(t, 0, upCount)

	// Finish the root task (presumably cancelling the monitor's context).
	root.Finish(nil)

	// Notifications that were already recorded must be unaffected.
	upCount, downCount, _ = tracker.getStats()
	require.Equal(t, 1, downCount)
	require.Equal(t, 0, upCount)
}
// TestImmediateUpNotificationAfterDownNotification checks that a monitor which
// is unhealthy and already has its down-notification flag set reports recovery
// on the first successful check with exactly one up notification.
func TestImmediateUpNotificationAfterDownNotification(t *testing.T) {
	unhealthy := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: false}, nil
	}
	recovered := func(*url.URL) (types.HealthCheckResult, error) {
		return types.HealthCheckResult{Healthy: true, Latency: 50 * time.Millisecond}, nil
	}

	mon, tracker := createTestMonitor(types.HealthCheckConfig{
		Interval: 100 * time.Millisecond,
		Timeout:  50 * time.Millisecond,
		Retries:  2,
	}, unhealthy)

	// Simulate a service that is down and whose down notification already went out.
	mon.status.Store(types.StatusUnhealthy)
	mon.downNotificationSent.Store(true)

	// The service comes back; run one update cycle.
	mon.checkHealth = recovered
	err := mon.checkUpdateHealth()
	require.NoError(t, err)

	// Recovery is reflected in the status without waiting for further checks.
	require.Equal(t, types.StatusHealthy, mon.Status())

	// Exactly one up notification, and no new down notification.
	upCount, downCount, lastKind := tracker.getStats()
	require.Equal(t, 1, upCount)
	require.Equal(t, 0, downCount)
	require.Equal(t, "up", lastKind)
}