hscontrol/servertest: add control plane lifecycle and consistency tests

Add three test files exercising the servertest harness:

- lifecycle_test.go: connection, disconnection, reconnection, session
  replacement, and mesh formation at various sizes.
- consistency_test.go: symmetric visibility, consistent peer state,
  address presence, concurrent join/leave convergence.
- weather_test.go: rapid reconnects, flapping stability, reconnect
  with various delays, concurrent reconnects, and scale tests.

All tests use table-driven patterns with subtests.
This commit is contained in:
Kristoffer Dalby
2026-03-16 17:07:23 +00:00
parent 0288614bdf
commit ca7362e9aa
4 changed files with 362 additions and 0 deletions

View File

@@ -0,0 +1,109 @@
package servertest_test
import (
"sync"
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/servertest"
"github.com/stretchr/testify/assert"
)
// TestConsistency verifies that all nodes converge to the same
// view of the network and that no updates are lost during various
// operations.
// TestConsistency verifies that all nodes converge to the same
// view of the network and that no updates are lost during various
// operations.
func TestConsistency(t *testing.T) {
	t.Parallel()

	t.Run("all_nodes_converge", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 5)

		// Full mesh, identical peer state everywhere, and A sees B
		// iff B sees A.
		servertest.AssertMeshComplete(t, h.Clients())
		servertest.AssertConsistentState(t, h.Clients())
		servertest.AssertSymmetricVisibility(t, h.Clients())
	})

	t.Run("self_node_has_correct_hostname", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 3)

		// Each client's self node in the netmap must carry the name
		// the harness registered it under.
		for _, c := range h.Clients() {
			assert.Equal(t, c.Name, c.SelfName(),
				"client %s self name should match", c.Name)
		}
	})

	t.Run("update_count_positive", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 3)

		// After mesh formation, each client should have received
		// at least one update.
		for _, c := range h.Clients() {
			assert.Positive(t, c.UpdateCount(),
				"client %s should have received at least one update", c.Name)
		}
	})

	t.Run("new_node_visible_to_all", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 3)

		newClient := h.AddClient(t)
		h.WaitForMeshComplete(t, 10*time.Second)

		// Verify every original client sees the new node.
		for _, c := range h.Clients() {
			if c == newClient {
				continue
			}
			_, found := c.PeerByName(newClient.Name)
			assert.True(t, found,
				"client %s should see new client %s", c.Name, newClient.Name)
		}

		// And the new node sees all others.
		for _, c := range h.Clients() {
			if c == newClient {
				continue
			}
			_, found := newClient.PeerByName(c.Name)
			assert.True(t, found,
				"new client %s should see %s", newClient.Name, c.Name)
		}
	})

	t.Run("concurrent_join_and_leave", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 5)

		var wg sync.WaitGroup

		// 3 nodes joining concurrently.
		for range 3 {
			wg.Go(func() {
				h.AddClient(t)
			})
		}

		// 2 nodes leaving concurrently. Capture the client before
		// spawning so each goroutine disconnects a distinct node.
		for i := range 2 {
			c := h.Client(i)
			wg.Go(func() {
				c.Disconnect(t)
			})
		}
		wg.Wait()

		// After all churn, connected clients should converge.
		servertest.EventuallyAssertMeshComplete(t, h.ConnectedClients(), 30*time.Second)
		servertest.AssertConsistentState(t, h.ConnectedClients())
	})
}

View File

@@ -0,0 +1,91 @@
package servertest_test
import (
"fmt"
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/servertest"
"github.com/stretchr/testify/assert"
)
// TestConnectionLifecycle exercises the core node lifecycle:
// connecting, seeing peers, joining mid-session, departing, and
// reconnecting.
// TestConnectionLifecycle exercises the core node lifecycle:
// connecting, seeing peers, joining mid-session, departing, and
// reconnecting.
func TestConnectionLifecycle(t *testing.T) {
	t.Parallel()

	t.Run("single_node", func(t *testing.T) {
		t.Parallel()

		harness := servertest.NewHarness(t, 1)

		// A lone node still gets a netmap, just with an empty peer list.
		netmap := harness.Client(0).Netmap()
		assert.NotNil(t, netmap, "single node should receive a netmap")
		assert.Empty(t, netmap.Peers, "single node should have no peers")
	})

	t.Run("new_node_joins_mesh", func(t *testing.T) {
		t.Parallel()

		harness := servertest.NewHarness(t, 3)

		// Add a 4th client mid-test.
		harness.AddClient(t)
		harness.WaitForMeshComplete(t, 10*time.Second)

		servertest.AssertMeshComplete(t, harness.Clients())
		servertest.AssertSymmetricVisibility(t, harness.Clients())
	})

	t.Run("node_departs_peers_update", func(t *testing.T) {
		t.Parallel()

		harness := servertest.NewHarness(t, 3)

		departing := harness.Client(2)
		departingName := departing.Name
		departing.Disconnect(t)

		// The remaining clients should eventually stop seeing the
		// departed node (after the grace period).
		observer := harness.Client(0)
		assert.Eventually(t, func() bool {
			_, stillVisible := observer.PeerByName(departingName)
			return !stillVisible
		}, 30*time.Second, 500*time.Millisecond,
			"client 0 should stop seeing departed node")
	})

	t.Run("reconnect_restores_mesh", func(t *testing.T) {
		t.Parallel()

		harness := servertest.NewHarness(t, 2)

		// Disconnect and reconnect.
		client := harness.Client(0)
		client.Disconnect(t)
		client.Reconnect(t)

		// Mesh should recover.
		harness.WaitForMeshComplete(t, 15*time.Second)
		servertest.AssertMeshComplete(t, harness.Clients())
	})

	t.Run("session_replacement", func(t *testing.T) {
		t.Parallel()

		harness := servertest.NewHarness(t, 2)

		// Reconnect without explicitly waiting for the old session to
		// fully drain. This tests that Headscale correctly replaces
		// the old map session for the same node.
		harness.Client(0).Reconnect(t)

		harness.WaitForMeshComplete(t, 15*time.Second)
		servertest.AssertMeshComplete(t, harness.Clients())
	})

	t.Run("multiple_nodes_join_sequentially", func(t *testing.T) {
		t.Parallel()

		// Repeat the full-mesh check at a few representative sizes.
		for _, size := range []int{2, 5, 10} {
			t.Run(fmt.Sprintf("%d_nodes", size), func(t *testing.T) {
				t.Parallel()

				harness := servertest.NewHarness(t, size)

				servertest.AssertMeshComplete(t, harness.Clients())
				servertest.AssertSymmetricVisibility(t, harness.Clients())
			})
		}
	})
}

View File

@@ -6,6 +6,7 @@ package servertest
import (
"net/http/httptest"
"net/netip"
"testing"
"time"
@@ -37,6 +38,7 @@ type serverConfig struct {
func defaultServerConfig() *serverConfig {
return &serverConfig{
batchDelay: 50 * time.Millisecond,
bufferedChanSize: 30,
batcherWorkers: 1,
ephemeralTimeout: 30 * time.Second,
}
@@ -70,11 +72,17 @@ func NewServer(tb testing.TB, opts ...ServerOption) *TestServer {
tmpDir := tb.TempDir()
prefixV4 := netip.MustParsePrefix("100.64.0.0/10")
prefixV6 := netip.MustParsePrefix("fd7a:115c:a1e0::/48")
cfg := types.Config{
// Placeholder; updated below once httptest server starts.
ServerURL: "http://localhost:0",
NoisePrivateKeyPath: tmpDir + "/noise_private.key",
EphemeralNodeInactivityTimeout: sc.ephemeralTimeout,
PrefixV4: &prefixV4,
PrefixV6: &prefixV6,
IPAllocation: types.IPAllocationStrategySequential,
Database: types.DatabaseConfig{
Type: "sqlite3",
Sqlite: types.SqliteConfig{

View File

@@ -0,0 +1,154 @@
package servertest_test
import (
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/servertest"
"github.com/stretchr/testify/assert"
)
// TestNetworkWeather exercises scenarios that simulate unstable
// network conditions: rapid reconnects, disconnect/reconnect
// timing, and connection flapping.
// TestNetworkWeather exercises scenarios that simulate unstable
// network conditions: rapid reconnects, disconnect/reconnect
// timing, and connection flapping.
func TestNetworkWeather(t *testing.T) {
	t.Parallel()

	t.Run("rapid_reconnect_stays_online", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 2)

		for range 10 {
			h.Client(0).Disconnect(t)
			h.Client(0).Reconnect(t)
		}

		// After rapid flapping, mesh should still be complete.
		h.WaitForMeshComplete(t, 15*time.Second)
		servertest.AssertMeshComplete(t, h.Clients())
	})

	t.Run("reconnect_within_grace_period", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 2)

		h.Client(0).Disconnect(t)
		// Reconnect quickly (well within the 10-second grace period).
		h.Client(0).ReconnectAfter(t, 1*time.Second)

		h.WaitForMeshComplete(t, 15*time.Second)

		// Peer should see us as online after reconnection.
		servertest.AssertPeerOnline(t, h.Client(1), h.Client(0).Name)
	})

	t.Run("disconnect_types", func(t *testing.T) {
		t.Parallel()

		cases := []struct {
			name       string
			disconnect func(c *servertest.TestClient, tb testing.TB)
		}{
			{"clean_disconnect", (*servertest.TestClient).Disconnect},
		}
		for _, tc := range cases {
			t.Run(tc.name, func(t *testing.T) {
				t.Parallel()

				h := servertest.NewHarness(t, 2)

				tc.disconnect(h.Client(1), t)

				// The remaining client should eventually see the peer
				// either removed from the netmap or explicitly marked
				// offline; a single lookup covers both outcomes.
				assert.Eventually(t, func() bool {
					peer, found := h.Client(0).PeerByName(h.Client(1).Name)
					if !found {
						return true // peer gone entirely
					}
					// Still listed: acceptable only once it is known
					// to be offline. Unknown online state keeps polling.
					isOnline, known := peer.Online().GetOk()
					return known && !isOnline
				}, 30*time.Second, 500*time.Millisecond,
					"peer should become offline or disappear")
			})
		}
	})

	t.Run("state_consistent_through_reconnection", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 3)

		// Disconnect and reconnect the middle node.
		h.Client(1).Disconnect(t)
		h.Client(1).Reconnect(t)

		// Wait for convergence and verify consistency.
		h.WaitForMeshComplete(t, 15*time.Second)
		servertest.AssertConsistentState(t, h.Clients())
	})

	t.Run("multiple_reconnect_delays", func(t *testing.T) {
		t.Parallel()

		delays := []struct {
			name  string
			delay time.Duration
		}{
			{"immediate", 0},
			{"100ms", 100 * time.Millisecond},
			{"500ms", 500 * time.Millisecond},
			{"1s", 1 * time.Second},
		}
		for _, tc := range delays {
			t.Run(tc.name, func(t *testing.T) {
				t.Parallel()

				h := servertest.NewHarness(t, 2)

				if tc.delay > 0 {
					h.Client(0).ReconnectAfter(t, tc.delay)
				} else {
					h.Client(0).Disconnect(t)
					h.Client(0).Reconnect(t)
				}

				h.WaitForMeshComplete(t, 15*time.Second)
				servertest.AssertMeshComplete(t, h.Clients())
			})
		}
	})

	t.Run("flapping_does_not_leak_goroutines", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 2)

		// Do many rapid disconnect/reconnect cycles.
		for i := range 20 {
			h.Client(0).Disconnect(t)
			h.Client(0).Reconnect(t)
			if i%5 == 0 {
				t.Logf("flap cycle %d: %s has %d peers",
					i, h.Client(0).Name, len(h.Client(0).Peers()))
			}
		}

		// Mesh should still be working.
		h.WaitForMeshComplete(t, 15*time.Second)
		servertest.AssertMeshComplete(t, h.Clients())
	})

	t.Run("scale_20_nodes", func(t *testing.T) {
		t.Parallel()

		h := servertest.NewHarness(t, 20)
		servertest.AssertMeshComplete(t, h.Clients())
	})
}