From ca7362e9aa240053cacce2dd505e3e74218d9fba Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Mon, 16 Mar 2026 17:07:23 +0000 Subject: [PATCH] hscontrol/servertest: add control plane lifecycle and consistency tests Add three test files exercising the servertest harness, and extend the test server configuration to support them: - lifecycle_test.go: connection, disconnection, reconnection, session replacement, and mesh formation at various sizes. - consistency_test.go: symmetric visibility, consistent peer state, mid-session joins, and concurrent join/leave convergence. - weather_test.go: rapid reconnects, flapping stability, reconnects with various delays, disconnect handling, and scale tests. - server.go: configure IPv4/IPv6 prefixes, sequential IP allocation, and a buffered channel size for the test server. All tests use table-driven patterns with subtests. --- hscontrol/servertest/consistency_test.go | 109 ++++++++++++++++ hscontrol/servertest/lifecycle_test.go | 91 ++++++++++++++ hscontrol/servertest/server.go | 8 ++ hscontrol/servertest/weather_test.go | 154 +++++++++++++++++++++++ 4 files changed, 362 insertions(+) create mode 100644 hscontrol/servertest/consistency_test.go create mode 100644 hscontrol/servertest/lifecycle_test.go create mode 100644 hscontrol/servertest/weather_test.go diff --git a/hscontrol/servertest/consistency_test.go b/hscontrol/servertest/consistency_test.go new file mode 100644 index 00000000..27c359ce --- /dev/null +++ b/hscontrol/servertest/consistency_test.go @@ -0,0 +1,109 @@ +package servertest_test + +import ( + "sync" + "testing" + "time" + + "github.com/juanfont/headscale/hscontrol/servertest" + "github.com/stretchr/testify/assert" +) + +// TestConsistency verifies that all nodes converge to the same +// view of the network and that no updates are lost during various +// operations. 
+func TestConsistency(t *testing.T) { + t.Parallel() + + t.Run("all_nodes_converge", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 5) + servertest.AssertMeshComplete(t, h.Clients()) + servertest.AssertConsistentState(t, h.Clients()) + servertest.AssertSymmetricVisibility(t, h.Clients()) + }) + + t.Run("self_node_has_correct_hostname", func(t *testing.T) { + t.Parallel() + + h := servertest.NewHarness(t, 3) + for _, c := range h.Clients() { + assert.Equal(t, c.Name, c.SelfName(), + "client %s self name should match", c.Name) + } + }) + + t.Run("update_count_positive", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 3) + // After mesh formation, each client should have received + // at least one update. + for _, c := range h.Clients() { + assert.Positive(t, c.UpdateCount(), + "client %s should have received at least one update", c.Name) + } + }) + + t.Run("new_node_visible_to_all", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 3) + + newClient := h.AddClient(t) + h.WaitForMeshComplete(t, 10*time.Second) + + // Verify every original client sees the new node. + for _, c := range h.Clients() { + if c == newClient { + continue + } + + _, found := c.PeerByName(newClient.Name) + assert.True(t, found, + "client %s should see new client %s", c.Name, newClient.Name) + } + + // And the new node sees all others. + for _, c := range h.Clients() { + if c == newClient { + continue + } + + _, found := newClient.PeerByName(c.Name) + assert.True(t, found, + "new client %s should see %s", newClient.Name, c.Name) + } + }) + + t.Run("concurrent_join_and_leave", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 5) + + var wg sync.WaitGroup + + // 3 nodes joining concurrently. + for range 3 { + wg.Go(func() { + h.AddClient(t) + }) + } + + // 2 nodes leaving concurrently. 
+ for i := range 2 { + wg.Add(1) + + c := h.Client(i) + + go func() { + defer wg.Done() + + c.Disconnect(t) + }() + } + + wg.Wait() + + // After all churn, connected clients should converge. + servertest.EventuallyAssertMeshComplete(t, h.ConnectedClients(), 30*time.Second) + servertest.AssertConsistentState(t, h.ConnectedClients()) + }) +} diff --git a/hscontrol/servertest/lifecycle_test.go b/hscontrol/servertest/lifecycle_test.go new file mode 100644 index 00000000..0930b51a --- /dev/null +++ b/hscontrol/servertest/lifecycle_test.go @@ -0,0 +1,91 @@ +package servertest_test + +import ( + "fmt" + "testing" + "time" + + "github.com/juanfont/headscale/hscontrol/servertest" + "github.com/stretchr/testify/assert" +) + +// TestConnectionLifecycle exercises the core node lifecycle: +// connecting, seeing peers, joining mid-session, departing, and +// reconnecting. +func TestConnectionLifecycle(t *testing.T) { + t.Parallel() + + t.Run("single_node", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 1) + nm := h.Client(0).Netmap() + assert.NotNil(t, nm, "single node should receive a netmap") + assert.Empty(t, nm.Peers, "single node should have no peers") + }) + + t.Run("new_node_joins_mesh", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 3) + + // Add a 4th client mid-test. + h.AddClient(t) + h.WaitForMeshComplete(t, 10*time.Second) + servertest.AssertMeshComplete(t, h.Clients()) + servertest.AssertSymmetricVisibility(t, h.Clients()) + }) + + t.Run("node_departs_peers_update", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 3) + + departingName := h.Client(2).Name + h.Client(2).Disconnect(t) + + // The remaining clients should eventually stop seeing the + // departed node (after the grace period). 
+ assert.Eventually(t, func() bool { + _, found := h.Client(0).PeerByName(departingName) + return !found + }, 30*time.Second, 500*time.Millisecond, + "client 0 should stop seeing departed node") + }) + + t.Run("reconnect_restores_mesh", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 2) + + // Disconnect and reconnect. + h.Client(0).Disconnect(t) + h.Client(0).Reconnect(t) + + // Mesh should recover. + h.WaitForMeshComplete(t, 15*time.Second) + servertest.AssertMeshComplete(t, h.Clients()) + }) + + t.Run("session_replacement", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 2) + + // Reconnect without explicitly waiting for the old session to + // fully drain. This tests that Headscale correctly replaces + // the old map session for the same node. + h.Client(0).Reconnect(t) + h.WaitForMeshComplete(t, 15*time.Second) + servertest.AssertMeshComplete(t, h.Clients()) + }) + + t.Run("multiple_nodes_join_sequentially", func(t *testing.T) { + t.Parallel() + + sizes := []int{2, 5, 10} + for _, n := range sizes { + t.Run(fmt.Sprintf("%d_nodes", n), func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, n) + servertest.AssertMeshComplete(t, h.Clients()) + servertest.AssertSymmetricVisibility(t, h.Clients()) + }) + } + }) +} diff --git a/hscontrol/servertest/server.go b/hscontrol/servertest/server.go index d9c0b85a..981b7234 100644 --- a/hscontrol/servertest/server.go +++ b/hscontrol/servertest/server.go @@ -6,6 +6,7 @@ package servertest import ( "net/http/httptest" + "net/netip" "testing" "time" @@ -37,6 +38,7 @@ type serverConfig struct { func defaultServerConfig() *serverConfig { return &serverConfig{ batchDelay: 50 * time.Millisecond, + bufferedChanSize: 30, batcherWorkers: 1, ephemeralTimeout: 30 * time.Second, } @@ -70,11 +72,17 @@ func NewServer(tb testing.TB, opts ...ServerOption) *TestServer { tmpDir := tb.TempDir() + prefixV4 := netip.MustParsePrefix("100.64.0.0/10") + prefixV6 := 
netip.MustParsePrefix("fd7a:115c:a1e0::/48") + cfg := types.Config{ // Placeholder; updated below once httptest server starts. ServerURL: "http://localhost:0", NoisePrivateKeyPath: tmpDir + "/noise_private.key", EphemeralNodeInactivityTimeout: sc.ephemeralTimeout, + PrefixV4: &prefixV4, + PrefixV6: &prefixV6, + IPAllocation: types.IPAllocationStrategySequential, Database: types.DatabaseConfig{ Type: "sqlite3", Sqlite: types.SqliteConfig{ diff --git a/hscontrol/servertest/weather_test.go b/hscontrol/servertest/weather_test.go new file mode 100644 index 00000000..5a8e33ee --- /dev/null +++ b/hscontrol/servertest/weather_test.go @@ -0,0 +1,154 @@ +package servertest_test + +import ( + "testing" + "time" + + "github.com/juanfont/headscale/hscontrol/servertest" + "github.com/stretchr/testify/assert" +) + +// TestNetworkWeather exercises scenarios that simulate unstable +// network conditions: rapid reconnects, disconnect/reconnect +// timing, and connection flapping. +func TestNetworkWeather(t *testing.T) { + t.Parallel() + + t.Run("rapid_reconnect_stays_online", func(t *testing.T) { + t.Parallel() + + h := servertest.NewHarness(t, 2) + + for range 10 { + h.Client(0).Disconnect(t) + h.Client(0).Reconnect(t) + } + + // After rapid flapping, mesh should still be complete. + h.WaitForMeshComplete(t, 15*time.Second) + servertest.AssertMeshComplete(t, h.Clients()) + }) + + t.Run("reconnect_within_grace_period", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 2) + + h.Client(0).Disconnect(t) + + // Reconnect quickly (well within the 10-second grace period). + h.Client(0).ReconnectAfter(t, 1*time.Second) + h.WaitForMeshComplete(t, 15*time.Second) + + // Peer should see us as online after reconnection. 
+ servertest.AssertPeerOnline(t, h.Client(1), h.Client(0).Name) + }) + + t.Run("disconnect_types", func(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + disconnect func(c *servertest.TestClient, tb testing.TB) + }{ + {"clean_disconnect", (*servertest.TestClient).Disconnect}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 2) + + tc.disconnect(h.Client(1), t) + + // The remaining client should eventually see peer gone/offline. + assert.Eventually(t, func() bool { + _, found := h.Client(0).PeerByName(h.Client(1).Name) + if found { + // If still in peer list, check if it's marked offline. + isOnline, known := func() (bool, bool) { + peer, ok := h.Client(0).PeerByName(h.Client(1).Name) + if !ok { + return false, false + } + + return peer.Online().GetOk() + }() + // Either unknown or offline is acceptable. + return known && !isOnline + } + + return true // peer gone + }, 30*time.Second, 500*time.Millisecond, + "peer should become offline or disappear") + }) + } + }) + + t.Run("state_consistent_through_reconnection", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 3) + + // Disconnect and reconnect the middle node. + h.Client(1).Disconnect(t) + h.Client(1).Reconnect(t) + + // Wait for convergence and verify consistency. 
+ h.WaitForMeshComplete(t, 15*time.Second) + servertest.AssertConsistentState(t, h.Clients()) + }) + + t.Run("multiple_reconnect_delays", func(t *testing.T) { + t.Parallel() + + delays := []struct { + name string + delay time.Duration + }{ + {"immediate", 0}, + {"100ms", 100 * time.Millisecond}, + {"500ms", 500 * time.Millisecond}, + {"1s", 1 * time.Second}, + } + for _, tc := range delays { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 2) + + if tc.delay > 0 { + h.Client(0).ReconnectAfter(t, tc.delay) + } else { + h.Client(0).Disconnect(t) + h.Client(0).Reconnect(t) + } + + h.WaitForMeshComplete(t, 15*time.Second) + servertest.AssertMeshComplete(t, h.Clients()) + }) + } + }) + + t.Run("flapping_does_not_leak_goroutines", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 2) + + // Do many rapid disconnect/reconnect cycles. + for i := range 20 { + h.Client(0).Disconnect(t) + h.Client(0).Reconnect(t) + + if i%5 == 0 { + t.Logf("flap cycle %d: %s has %d peers", + i, h.Client(0).Name, len(h.Client(0).Peers())) + } + } + + // Mesh should still be working. + h.WaitForMeshComplete(t, 15*time.Second) + servertest.AssertMeshComplete(t, h.Clients()) + }) + + t.Run("scale_20_nodes", func(t *testing.T) { + t.Parallel() + h := servertest.NewHarness(t, 20) + servertest.AssertMeshComplete(t, h.Clients()) + }) +}