Files
headscale/hscontrol/servertest/poll_race_test.go
Kristoffer Dalby 00c41b6422 hscontrol/servertest: add race, stress, and poll race tests
Add three test files designed to stress the control plane under
concurrent and adversarial conditions:

- race_test.go: 14 tests exercising concurrent mutations, session
  replacement, batcher contention, NodeStore access, and map response
  delivery during disconnect. All pass the Go race detector.

- poll_race_test.go: 8 tests targeting the poll.go grace period
  interleaving. These confirm a logical TOCTOU race: when a node
  disconnects and reconnects within the grace period, the old
  session's deferred Disconnect() can overwrite the new session's
  Connect(), leaving IsOnline=false despite an active poll session.

- stress_test.go: sustained churn, rapid mutations, rolling
  replacement, data integrity checks under load, and verification
  that rapid reconnects do not leak false-offline notifications.

Known failing tests (grace period TOCTOU race):
- server_state_online_after_reconnect_within_grace
- update_history_no_false_offline
- rapid_reconnect_peer_never_sees_offline
2026-03-19 07:05:58 +01:00

376 lines
10 KiB
Go

package servertest_test
import (
"fmt"
"net/netip"
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/servertest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/types/netmap"
)
// TestPollRace targets logical race conditions specifically in the
// poll.go session lifecycle and the batcher's handling of concurrent
// sessions for the same node.
func TestPollRace(t *testing.T) {
t.Parallel()
// The core race: when a node disconnects, poll.go starts a
// grace period goroutine (10s ticker loop). If the node
// reconnects during this period, the new session calls
// Connect() to mark the node online. But the old grace period
// goroutine is still running and may call Disconnect() AFTER
// the new Connect(), setting IsOnline=false incorrectly.
//
// This test verifies the exact symptom: after reconnect within
// the grace period, the server-side node state should be online.
t.Run("server_state_online_after_reconnect_within_grace", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "gracerace-user")
c1 := servertest.NewClient(t, srv, "gracerace-node1",
servertest.WithUser(user))
servertest.NewClient(t, srv, "gracerace-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
nodeID := findNodeID(t, srv, "gracerace-node1")
// Disconnect and immediately reconnect.
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// Check server-side state immediately.
nv, ok := srv.State().GetNodeByID(nodeID)
require.True(t, ok)
isOnline, known := nv.IsOnline().GetOk()
assert.True(t, known,
"server should know online status after reconnect")
assert.True(t, isOnline,
"server should show node as online after reconnect within grace period")
})
// Same test but wait a few seconds after reconnect. The old
// grace period goroutine may still be running.
t.Run("server_state_online_2s_after_reconnect", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "gracewait-user")
c1 := servertest.NewClient(t, srv, "gracewait-node1",
servertest.WithUser(user))
servertest.NewClient(t, srv, "gracewait-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
nodeID := findNodeID(t, srv, "gracewait-node1")
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// Wait 2 seconds for the old grace period to potentially fire.
timer := time.NewTimer(2 * time.Second)
defer timer.Stop()
<-timer.C
nv, ok := srv.State().GetNodeByID(nodeID)
require.True(t, ok)
isOnline, known := nv.IsOnline().GetOk()
assert.True(t, known,
"server should know online status 2s after reconnect")
assert.True(t, isOnline,
"server should STILL show node as online 2s after reconnect (grace period goroutine should not overwrite)")
})
// Wait the full grace period (10s) after reconnect. The old
// grace period goroutine should have checked IsConnected
// and found the node connected, so should NOT have called
// Disconnect().
t.Run("server_state_online_12s_after_reconnect", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "gracelong-user")
c1 := servertest.NewClient(t, srv, "gracelong-node1",
servertest.WithUser(user))
servertest.NewClient(t, srv, "gracelong-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
nodeID := findNodeID(t, srv, "gracelong-node1")
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// Wait past the full grace period.
timer := time.NewTimer(12 * time.Second)
defer timer.Stop()
<-timer.C
nv, ok := srv.State().GetNodeByID(nodeID)
require.True(t, ok)
isOnline, known := nv.IsOnline().GetOk()
assert.True(t, known,
"server should know online status after grace period expires")
assert.True(t, isOnline,
"server should show node as online after grace period -- the reconnect should have prevented the Disconnect() call")
})
// Peer's view: after rapid reconnect, the peer should see
// the reconnected node as online, not offline.
t.Run("peer_sees_online_after_rapid_reconnect", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "peeronl-user")
c1 := servertest.NewClient(t, srv, "peeronl-node1",
servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "peeronl-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
// Wait for online status to propagate first.
c2.WaitForCondition(t, "peer initially online",
15*time.Second,
func(nm *netmap.NetworkMap) bool {
for _, p := range nm.Peers {
hi := p.Hostinfo()
if hi.Valid() && hi.Hostname() == "peeronl-node1" {
isOnline, known := p.Online().GetOk()
return known && isOnline
}
}
return false
})
// Rapid reconnect.
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// Wait 3 seconds for any stale updates to propagate.
timer := time.NewTimer(3 * time.Second)
defer timer.Stop()
<-timer.C
// At this point, c2 should see c1 as ONLINE.
// If the grace period race is present, c2 might
// temporarily see offline and then online again.
nm := c2.Netmap()
require.NotNil(t, nm)
for _, p := range nm.Peers {
hi := p.Hostinfo()
if hi.Valid() && hi.Hostname() == "peeronl-node1" {
isOnline, known := p.Online().GetOk()
assert.True(t, known,
"peer online status should be known")
assert.True(t, isOnline,
"peer should be online 3s after rapid reconnect")
}
}
})
// The batcher's IsConnected check: when the grace period
// goroutine calls IsConnected(), it should return true if
// a new session has been added for the same node.
t.Run("batcher_knows_reconnected_during_grace", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "batchknow-user")
c1 := servertest.NewClient(t, srv, "batchknow-node1",
servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "batchknow-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
// Disconnect and reconnect.
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// The mesh should be complete with both nodes seeing
// each other as online.
c2.WaitForCondition(t, "c1 online after reconnect",
15*time.Second,
func(nm *netmap.NetworkMap) bool {
for _, p := range nm.Peers {
hi := p.Hostinfo()
if hi.Valid() && hi.Hostname() == "batchknow-node1" {
isOnline, known := p.Online().GetOk()
return known && isOnline
}
}
return false
})
})
// Test that the update history shows a clean transition:
// the peer should never appear in the history with
// online=false if the reconnect was fast enough.
t.Run("update_history_no_false_offline", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "histroff-user")
c1 := servertest.NewClient(t, srv, "histroff-node1",
servertest.WithUser(user))
c2 := servertest.NewClient(t, srv, "histroff-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
c2.WaitForPeers(t, 1, 10*time.Second)
// Record c2's update count before reconnect.
countBefore := c2.UpdateCount()
// Rapid reconnect.
c1.Disconnect(t)
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// Wait a moment for all updates to arrive.
timer := time.NewTimer(3 * time.Second)
defer timer.Stop()
<-timer.C
// Check c2's update history for any false offline.
history := c2.History()
sawOffline := false
for i := countBefore; i < len(history); i++ {
nm := history[i]
for _, p := range nm.Peers {
hi := p.Hostinfo()
if hi.Valid() && hi.Hostname() == "histroff-node1" {
isOnline, known := p.Online().GetOk()
if known && !isOnline {
sawOffline = true
t.Logf("update %d: saw peer offline (should not happen during rapid reconnect)", i)
}
}
}
}
assert.False(t, sawOffline,
"peer should never appear offline in update history during rapid reconnect")
})
// Multiple rapid reconnects should not cause the peer count
// to be wrong. After N reconnects, the reconnecting node should
// still see the right number of peers and vice versa.
t.Run("peer_count_stable_after_many_reconnects", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "peercount-user")
const n = 4
clients := make([]*servertest.TestClient, n)
for i := range n {
clients[i] = servertest.NewClient(t, srv,
fmt.Sprintf("peercount-%d", i),
servertest.WithUser(user))
}
for _, c := range clients {
c.WaitForPeers(t, n-1, 20*time.Second)
}
// Reconnect client 0 five times.
for range 5 {
clients[0].Disconnect(t)
clients[0].Reconnect(t)
}
// All clients should still see n-1 peers.
for _, c := range clients {
c.WaitForPeers(t, n-1, 15*time.Second)
}
servertest.AssertMeshComplete(t, clients)
})
// Route approval during reconnect: approve a route while a
// node is reconnecting. Both the reconnecting node and peers
// should eventually see the correct state.
t.Run("route_approval_during_reconnect", func(t *testing.T) {
t.Parallel()
srv := servertest.NewServer(t)
user := srv.CreateUser(t, "rtrecon-user")
c1 := servertest.NewClient(t, srv, "rtrecon-node1",
servertest.WithUser(user))
servertest.NewClient(t, srv, "rtrecon-node2",
servertest.WithUser(user))
c1.WaitForPeers(t, 1, 10*time.Second)
nodeID1 := findNodeID(t, srv, "rtrecon-node1")
// Disconnect c1.
c1.Disconnect(t)
// While c1 is disconnected, approve a route for it.
route := netip.MustParsePrefix("10.55.0.0/24")
_, routeChange, err := srv.State().SetApprovedRoutes(
nodeID1, []netip.Prefix{route})
require.NoError(t, err)
srv.App.Change(routeChange)
// Reconnect c1.
c1.Reconnect(t)
c1.WaitForPeers(t, 1, 15*time.Second)
// c1 should receive a self-update with the new route.
c1.WaitForCondition(t, "self-update after route+reconnect",
10*time.Second,
func(nm *netmap.NetworkMap) bool {
return nm != nil && nm.SelfNode.Valid()
})
// Verify server state is correct.
nv, ok := srv.State().GetNodeByID(nodeID1)
require.True(t, ok)
routes := nv.ApprovedRoutes().AsSlice()
assert.Contains(t, routes, route,
"approved route should persist through reconnect")
})
}