Add three test files designed to stress the control plane under concurrent
and adversarial conditions:

- race_test.go: 14 tests exercising concurrent mutations, session
  replacement, batcher contention, NodeStore access, and map response
  delivery during disconnect. All pass the Go race detector.
- poll_race_test.go: 8 tests targeting the poll.go grace period
  interleaving. These confirm a logical TOCTOU race: when a node
  disconnects and reconnects within the grace period, the old session's
  deferred Disconnect() can overwrite the new session's Connect(),
  leaving IsOnline=false despite an active poll session.
- stress_test.go: sustained churn, rapid mutations, rolling replacement,
  data integrity checks under load, and verification that rapid
  reconnects do not leak false-offline notifications.

Known failing tests (grace period TOCTOU race):

- server_state_online_after_reconnect_within_grace
- update_history_no_false_offline
- rapid_reconnect_peer_never_sees_offline
package servertest_test

import (
	"fmt"
	"net/netip"
	"testing"
	"time"

	"github.com/juanfont/headscale/hscontrol/servertest"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"tailscale.com/types/netmap"
)

// TestPollRace targets logical race conditions specifically in the
// poll.go session lifecycle and the batcher's handling of concurrent
// sessions for the same node.
func TestPollRace(t *testing.T) {
	t.Parallel()

	// The core race: when a node disconnects, poll.go starts a
	// grace period goroutine (10s ticker loop). If the node
	// reconnects during this period, the new session calls
	// Connect() to mark the node online. But the old grace period
	// goroutine is still running and may call Disconnect() AFTER
	// the new Connect(), setting IsOnline=false incorrectly.
	//
	// This test verifies the exact symptom: after reconnect within
	// the grace period, the server-side node state should be online.
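	// Suspected interleaving (an illustrative timeline; the goroutine
	// and method names approximate poll.go, they are not quoted from it):
	//
	//	t0  old session: client disconnects, grace goroutine starts
	//	t1  new session: client reconnects, Connect() sets IsOnline=true
	//	t2  old session: grace goroutine fires, Disconnect() sets IsOnline=false
	//
	// The stale write at t2 wins, so the node looks offline even though
	// the new poll session is still active. A runnable model of this
	// interleaving is in TestGracePeriodRaceModel at the bottom of the file.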
t.Run("server_state_online_after_reconnect_within_grace", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "gracerace-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "gracerace-node1",
|
|
servertest.WithUser(user))
|
|
servertest.NewClient(t, srv, "gracerace-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
nodeID := findNodeID(t, srv, "gracerace-node1")
|
|
|
|
// Disconnect and immediately reconnect.
|
|
c1.Disconnect(t)
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// Check server-side state immediately.
|
|
nv, ok := srv.State().GetNodeByID(nodeID)
|
|
require.True(t, ok)
|
|
|
|
isOnline, known := nv.IsOnline().GetOk()
|
|
assert.True(t, known,
|
|
"server should know online status after reconnect")
|
|
assert.True(t, isOnline,
|
|
"server should show node as online after reconnect within grace period")
|
|
})
|
|
|
|
// Same test but wait a few seconds after reconnect. The old
|
|
// grace period goroutine may still be running.
|
|
t.Run("server_state_online_2s_after_reconnect", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "gracewait-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "gracewait-node1",
|
|
servertest.WithUser(user))
|
|
servertest.NewClient(t, srv, "gracewait-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
nodeID := findNodeID(t, srv, "gracewait-node1")
|
|
|
|
c1.Disconnect(t)
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// Wait 2 seconds for the old grace period to potentially fire.
|
|
timer := time.NewTimer(2 * time.Second)
|
|
defer timer.Stop()
|
|
|
|
<-timer.C
|
|
|
|
nv, ok := srv.State().GetNodeByID(nodeID)
|
|
require.True(t, ok)
|
|
|
|
isOnline, known := nv.IsOnline().GetOk()
|
|
assert.True(t, known,
|
|
"server should know online status 2s after reconnect")
|
|
assert.True(t, isOnline,
|
|
"server should STILL show node as online 2s after reconnect (grace period goroutine should not overwrite)")
|
|
})
|
|
|
|
// Wait the full grace period (10s) after reconnect. The old
|
|
// grace period goroutine should have checked IsConnected
|
|
// and found the node connected, so should NOT have called
|
|
// Disconnect().
|
|
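	// The guard this subtest relies on, sketched (illustrative, not
	// quoted from poll.go): before marking the node offline, the grace
	// goroutine should re-check liveness, roughly
	//
	//	if !batcher.IsConnected(nodeID) {
	//		state.Disconnect(nodeID)
	//	}
	//
	// Unless that check and the write are atomic with respect to a new
	// session's Connect(), the TOCTOU window described above remains.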
t.Run("server_state_online_12s_after_reconnect", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "gracelong-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "gracelong-node1",
|
|
servertest.WithUser(user))
|
|
servertest.NewClient(t, srv, "gracelong-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
nodeID := findNodeID(t, srv, "gracelong-node1")
|
|
|
|
c1.Disconnect(t)
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// Wait past the full grace period.
|
|
timer := time.NewTimer(12 * time.Second)
|
|
defer timer.Stop()
|
|
|
|
<-timer.C
|
|
|
|
nv, ok := srv.State().GetNodeByID(nodeID)
|
|
require.True(t, ok)
|
|
|
|
isOnline, known := nv.IsOnline().GetOk()
|
|
assert.True(t, known,
|
|
"server should know online status after grace period expires")
|
|
assert.True(t, isOnline,
|
|
"server should show node as online after grace period -- the reconnect should have prevented the Disconnect() call")
|
|
})
|
|
|
|
// Peer's view: after rapid reconnect, the peer should see
|
|
// the reconnected node as online, not offline.
|
|
t.Run("peer_sees_online_after_rapid_reconnect", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "peeronl-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "peeronl-node1",
|
|
servertest.WithUser(user))
|
|
c2 := servertest.NewClient(t, srv, "peeronl-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
// Wait for online status to propagate first.
|
|
c2.WaitForCondition(t, "peer initially online",
|
|
15*time.Second,
|
|
func(nm *netmap.NetworkMap) bool {
|
|
for _, p := range nm.Peers {
|
|
hi := p.Hostinfo()
|
|
if hi.Valid() && hi.Hostname() == "peeronl-node1" {
|
|
isOnline, known := p.Online().GetOk()
|
|
|
|
return known && isOnline
|
|
}
|
|
}
|
|
|
|
return false
|
|
})
|
|
|
|
// Rapid reconnect.
|
|
c1.Disconnect(t)
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// Wait 3 seconds for any stale updates to propagate.
|
|
timer := time.NewTimer(3 * time.Second)
|
|
defer timer.Stop()
|
|
|
|
<-timer.C
|
|
|
|
// At this point, c2 should see c1 as ONLINE.
|
|
// If the grace period race is present, c2 might
|
|
// temporarily see offline and then online again.
|
|
nm := c2.Netmap()
|
|
require.NotNil(t, nm)
|
|
|
|
for _, p := range nm.Peers {
|
|
hi := p.Hostinfo()
|
|
if hi.Valid() && hi.Hostname() == "peeronl-node1" {
|
|
isOnline, known := p.Online().GetOk()
|
|
assert.True(t, known,
|
|
"peer online status should be known")
|
|
assert.True(t, isOnline,
|
|
"peer should be online 3s after rapid reconnect")
|
|
}
|
|
}
|
|
})
|
|
|
|
// The batcher's IsConnected check: when the grace period
|
|
// goroutine calls IsConnected(), it should return true if
|
|
// a new session has been added for the same node.
|
|
t.Run("batcher_knows_reconnected_during_grace", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "batchknow-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "batchknow-node1",
|
|
servertest.WithUser(user))
|
|
c2 := servertest.NewClient(t, srv, "batchknow-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
c2.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
// Disconnect and reconnect.
|
|
c1.Disconnect(t)
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// The mesh should be complete with both nodes seeing
|
|
// each other as online.
|
|
c2.WaitForCondition(t, "c1 online after reconnect",
|
|
15*time.Second,
|
|
func(nm *netmap.NetworkMap) bool {
|
|
for _, p := range nm.Peers {
|
|
hi := p.Hostinfo()
|
|
if hi.Valid() && hi.Hostname() == "batchknow-node1" {
|
|
isOnline, known := p.Online().GetOk()
|
|
|
|
return known && isOnline
|
|
}
|
|
}
|
|
|
|
return false
|
|
})
|
|
})
|
|
|
|
// Test that the update history shows a clean transition:
|
|
// the peer should never appear in the history with
|
|
// online=false if the reconnect was fast enough.
|
|
t.Run("update_history_no_false_offline", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "histroff-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "histroff-node1",
|
|
servertest.WithUser(user))
|
|
c2 := servertest.NewClient(t, srv, "histroff-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
c2.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
// Record c2's update count before reconnect.
|
|
countBefore := c2.UpdateCount()
|
|
|
|
// Rapid reconnect.
|
|
c1.Disconnect(t)
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// Wait a moment for all updates to arrive.
|
|
timer := time.NewTimer(3 * time.Second)
|
|
defer timer.Stop()
|
|
|
|
<-timer.C
|
|
|
|
// Check c2's update history for any false offline.
|
|
history := c2.History()
|
|
sawOffline := false
|
|
|
|
for i := countBefore; i < len(history); i++ {
|
|
nm := history[i]
|
|
for _, p := range nm.Peers {
|
|
hi := p.Hostinfo()
|
|
if hi.Valid() && hi.Hostname() == "histroff-node1" {
|
|
isOnline, known := p.Online().GetOk()
|
|
if known && !isOnline {
|
|
sawOffline = true
|
|
|
|
t.Logf("update %d: saw peer offline (should not happen during rapid reconnect)", i)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
assert.False(t, sawOffline,
|
|
"peer should never appear offline in update history during rapid reconnect")
|
|
})
|
|
|
|
// Multiple rapid reconnects should not cause the peer count
|
|
// to be wrong. After N reconnects, the reconnecting node should
|
|
// still see the right number of peers and vice versa.
|
|
t.Run("peer_count_stable_after_many_reconnects", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "peercount-user")
|
|
|
|
const n = 4
|
|
|
|
clients := make([]*servertest.TestClient, n)
|
|
for i := range n {
|
|
clients[i] = servertest.NewClient(t, srv,
|
|
fmt.Sprintf("peercount-%d", i),
|
|
servertest.WithUser(user))
|
|
}
|
|
|
|
for _, c := range clients {
|
|
c.WaitForPeers(t, n-1, 20*time.Second)
|
|
}
|
|
|
|
// Reconnect client 0 five times.
|
|
for range 5 {
|
|
clients[0].Disconnect(t)
|
|
clients[0].Reconnect(t)
|
|
}
|
|
|
|
// All clients should still see n-1 peers.
|
|
for _, c := range clients {
|
|
c.WaitForPeers(t, n-1, 15*time.Second)
|
|
}
|
|
|
|
servertest.AssertMeshComplete(t, clients)
|
|
})
|
|
|
|
// Route approval during reconnect: approve a route while a
|
|
// node is reconnecting. Both the reconnecting node and peers
|
|
// should eventually see the correct state.
|
|
t.Run("route_approval_during_reconnect", func(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
srv := servertest.NewServer(t)
|
|
user := srv.CreateUser(t, "rtrecon-user")
|
|
|
|
c1 := servertest.NewClient(t, srv, "rtrecon-node1",
|
|
servertest.WithUser(user))
|
|
servertest.NewClient(t, srv, "rtrecon-node2",
|
|
servertest.WithUser(user))
|
|
|
|
c1.WaitForPeers(t, 1, 10*time.Second)
|
|
|
|
nodeID1 := findNodeID(t, srv, "rtrecon-node1")
|
|
|
|
// Disconnect c1.
|
|
c1.Disconnect(t)
|
|
|
|
// While c1 is disconnected, approve a route for it.
|
|
route := netip.MustParsePrefix("10.55.0.0/24")
|
|
_, routeChange, err := srv.State().SetApprovedRoutes(
|
|
nodeID1, []netip.Prefix{route})
|
|
require.NoError(t, err)
|
|
srv.App.Change(routeChange)
|
|
|
|
// Reconnect c1.
|
|
c1.Reconnect(t)
|
|
c1.WaitForPeers(t, 1, 15*time.Second)
|
|
|
|
// c1 should receive a self-update with the new route.
|
|
c1.WaitForCondition(t, "self-update after route+reconnect",
|
|
10*time.Second,
|
|
func(nm *netmap.NetworkMap) bool {
|
|
return nm != nil && nm.SelfNode.Valid()
|
|
})
|
|
|
|
// Verify server state is correct.
|
|
nv, ok := srv.State().GetNodeByID(nodeID1)
|
|
require.True(t, ok)
|
|
|
|
routes := nv.ApprovedRoutes().AsSlice()
|
|
assert.Contains(t, routes, route,
|
|
"approved route should persist through reconnect")
|
|
})
|
|
}
|
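
// TestGracePeriodRaceModel is a minimal, self-contained model of the TOCTOU
// race the tests above probe for. It deliberately does not touch headscale
// code: the online flag, connect, and disconnect below are illustrative
// stand-ins, not the real poll.go or batcher API. Channels force the bad
// interleaving deterministically, so this documents the failure mode rather
// than racing for it.
func TestGracePeriodRaceModel(t *testing.T) {
	t.Parallel()

	// Stand-in for server-side node state.
	online := false
	connect := func() { online = true }     // new session marks the node online
	disconnect := func() { online = false } // grace goroutine marks it offline

	reconnected := make(chan struct{})
	staleDone := make(chan struct{})

	// The old session's grace-period goroutine. In the model it is forced
	// to fire only after the reconnect, which is exactly the interleaving
	// that produces a false offline. Channel close/receive gives the
	// happens-before edges, so the model is race-detector clean.
	go func() {
		<-reconnected
		disconnect() // stale write: lands after the new session's Connect
		close(staleDone)
	}()

	connect()          // new session reconnects within the grace period
	close(reconnected) // release the old session's goroutine
	<-staleDone        // wait for its Disconnect to land

	// The stale Disconnect overwrote the fresh Connect: the node looks
	// offline despite an active session. Any fix must make the liveness
	// check and the offline write atomic, or tie each Disconnect to the
	// session that scheduled it.
	assert.False(t, online,
		"model: stale Disconnect should overwrite the fresh Connect, demonstrating the race")
}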