mirror of
https://github.com/juanfont/headscale.git
synced 2026-04-23 00:58:43 +02:00
batcher: ensure removal from batcher
Fixes #2924. Signed-off-by: Kristoffer Dalby <kristoffer@dalby.cc>
This commit is contained in:
@@ -2635,3 +2635,140 @@ func TestBatcherMultiConnection(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestNodeDeletedWhileChangesPending reproduces issue #2924 where deleting a node
|
||||
// from state while there are pending changes for that node in the batcher causes
|
||||
// "node not found" errors. The race condition occurs when:
|
||||
// 1. Node is connected and changes are queued for it
|
||||
// 2. Node is deleted from state (NodeStore) but not from batcher
|
||||
// 3. Batcher worker tries to generate map response for deleted node
|
||||
// 4. Mapper fails to find node in state, causing repeated "node not found" errors.
|
||||
func TestNodeDeletedWhileChangesPending(t *testing.T) {
|
||||
for _, batcherFunc := range allBatcherFunctions {
|
||||
t.Run(batcherFunc.name, func(t *testing.T) {
|
||||
// Create test environment with 3 nodes
|
||||
testData, cleanup := setupBatcherWithTestData(t, batcherFunc.fn, 1, 3, NORMAL_BUFFER_SIZE)
|
||||
defer cleanup()
|
||||
|
||||
batcher := testData.Batcher
|
||||
st := testData.State
|
||||
node1 := &testData.Nodes[0]
|
||||
node2 := &testData.Nodes[1]
|
||||
node3 := &testData.Nodes[2]
|
||||
|
||||
t.Logf("Testing issue #2924: Node1=%d, Node2=%d, Node3=%d",
|
||||
node1.n.ID, node2.n.ID, node3.n.ID)
|
||||
|
||||
// Helper to drain channels
|
||||
drainCh := func(ch chan *tailcfg.MapResponse) {
|
||||
for {
|
||||
select {
|
||||
case <-ch:
|
||||
// drain
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start update consumers for all nodes
|
||||
node1.start()
|
||||
node2.start()
|
||||
node3.start()
|
||||
|
||||
defer node1.cleanup()
|
||||
defer node2.cleanup()
|
||||
defer node3.cleanup()
|
||||
|
||||
// Connect all nodes to the batcher
|
||||
require.NoError(t, batcher.AddNode(node1.n.ID, node1.ch, tailcfg.CapabilityVersion(100)))
|
||||
require.NoError(t, batcher.AddNode(node2.n.ID, node2.ch, tailcfg.CapabilityVersion(100)))
|
||||
require.NoError(t, batcher.AddNode(node3.n.ID, node3.ch, tailcfg.CapabilityVersion(100)))
|
||||
|
||||
// Wait for all nodes to be connected
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
assert.True(c, batcher.IsConnected(node1.n.ID), "node1 should be connected")
|
||||
assert.True(c, batcher.IsConnected(node2.n.ID), "node2 should be connected")
|
||||
assert.True(c, batcher.IsConnected(node3.n.ID), "node3 should be connected")
|
||||
}, 5*time.Second, 50*time.Millisecond, "waiting for nodes to connect")
|
||||
|
||||
// Get initial work errors count
|
||||
var initialWorkErrors int64
|
||||
if lfb, ok := unwrapBatcher(batcher).(*LockFreeBatcher); ok {
|
||||
initialWorkErrors = lfb.WorkErrors()
|
||||
t.Logf("Initial work errors: %d", initialWorkErrors)
|
||||
}
|
||||
|
||||
// Clear channels to prepare for the test
|
||||
drainCh(node1.ch)
|
||||
drainCh(node2.ch)
|
||||
drainCh(node3.ch)
|
||||
|
||||
// Get node view for deletion
|
||||
nodeToDelete, ok := st.GetNodeByID(node3.n.ID)
|
||||
require.True(t, ok, "node3 should exist in state")
|
||||
|
||||
// Delete the node from state - this returns a NodeRemoved change
|
||||
// In production, this change is sent to batcher via app.Change()
|
||||
nodeChange, err := st.DeleteNode(nodeToDelete)
|
||||
require.NoError(t, err, "should be able to delete node from state")
|
||||
t.Logf("Deleted node %d from state, change: %s", node3.n.ID, nodeChange.Reason)
|
||||
|
||||
// Verify node is deleted from state
|
||||
_, exists := st.GetNodeByID(node3.n.ID)
|
||||
require.False(t, exists, "node3 should be deleted from state")
|
||||
|
||||
// Send the NodeRemoved change to batcher (this is what app.Change() does)
|
||||
// With the fix, this should clean up node3 from batcher's internal state
|
||||
batcher.AddWork(nodeChange)
|
||||
|
||||
// Wait for the batcher to process the removal and clean up the node
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
assert.False(c, batcher.IsConnected(node3.n.ID), "node3 should be disconnected from batcher")
|
||||
}, 5*time.Second, 50*time.Millisecond, "waiting for node removal to be processed")
|
||||
|
||||
t.Logf("Node %d connected in batcher after NodeRemoved: %v", node3.n.ID, batcher.IsConnected(node3.n.ID))
|
||||
|
||||
// Now queue changes that would have caused errors before the fix
|
||||
// With the fix, these should NOT cause "node not found" errors
|
||||
// because node3 was cleaned up when NodeRemoved was processed
|
||||
batcher.AddWork(change.FullUpdate())
|
||||
batcher.AddWork(change.PolicyChange())
|
||||
|
||||
// Wait for work to be processed and verify no errors occurred
|
||||
// With the fix, no new errors should occur because the deleted node
|
||||
// was cleaned up from batcher state when NodeRemoved was processed
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
var finalWorkErrors int64
|
||||
if lfb, ok := unwrapBatcher(batcher).(*LockFreeBatcher); ok {
|
||||
finalWorkErrors = lfb.WorkErrors()
|
||||
}
|
||||
|
||||
newErrors := finalWorkErrors - initialWorkErrors
|
||||
assert.Zero(c, newErrors, "Fix for #2924: should have no work errors after node deletion")
|
||||
}, 5*time.Second, 100*time.Millisecond, "waiting for work processing to complete without errors")
|
||||
|
||||
// Verify remaining nodes still work correctly
|
||||
drainCh(node1.ch)
|
||||
drainCh(node2.ch)
|
||||
batcher.AddWork(change.NodeAdded(node1.n.ID))
|
||||
|
||||
assert.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
// Node 1 and 2 should receive updates
|
||||
stats1 := NodeStats{TotalUpdates: atomic.LoadInt64(&node1.updateCount)}
|
||||
stats2 := NodeStats{TotalUpdates: atomic.LoadInt64(&node2.updateCount)}
|
||||
assert.Positive(c, stats1.TotalUpdates, "node1 should have received updates")
|
||||
assert.Positive(c, stats2.TotalUpdates, "node2 should have received updates")
|
||||
}, 5*time.Second, 100*time.Millisecond, "waiting for remaining nodes to receive updates")
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// unwrapBatcher extracts the underlying batcher from wrapper types.
|
||||
func unwrapBatcher(b Batcher) Batcher {
|
||||
if wrapper, ok := b.(*testBatcherWrapper); ok {
|
||||
return unwrapBatcher(wrapper.Batcher)
|
||||
}
|
||||
|
||||
return b
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user