mirror of
https://github.com/juanfont/headscale.git
synced 2026-04-23 17:18:50 +02:00
hscontrol/poll,state: fix grace period disconnect TOCTOU race
When a node disconnects, serveLongPoll defers a cleanup that starts a grace period goroutine. This goroutine polls batcher.IsConnected() and, if the node has not reconnected within ~10 seconds, calls state.Disconnect() to mark it offline.

A TOCTOU (time-of-check to time-of-use) race exists: the node can reconnect (calling Connect()) between the IsConnected check and the Disconnect() call, causing the stale Disconnect() to overwrite the new session's online status.

Fix with a monotonic per-node generation counter:

- State.Connect() increments the counter and returns the current generation alongside the change list.
- State.Disconnect() accepts the generation from the caller and rejects the call if a newer generation exists, making stale disconnects from old sessions a no-op.
- serveLongPoll captures the generation at Connect() time and passes it to Disconnect() in the deferred cleanup.
- RemoveNode's return value is now checked: if another session already owns the batcher slot (reconnect happened), the old session skips the grace period entirely.

Update batcher_test.go to track per-node connect generations and pass them through to Disconnect(), matching production behavior.

Fixes the following test failures:

- server_state_online_after_reconnect_within_grace
- update_history_no_false_offline
- nodestore_correct_after_rapid_reconnect
- rapid_reconnect_peer_never_sees_offline
This commit is contained in:
@@ -39,14 +39,20 @@ type testBatcherWrapper struct {
|
||||
*Batcher
|
||||
|
||||
state *state.State
|
||||
|
||||
// connectGens tracks per-node connect generations so RemoveNode can pass
|
||||
// the correct generation to State.Disconnect(), matching production behavior.
|
||||
connectGens sync.Map // types.NodeID → uint64
|
||||
}
|
||||
|
||||
func (t *testBatcherWrapper) AddNode(id types.NodeID, c chan<- *tailcfg.MapResponse, version tailcfg.CapabilityVersion, stop func()) error {
|
||||
// Mark node as online in state before AddNode to match production behavior
|
||||
// This ensures the NodeStore has correct online status for change processing
|
||||
if t.state != nil {
|
||||
// Use Connect to properly mark node online in NodeStore but don't send its changes
|
||||
_ = t.state.Connect(id)
|
||||
// Use Connect to properly mark node online in NodeStore and track the
|
||||
// generation so RemoveNode can pass it to Disconnect().
|
||||
_, gen := t.state.Connect(id)
|
||||
t.connectGens.Store(id, gen)
|
||||
}
|
||||
|
||||
// First add the node to the real batcher
|
||||
@@ -71,8 +77,15 @@ func (t *testBatcherWrapper) RemoveNode(id types.NodeID, c chan<- *tailcfg.MapRe
|
||||
// Mark node as offline in state BEFORE removing from batcher
|
||||
// This ensures the NodeStore has correct offline status when the change is processed
|
||||
if t.state != nil {
|
||||
// Use Disconnect to properly mark node offline in NodeStore but don't send its changes
|
||||
_, _ = t.state.Disconnect(id)
|
||||
var gen uint64
|
||||
|
||||
if v, ok := t.connectGens.LoadAndDelete(id); ok {
|
||||
if g, ok := v.(uint64); ok {
|
||||
gen = g
|
||||
}
|
||||
}
|
||||
|
||||
_, _ = t.state.Disconnect(id, gen)
|
||||
}
|
||||
|
||||
// Send the offline notification that poll.go would normally send
|
||||
|
||||
Reference in New Issue
Block a user