Mirror of https://github.com/juanfont/headscale.git, synced 2026-04-25 10:08:41 +02:00.
mapper: close stale map channels after send timeouts
When the batcher timed out sending to a node, it removed the channel from multiChannelNodeConn but left the old serveLongPoll goroutine running on that channel. That left a live stale session behind: it no longer received new updates, but it could still keep the stream open and block shutdown. Close the pruned channel when stale-send cleanup removes it so the old map session exits after draining any buffered update.
This commit is contained in:
hscontrol/poll_test.go (new file, +188 lines)
@@ -0,0 +1,188 @@
|
||||
package hscontrol
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/juanfont/headscale/hscontrol/mapper"
|
||||
"github.com/juanfont/headscale/hscontrol/state"
|
||||
"github.com/juanfont/headscale/hscontrol/types/change"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"tailscale.com/tailcfg"
|
||||
)
|
||||
|
||||
// delayedSuccessResponseWriter is a test double for the long-poll response
// writer. Its FIRST Write sleeps for firstWriteDelay before succeeding;
// every later Write succeeds immediately. This simulates a transiently
// wedged map-response stream: long enough for the batcher to hit its
// stale-send timeout, short enough that the session can still recover.
type delayedSuccessResponseWriter struct {
	header http.Header

	// firstWriteDelay is how long the first Write blocks before returning.
	firstWriteDelay time.Duration

	// firstWriteStarted is closed once the first Write has begun its delay.
	firstWriteStarted     chan struct{}
	firstWriteStartedOnce sync.Once

	// firstWriteFinished is closed once the first Write has completed.
	firstWriteFinished     chan struct{}
	firstWriteFinishedOnce sync.Once

	// mu guards writeCount; Write runs on the session goroutine while the
	// test polls WriteCount from its own goroutine.
	mu         sync.Mutex
	writeCount int
}

// newDelayedSuccessResponseWriter returns a writer whose first Write blocks
// for firstWriteDelay and then succeeds; subsequent writes are immediate.
func newDelayedSuccessResponseWriter(firstWriteDelay time.Duration) *delayedSuccessResponseWriter {
	return &delayedSuccessResponseWriter{
		header:             make(http.Header),
		firstWriteDelay:    firstWriteDelay,
		firstWriteStarted:  make(chan struct{}),
		firstWriteFinished: make(chan struct{}),
	}
}

// Header returns the response header map (required by http.ResponseWriter).
func (w *delayedSuccessResponseWriter) Header() http.Header {
	return w.header
}

// WriteHeader discards the status code; the test only cares about body writes.
func (w *delayedSuccessResponseWriter) WriteHeader(int) {}

// Write records the write, delays only the first call by firstWriteDelay, and
// always reports full success. The started/finished channels let the test
// synchronize with the beginning and end of the delayed first write.
func (w *delayedSuccessResponseWriter) Write(data []byte) (int, error) {
	w.mu.Lock()
	w.writeCount++
	writeCount := w.writeCount
	w.mu.Unlock()

	if writeCount == 1 {
		// Only the first write is delayed. This simulates a transiently wedged map response:
		// long enough to make the batcher time out future sends,
		// but short enough that the old session can still recover if we leave it alive
		w.firstWriteStartedOnce.Do(func() {
			close(w.firstWriteStarted)
		})

		time.Sleep(w.firstWriteDelay)

		w.firstWriteFinishedOnce.Do(func() {
			close(w.firstWriteFinished)
		})
	}

	return len(data), nil
}

// Flush is a no-op; it exists so the writer satisfies http.Flusher.
func (w *delayedSuccessResponseWriter) Flush() {}

// FirstWriteStarted is closed when the first Write has entered its delay.
func (w *delayedSuccessResponseWriter) FirstWriteStarted() <-chan struct{} {
	return w.firstWriteStarted
}

// FirstWriteFinished is closed when the first Write has returned.
func (w *delayedSuccessResponseWriter) FirstWriteFinished() <-chan struct{} {
	return w.firstWriteFinished
}

// WriteCount reports how many Write calls have been observed so far.
func (w *delayedSuccessResponseWriter) WriteCount() int {
	w.mu.Lock()
	defer w.mu.Unlock()

	return w.writeCount
}
// Reproducer outline:
|
||||
// 1. Start a real long-poll session for one node.
|
||||
// 2. Make the first map write block briefly, so the session stops draining m.ch.
|
||||
// 3. While that write is blocked, queue enough updates to fill the buffered
|
||||
// session channel and make the next batcher send hit the stale-send timeout.
|
||||
// 4. Let the blocked write recover. The stale session should still flush the
|
||||
// update that was already buffered before its channel was pruned.
|
||||
// 5. After that buffered update is drained, the stale session must exit instead
|
||||
// of lingering as an orphaned serveLongPoll goroutine.
|
||||
func TestTransientlyBlockedWriteDoesNotLeaveLiveStaleSession(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
app := createTestApp(t)
|
||||
user := app.state.CreateUserForTest("poll-stale-session-user")
|
||||
createdNode := app.state.CreateRegisteredNodeForTest(user, "poll-stale-session-node")
|
||||
require.NoError(t, app.state.UpdatePolicyManagerUsersForTest())
|
||||
|
||||
app.cfg.Tuning.BatchChangeDelay = 20 * time.Millisecond
|
||||
app.cfg.Tuning.NodeMapSessionBufferedChanSize = 1
|
||||
|
||||
app.mapBatcher.Close()
|
||||
require.NoError(t, app.state.Close())
|
||||
|
||||
reloadedState, err := state.NewState(app.cfg)
|
||||
require.NoError(t, err)
|
||||
app.state = reloadedState
|
||||
|
||||
app.mapBatcher = mapper.NewBatcherAndMapper(app.cfg, app.state)
|
||||
app.mapBatcher.Start()
|
||||
|
||||
t.Cleanup(func() {
|
||||
app.mapBatcher.Close()
|
||||
require.NoError(t, app.state.Close())
|
||||
})
|
||||
|
||||
nodeView, ok := app.state.GetNodeByID(createdNode.ID)
|
||||
require.True(t, ok, "expected node to be present in NodeStore after reload")
|
||||
require.True(t, nodeView.Valid(), "expected valid node view after reload")
|
||||
node := nodeView.AsStruct()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
writer := newDelayedSuccessResponseWriter(250 * time.Millisecond)
|
||||
session := app.newMapSession(ctx, tailcfg.MapRequest{
|
||||
Stream: true,
|
||||
Version: tailcfg.CapabilityVersion(100),
|
||||
}, writer, node)
|
||||
|
||||
serveDone := make(chan struct{})
|
||||
go func() {
|
||||
session.serveLongPoll()
|
||||
close(serveDone)
|
||||
}()
|
||||
|
||||
t.Cleanup(func() {
|
||||
dummyCh := make(chan *tailcfg.MapResponse, 1)
|
||||
_ = app.mapBatcher.AddNode(node.ID, dummyCh, tailcfg.CapabilityVersion(100))
|
||||
cancel()
|
||||
select {
|
||||
case <-serveDone:
|
||||
case <-time.After(2 * time.Second):
|
||||
}
|
||||
_ = app.mapBatcher.RemoveNode(node.ID, dummyCh)
|
||||
})
|
||||
|
||||
select {
|
||||
case <-writer.FirstWriteStarted():
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("expected initial map write to start")
|
||||
}
|
||||
|
||||
streamsClosed := make(chan struct{})
|
||||
go func() {
|
||||
app.clientStreamsOpen.Wait()
|
||||
close(streamsClosed)
|
||||
}()
|
||||
|
||||
// One update fills the buffered session channel while the first write is blocked.
|
||||
// The second update then hits the 50ms stale-send timeout and the batcher prunes
|
||||
// and closes that stale channel.
|
||||
app.mapBatcher.AddWork(change.SelfUpdate(node.ID), change.SelfUpdate(node.ID))
|
||||
|
||||
select {
|
||||
case <-writer.FirstWriteFinished():
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("expected the blocked write to eventually complete")
|
||||
}
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
return writer.WriteCount() >= 2
|
||||
}, 2*time.Second, 20*time.Millisecond, "session should flush the update that was already buffered before the stale send")
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
select {
|
||||
case <-streamsClosed:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}, time.Second, 20*time.Millisecond, "after stale-send cleanup, the stale session should exit")
|
||||
}
|
||||
Reference in New Issue
Block a user