Mirror of https://github.com/juanfont/headscale.git, synced 2026-04-25 10:08:41 +02:00.
mapper: close stale map channels after send timeouts
When the batcher timed out sending to a node, it removed the channel from multiChannelNodeConn but left the old serveLongPoll goroutine running on that channel. That left a live stale session behind: it no longer received new updates, but it could still keep the stream open and block shutdown. Close the pruned channel when stale-send cleanup removes it so the old map session exits after draining any buffered update.
This commit is contained in:
hscontrol/poll_test.go (new file, +188 lines)
@@ -0,0 +1,188 @@
|
||||
package hscontrol
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/juanfont/headscale/hscontrol/mapper"
|
||||
"github.com/juanfont/headscale/hscontrol/state"
|
||||
"github.com/juanfont/headscale/hscontrol/types/change"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"tailscale.com/tailcfg"
|
||||
)
|
||||
|
||||
// delayedSuccessResponseWriter is a test double for the long-poll response
// writer. Its FIRST Write sleeps for firstWriteDelay before succeeding;
// every later Write succeeds immediately. This simulates a transiently
// wedged map-response stream: long enough for the batcher to hit its
// stale-send timeout, short enough that the session can still recover.
type delayedSuccessResponseWriter struct {
	header http.Header

	// firstWriteDelay is how long the first Write blocks before returning.
	firstWriteDelay time.Duration

	// firstWriteStarted is closed once the first Write has begun its delay.
	firstWriteStarted     chan struct{}
	firstWriteStartedOnce sync.Once

	// firstWriteFinished is closed once the first Write has completed.
	firstWriteFinished     chan struct{}
	firstWriteFinishedOnce sync.Once

	// mu guards writeCount; Write runs on the session goroutine while the
	// test polls WriteCount from its own goroutine.
	mu         sync.Mutex
	writeCount int
}

// newDelayedSuccessResponseWriter returns a writer whose first Write blocks
// for firstWriteDelay and then succeeds; subsequent writes are immediate.
func newDelayedSuccessResponseWriter(firstWriteDelay time.Duration) *delayedSuccessResponseWriter {
	return &delayedSuccessResponseWriter{
		header:             make(http.Header),
		firstWriteDelay:    firstWriteDelay,
		firstWriteStarted:  make(chan struct{}),
		firstWriteFinished: make(chan struct{}),
	}
}

// Header returns the response header map (required by http.ResponseWriter).
func (w *delayedSuccessResponseWriter) Header() http.Header {
	return w.header
}

// WriteHeader discards the status code; the test only cares about body writes.
func (w *delayedSuccessResponseWriter) WriteHeader(int) {}

// Write records the write, delays only the first call by firstWriteDelay, and
// always reports full success. The started/finished channels let the test
// synchronize with the beginning and end of the delayed first write.
func (w *delayedSuccessResponseWriter) Write(data []byte) (int, error) {
	w.mu.Lock()
	w.writeCount++
	writeCount := w.writeCount
	w.mu.Unlock()

	if writeCount == 1 {
		// Only the first write is delayed. This simulates a transiently wedged map response:
		// long enough to make the batcher time out future sends,
		// but short enough that the old session can still recover if we leave it alive
		w.firstWriteStartedOnce.Do(func() {
			close(w.firstWriteStarted)
		})

		time.Sleep(w.firstWriteDelay)

		w.firstWriteFinishedOnce.Do(func() {
			close(w.firstWriteFinished)
		})
	}

	return len(data), nil
}

// Flush is a no-op; it exists so the writer satisfies http.Flusher.
func (w *delayedSuccessResponseWriter) Flush() {}

// FirstWriteStarted is closed when the first Write has entered its delay.
func (w *delayedSuccessResponseWriter) FirstWriteStarted() <-chan struct{} {
	return w.firstWriteStarted
}

// FirstWriteFinished is closed when the first Write has returned.
func (w *delayedSuccessResponseWriter) FirstWriteFinished() <-chan struct{} {
	return w.firstWriteFinished
}

// WriteCount reports how many Write calls have been observed so far.
func (w *delayedSuccessResponseWriter) WriteCount() int {
	w.mu.Lock()
	defer w.mu.Unlock()

	return w.writeCount
}
// Reproducer outline:
|
||||
// 1. Start a real long-poll session for one node.
|
||||
// 2. Make the first map write block briefly, so the session stops draining m.ch.
|
||||
// 3. While that write is blocked, queue enough updates to fill the buffered
|
||||
// session channel and make the next batcher send hit the stale-send timeout.
|
||||
// 4. Let the blocked write recover. The stale session should still flush the
|
||||
// update that was already buffered before its channel was pruned.
|
||||
// 5. After that buffered update is drained, the stale session must exit instead
|
||||
// of lingering as an orphaned serveLongPoll goroutine.
|
||||
func TestTransientlyBlockedWriteDoesNotLeaveLiveStaleSession(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
app := createTestApp(t)
|
||||
user := app.state.CreateUserForTest("poll-stale-session-user")
|
||||
createdNode := app.state.CreateRegisteredNodeForTest(user, "poll-stale-session-node")
|
||||
require.NoError(t, app.state.UpdatePolicyManagerUsersForTest())
|
||||
|
||||
app.cfg.Tuning.BatchChangeDelay = 20 * time.Millisecond
|
||||
app.cfg.Tuning.NodeMapSessionBufferedChanSize = 1
|
||||
|
||||
app.mapBatcher.Close()
|
||||
require.NoError(t, app.state.Close())
|
||||
|
||||
reloadedState, err := state.NewState(app.cfg)
|
||||
require.NoError(t, err)
|
||||
app.state = reloadedState
|
||||
|
||||
app.mapBatcher = mapper.NewBatcherAndMapper(app.cfg, app.state)
|
||||
app.mapBatcher.Start()
|
||||
|
||||
t.Cleanup(func() {
|
||||
app.mapBatcher.Close()
|
||||
require.NoError(t, app.state.Close())
|
||||
})
|
||||
|
||||
nodeView, ok := app.state.GetNodeByID(createdNode.ID)
|
||||
require.True(t, ok, "expected node to be present in NodeStore after reload")
|
||||
require.True(t, nodeView.Valid(), "expected valid node view after reload")
|
||||
node := nodeView.AsStruct()
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
writer := newDelayedSuccessResponseWriter(250 * time.Millisecond)
|
||||
session := app.newMapSession(ctx, tailcfg.MapRequest{
|
||||
Stream: true,
|
||||
Version: tailcfg.CapabilityVersion(100),
|
||||
}, writer, node)
|
||||
|
||||
serveDone := make(chan struct{})
|
||||
go func() {
|
||||
session.serveLongPoll()
|
||||
close(serveDone)
|
||||
}()
|
||||
|
||||
t.Cleanup(func() {
|
||||
dummyCh := make(chan *tailcfg.MapResponse, 1)
|
||||
_ = app.mapBatcher.AddNode(node.ID, dummyCh, tailcfg.CapabilityVersion(100))
|
||||
cancel()
|
||||
select {
|
||||
case <-serveDone:
|
||||
case <-time.After(2 * time.Second):
|
||||
}
|
||||
_ = app.mapBatcher.RemoveNode(node.ID, dummyCh)
|
||||
})
|
||||
|
||||
select {
|
||||
case <-writer.FirstWriteStarted():
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("expected initial map write to start")
|
||||
}
|
||||
|
||||
streamsClosed := make(chan struct{})
|
||||
go func() {
|
||||
app.clientStreamsOpen.Wait()
|
||||
close(streamsClosed)
|
||||
}()
|
||||
|
||||
// One update fills the buffered session channel while the first write is blocked.
|
||||
// The second update then hits the 50ms stale-send timeout and the batcher prunes
|
||||
// and closes that stale channel.
|
||||
app.mapBatcher.AddWork(change.SelfUpdate(node.ID), change.SelfUpdate(node.ID))
|
||||
|
||||
select {
|
||||
case <-writer.FirstWriteFinished():
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("expected the blocked write to eventually complete")
|
||||
}
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
return writer.WriteCount() >= 2
|
||||
}, 2*time.Second, 20*time.Millisecond, "session should flush the update that was already buffered before the stale send")
|
||||
|
||||
assert.Eventually(t, func() bool {
|
||||
select {
|
||||
case <-streamsClosed:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}, time.Second, 20*time.Millisecond, "after stale-send cleanup, the stale session should exit")
|
||||
}
|
||||
Reference in New Issue
Block a user