From 0bcfdc29ada99b491cff641b386b28bff8bb4709 Mon Sep 17 00:00:00 2001
From: Kristoffer Dalby <kristoffer@dalby.cc>
Date: Fri, 9 Jan 2026 11:19:11 +0000
Subject: [PATCH] cmd/hi: enable concurrent test execution

Remove the concurrent test prevention logic and update cleanup to use
run ID-based isolation, allowing multiple tests to run simultaneously.

Changes:
- cleanup: Add killTestContainersByRunID() to clean only containers
  belonging to a specific run, add cleanupStaleTestContainers() to
  remove only stopped/exited containers without affecting running tests
- docker: Remove RunningTestInfo, checkForRunningTests(), and related
  error types, update cleanupAfterTest() to use run ID-based cleanup
- run: Remove Force flag and concurrent test prevention check

The test runner now:
- Allows multiple concurrent test runs on the same Docker daemon
- Cleans only stale containers before tests (not running ones)
- Cleans only containers with matching run ID after tests
- Prints run ID and monitoring info for operator visibility
---
 cmd/hi/cleanup.go | 120 ++++++++++++++++++++++++++++++++++++++++++++--
 cmd/hi/docker.go  |  95 ++++--------------------------------
 cmd/hi/run.go     |  73 +---------------------------
 3 files changed, 124 insertions(+), 164 deletions(-)

diff --git a/cmd/hi/cleanup.go b/cmd/hi/cleanup.go
index e8d7b926..7c5b5214 100644
--- a/cmd/hi/cleanup.go
+++ b/cmd/hi/cleanup.go
@@ -18,9 +18,11 @@ import (
 )
 
 // cleanupBeforeTest performs cleanup operations before running tests.
+// Only removes stale (stopped/exited) test containers to avoid interfering with concurrent test runs.
 func cleanupBeforeTest(ctx context.Context) error {
-	if err := killTestContainers(ctx); err != nil {
-		return fmt.Errorf("failed to kill test containers: %w", err)
+	err := cleanupStaleTestContainers(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to clean stale test containers: %w", err)
 	}
 
 	if err := pruneDockerNetworks(ctx); err != nil {
@@ -30,11 +32,25 @@ func cleanupBeforeTest(ctx context.Context) error {
 	return nil
 }
 
-// cleanupAfterTest removes the test container after completion.
-func cleanupAfterTest(ctx context.Context, cli *client.Client, containerID string) error {
-	return cli.ContainerRemove(ctx, containerID, container.RemoveOptions{
+// cleanupAfterTest removes the test container and all associated integration test containers for the run.
+func cleanupAfterTest(ctx context.Context, cli *client.Client, containerID, runID string) error {
+	// Remove the main test container
+	err := cli.ContainerRemove(ctx, containerID, container.RemoveOptions{
 		Force: true,
 	})
+	if err != nil {
+		return fmt.Errorf("failed to remove test container: %w", err)
+	}
+
+	// Clean up integration test containers for this run only
+	if runID != "" {
+		err := killTestContainersByRunID(ctx, runID)
+		if err != nil {
+			return fmt.Errorf("failed to clean up containers for run %s: %w", runID, err)
+		}
+	}
+
+	return nil
 }
 
 // killTestContainers terminates and removes all test containers.
@@ -87,6 +103,100 @@ func killTestContainers(ctx context.Context) error {
 	return nil
 }
 
+// killTestContainersByRunID terminates and removes all test containers for a specific run ID.
+// This function filters containers by the hi.run-id label to only affect containers
+// belonging to the specified test run, leaving other concurrent test runs untouched.
+func killTestContainersByRunID(ctx context.Context, runID string) error {
+	cli, err := createDockerClient()
+	if err != nil {
+		return fmt.Errorf("failed to create Docker client: %w", err)
+	}
+	defer cli.Close()
+
+	// Filter containers by hi.run-id label
+	containers, err := cli.ContainerList(ctx, container.ListOptions{
+		All: true,
+		Filters: filters.NewArgs(
+			filters.Arg("label", "hi.run-id="+runID),
+		),
+	})
+	if err != nil {
+		return fmt.Errorf("failed to list containers for run %s: %w", runID, err)
+	}
+
+	removed := 0
+
+	for _, cont := range containers {
+		// Kill the container if it's running
+		if cont.State == "running" {
+			_ = cli.ContainerKill(ctx, cont.ID, "KILL")
+		}
+
+		// Remove the container with retry logic
+		if removeContainerWithRetry(ctx, cli, cont.ID) {
+			removed++
+		}
+	}
+
+	if removed > 0 {
+		fmt.Printf("Removed %d containers for run ID %s\n", removed, runID)
+	}
+
+	return nil
+}
+
+// cleanupStaleTestContainers removes stopped/exited test containers without affecting running tests.
+// This is useful for cleaning up leftover containers from previous crashed or interrupted test runs
+// without interfering with currently running concurrent tests.
+func cleanupStaleTestContainers(ctx context.Context) error {
+	cli, err := createDockerClient()
+	if err != nil {
+		return fmt.Errorf("failed to create Docker client: %w", err)
+	}
+	defer cli.Close()
+
+	// Only get stopped/exited containers
+	containers, err := cli.ContainerList(ctx, container.ListOptions{
+		All: true,
+		Filters: filters.NewArgs(
+			filters.Arg("status", "exited"),
+			filters.Arg("status", "dead"),
+		),
+	})
+	if err != nil {
+		return fmt.Errorf("failed to list stopped containers: %w", err)
+	}
+
+	removed := 0
+
+	for _, cont := range containers {
+		// Only remove containers that look like test containers
+		shouldRemove := false
+
+		for _, name := range cont.Names {
+			if strings.Contains(name, "headscale-test-suite") ||
+				strings.Contains(name, "hs-") ||
+				strings.Contains(name, "ts-") ||
+				strings.Contains(name, "derp-") {
+				shouldRemove = true
+				break
+			}
+		}
+
+		if shouldRemove {
+			if removeContainerWithRetry(ctx, cli, cont.ID) {
+				removed++
+			}
+		}
+	}
+
+	if removed > 0 {
+		fmt.Printf("Removed %d stale test containers\n", removed)
+	}
+
+	return nil
+}
+
 const (
 	containerRemoveInitialInterval = 100 * time.Millisecond
 	containerRemoveMaxElapsedTime  = 2 * time.Second
diff --git a/cmd/hi/docker.go b/cmd/hi/docker.go
index 65250e65..a6b94b25 100644
--- a/cmd/hi/docker.go
+++ b/cmd/hi/docker.go
@@ -26,93 +26,8 @@ var (
 	ErrTestFailed              = errors.New("test failed")
 	ErrUnexpectedContainerWait = errors.New("unexpected end of container wait")
 	ErrNoDockerContext         = errors.New("no docker context found")
-	ErrAnotherRunInProgress    = errors.New("another integration test run is already in progress")
 )
 
-// RunningTestInfo contains information about a currently running integration test.
-type RunningTestInfo struct {
-	RunID         string
-	ContainerID   string
-	ContainerName string
-	StartTime     time.Time
-	Duration      time.Duration
-	TestPattern   string
-}
-
-// ErrNoRunningTests indicates that no integration test is currently running.
-var ErrNoRunningTests = errors.New("no running tests found")
-
-// checkForRunningTests checks if there's already an integration test running.
-// Returns ErrNoRunningTests if no test is running, or RunningTestInfo with details about the running test.
-func checkForRunningTests(ctx context.Context) (*RunningTestInfo, error) {
-	cli, err := createDockerClient()
-	if err != nil {
-		return nil, fmt.Errorf("failed to create Docker client: %w", err)
-	}
-	defer cli.Close()
-
-	// List all running containers
-	containers, err := cli.ContainerList(ctx, container.ListOptions{
-		All: false, // Only running containers
-	})
-	if err != nil {
-		return nil, fmt.Errorf("failed to list containers: %w", err)
-	}
-
-	// Look for containers with hi.test-type=test-runner label
-	for _, cont := range containers {
-		if cont.Labels != nil && cont.Labels["hi.test-type"] == "test-runner" {
-			// Found a running test runner container
-			runID := cont.Labels["hi.run-id"]
-
-			containerName := ""
-			for _, name := range cont.Names {
-				containerName = strings.TrimPrefix(name, "/")
-
-				break
-			}
-
-			// Get more details via inspection
-			inspect, err := cli.ContainerInspect(ctx, cont.ID)
-			if err != nil {
-				// Return basic info if inspection fails
-				return &RunningTestInfo{
-					RunID:         runID,
-					ContainerID:   cont.ID,
-					ContainerName: containerName,
-				}, nil
-			}
-
-			startTime, _ := time.Parse(time.RFC3339Nano, inspect.State.StartedAt)
-			duration := time.Since(startTime)
-
-			// Try to extract test pattern from command
-			testPattern := ""
-
-			if len(inspect.Config.Cmd) > 0 {
-				for i, arg := range inspect.Config.Cmd {
-					if arg == "-run" && i+1 < len(inspect.Config.Cmd) {
-						testPattern = inspect.Config.Cmd[i+1]
-
-						break
-					}
-				}
-			}
-
-			return &RunningTestInfo{
-				RunID:         runID,
-				ContainerID:   cont.ID,
-				ContainerName: containerName,
-				StartTime:     startTime,
-				Duration:      duration,
-				TestPattern:   testPattern,
-			}, nil
-		}
-	}
-
-	return nil, ErrNoRunningTests
-}
-
 // runTestContainer executes integration tests in a Docker container.
 func runTestContainer(ctx context.Context, config *RunConfig) error {
 	cli, err := createDockerClient()
@@ -174,6 +89,9 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 	}
 
 	log.Printf("Starting test: %s", config.TestPattern)
+	log.Printf("Run ID: %s", runID)
+	log.Printf("Monitor with: docker logs -f %s", containerName)
+	log.Printf("Logs directory: %s", logsDir)
 
 	// Start stats collection for container resource monitoring (if enabled)
 	var statsCollector *StatsCollector
@@ -234,9 +152,12 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 	shouldCleanup := config.CleanAfter && (!config.KeepOnFailure || exitCode == 0)
 	if shouldCleanup {
 		if config.Verbose {
-			log.Printf("Running post-test cleanup...")
+			log.Printf("Running post-test cleanup for run %s...", runID)
 		}
-		if cleanErr := cleanupAfterTest(ctx, cli, resp.ID); cleanErr != nil && config.Verbose {
+
+		cleanErr := cleanupAfterTest(ctx, cli, resp.ID, runID)
+
+		if cleanErr != nil && config.Verbose {
 			log.Printf("Warning: post-test cleanup failed: %v", cleanErr)
 		}
 
diff --git a/cmd/hi/run.go b/cmd/hi/run.go
index 3456b668..1694399d 100644
--- a/cmd/hi/run.go
+++ b/cmd/hi/run.go
@@ -6,7 +6,6 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"strings"
 	"time"
 
 	"github.com/creachadair/command"
@@ -14,65 +13,13 @@ import (
 
 var ErrTestPatternRequired = errors.New("test pattern is required as first argument or use --test flag")
 
-// formatRunningTestError creates a detailed error message about a running test.
-func formatRunningTestError(info *RunningTestInfo) error {
-	var msg strings.Builder
-	msg.WriteString("\n")
-	msg.WriteString("╔══════════════════════════════════════════════════════════════════╗\n")
-	msg.WriteString("║  Another integration test run is already in progress!            ║\n")
-	msg.WriteString("╚══════════════════════════════════════════════════════════════════╝\n")
-	msg.WriteString("\n")
-	msg.WriteString("Running test details:\n")
-	msg.WriteString(fmt.Sprintf("  Run ID:      %s\n", info.RunID))
-	msg.WriteString(fmt.Sprintf("  Container:   %s\n", info.ContainerName))
-
-	if info.TestPattern != "" {
-		msg.WriteString(fmt.Sprintf("  Test:        %s\n", info.TestPattern))
-	}
-
-	if !info.StartTime.IsZero() {
-		msg.WriteString(fmt.Sprintf("  Started:     %s\n", info.StartTime.Format("2006-01-02 15:04:05")))
-		msg.WriteString(fmt.Sprintf("  Running for: %s\n", formatDuration(info.Duration)))
-	}
-
-	msg.WriteString("\n")
-	msg.WriteString("Please wait for the current test to complete, or stop it with:\n")
-	msg.WriteString("  go run ./cmd/hi clean containers\n")
-	msg.WriteString("\n")
-	msg.WriteString("To monitor the running test:\n")
-	msg.WriteString(fmt.Sprintf("  docker logs -f %s\n", info.ContainerName))
-
-	return fmt.Errorf("%w\n%s", ErrAnotherRunInProgress, msg.String())
-}
-
-const secondsPerMinute = 60
-
-// formatDuration formats a duration in a human-readable way.
-func formatDuration(d time.Duration) string {
-	if d < time.Minute {
-		return fmt.Sprintf("%d seconds", int(d.Seconds()))
-	}
-
-	if d < time.Hour {
-		minutes := int(d.Minutes())
-		seconds := int(d.Seconds()) % secondsPerMinute
-
-		return fmt.Sprintf("%d minutes, %d seconds", minutes, seconds)
-	}
-
-	hours := int(d.Hours())
-	minutes := int(d.Minutes()) % secondsPerMinute
-
-	return fmt.Sprintf("%d hours, %d minutes", hours, minutes)
-}
-
 type RunConfig struct {
 	TestPattern   string        `flag:"test,Test pattern to run"`
 	Timeout       time.Duration `flag:"timeout,default=120m,Test timeout"`
 	FailFast      bool          `flag:"failfast,default=true,Stop on first test failure"`
 	UsePostgres   bool          `flag:"postgres,default=false,Use PostgreSQL instead of SQLite"`
 	GoVersion     string        `flag:"go-version,Go version to use (auto-detected from go.mod)"`
-	CleanBefore   bool          `flag:"clean-before,default=true,Clean resources before test"`
+	CleanBefore   bool          `flag:"clean-before,default=true,Clean stale resources before test"`
 	CleanAfter    bool          `flag:"clean-after,default=true,Clean resources after test"`
 	KeepOnFailure bool          `flag:"keep-on-failure,default=false,Keep containers on test failure"`
 	LogsDir       string        `flag:"logs-dir,default=control_logs,Control logs directory"`
@@ -80,7 +27,6 @@ type RunConfig struct {
 	Stats         bool          `flag:"stats,default=false,Collect and display container resource usage statistics"`
 	HSMemoryLimit float64       `flag:"hs-memory-limit,default=0,Fail test if any Headscale container exceeds this memory limit in MB (0 = disabled)"`
 	TSMemoryLimit float64       `flag:"ts-memory-limit,default=0,Fail test if any Tailscale container exceeds this memory limit in MB (0 = disabled)"`
-	Force         bool          `flag:"force,default=false,Kill any running test and start a new one"`
 }
 
 // runIntegrationTest executes the integration test workflow.
@@ -98,23 +44,6 @@ func runIntegrationTest(env *command.Env) error {
 		runConfig.GoVersion = detectGoVersion()
 	}
 
-	// Check if another test run is already in progress
-	runningTest, err := checkForRunningTests(env.Context())
-	if err != nil && !errors.Is(err, ErrNoRunningTests) {
-		log.Printf("Warning: failed to check for running tests: %v", err)
-	} else if runningTest != nil {
-		if runConfig.Force {
-			log.Printf("Force flag set, killing existing test run: %s", runningTest.RunID)
-
-			err = killTestContainers(env.Context())
-			if err != nil {
-				return fmt.Errorf("failed to kill existing test containers: %w", err)
-			}
-		} else {
-			return formatRunningTestError(runningTest)
-		}
-	}
-
 	// Run pre-flight checks
 	if runConfig.Verbose {
 		log.Printf("Running pre-flight system checks...")