Commit 92922c6

Gracefully Shutdown Foreground Server on Interrupt (#2927)
Summary

This pull request is a second attempt at #2863: gracefully shutting down foreground servers when receiving a SIGINT. This PR takes greater care to fix tests which became flaky with the original change. Fixes #1855.

Fixed Tests and Their Changes

osv_mcp_server_test.go

This file has a test that became flaky, specifically the "Running OSV MCP server in the foreground" tests. This is the only e2e test that exercises the foreground server the way a user would. I made the following changes to the test/code to ensure it passes:

- Update the test to document how SIGINT should behave rather than using thv stop/rm.
- Simplify the handling of context cancellation in runForeground. Previously, we had two goroutines doing parallel shutdown work: workloadManager.RunWorkload and runForeground. runForeground calls RunWorkload, so it is natural to block within runForeground until RunWorkload returns. Previously, the two shutdown routines could race on modifying/deleting the workload.

fetch_mcp_server_test.go

This file, and likely others, exercises --foreground indirectly. The fetch tests are structured to stop a server with a shared name in between tests. Previously, thv stop sent a SIGINT to the background thv restart --foreground process that was spawned by thv run. When the background process received that signal, the signal handler removed by this PR called os.Exit, causing the process to exit and stop babysitting the workload.

The problem and solution

thv restart was intentionally coded to ignore context cancellation because we wanted a timeout around the restart's initial startup. Because thv restart ignored all context cancellation, it effectively ignored all SIGINTs, and the process would continually resuscitate the server with the stale state we were attempting to change across tests. The solution preserves the timeout on startup but ensures post-startup context cancellation is respected.
1 parent 5084b21 commit 92922c6
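
The change wires one cancellation path from the OS signal down to workload cleanup: the signal handler cancels a root context, the foreground supervisor unblocks once the workload has stopped, and cleanup then runs on a fresh bounded context. Below is a condensed sketch of that flow in Go; names such as runWorkload and "example-server" are simplified stand-ins, not the toolhive API.

package main

import (
	"context"
	"errors"
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"
)

// runWorkload stands in for workloadManager.RunWorkload: it blocks until the
// context is cancelled or the workload fails, stopping the server either way.
func runWorkload(ctx context.Context) error {
	<-ctx.Done()
	return ctx.Err()
}

// cleanupAndWait stands in for the post-exit cleanup: it deletes the workload
// on a background context with its own timeout, independent of the cancelled ctx.
func cleanupAndWait(name string) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	_ = cleanupCtx // the real code deletes the workload here using this bounded context
	log.Printf("cleaned up %s", name)
}

func main() {
	// Signal handler owns the root context; cancel() replaces the old os.Exit,
	// so cancellation now flows down to the workload instead of killing the process.
	ctx, cancel := context.WithCancel(context.Background())
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
	go func() {
		<-sigCh
		cancel()
	}()

	// Foreground path: block until the workload supervisor returns, then clean up.
	errCh := make(chan error, 1)
	go func() { errCh <- runWorkload(ctx) }()

	err := <-errCh // blocks until the workload has stopped (e.g. after Ctrl+C)
	cleanupAndWait("example-server")
	if err != nil && !errors.Is(err, context.Canceled) {
		os.Exit(1)
	}
}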

File tree

6 files changed: +139 -98 lines changed

cmd/thv/app/run.go

Lines changed: 14 additions & 25 deletions

@@ -7,9 +7,7 @@ import (
 	"net"
 	"net/url"
 	"os"
-	"os/signal"
 	"strings"
-	"syscall"
 	"time"

 	"github.com/spf13/cobra"
@@ -126,7 +124,7 @@ func init() {
 	AddOIDCFlags(runCmd)
 }

-func cleanupAndWait(workloadManager workloads.Manager, name string, cancel context.CancelFunc, errCh <-chan error) {
+func cleanupAndWait(workloadManager workloads.Manager, name string) {
 	cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cleanupCancel()

@@ -138,13 +136,6 @@ func cleanupAndWait(workloadManager workloads.Manager, name string, cancel conte
 		logger.Warnf("DeleteWorkloads group error for %q: %v", name, err)
 	}
 	}
-
-	cancel()
-	select {
-	case <-errCh:
-	case <-time.After(5 * time.Second):
-		logger.Warnf("Timeout waiting for workload to stop")
-	}
 }

 // nolint:gocyclo // This function is complex by design
@@ -304,28 +295,26 @@ func getworkloadDefaultName(_ context.Context, serverOrImage string) string {
 }

 func runForeground(ctx context.Context, workloadManager workloads.Manager, runnerConfig *runner.RunConfig) error {
-	ctx, cancel := context.WithCancel(ctx)
-	defer cancel()
-
-	sigCh := make(chan os.Signal, 1)
-	signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
-	defer signal.Stop(sigCh)

 	errCh := make(chan error, 1)
 	go func() {
 		errCh <- workloadManager.RunWorkload(ctx, runnerConfig)
 	}()

-	select {
-	case sig := <-sigCh:
-		if !process.IsDetached() {
-			logger.Infof("Received signal: %v, stopping server %q", sig, runnerConfig.BaseName)
-			cleanupAndWait(workloadManager, runnerConfig.BaseName, cancel, errCh)
-		}
-		return nil
-	case err := <-errCh:
-		return err
+	// workloadManager.RunWorkload will block until the context is cancelled
+	// or an unrecoverable error is returned. In either case, it will stop the server.
+	// We wait until workloadManager.RunWorkload exits before deleting the workload,
+	// so stopping and deleting don't race.
+	//
+	// There's room for improvement in the factoring here.
+	// Shutdown and cancellation logic is unnecessarily spread across two goroutines.
+	err := <-errCh
+	if !process.IsDetached() {
+		logger.Infof("RunWorkload Exited. Error: %v, stopping server %q", err, runnerConfig.BaseName)
+		cleanupAndWait(workloadManager, runnerConfig.BaseName)
 	}
+	return err
+
 }

 func validateGroup(ctx context.Context, workloadsManager workloads.Manager, serverOrImage string) error {
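
In short, the new runForeground waits for RunWorkload to return before touching the workload again. A minimal sketch of that ordering, with an illustrative signature rather than the real workloads.Manager interface:

package app

import "context"

// superviseForeground mirrors the new runForeground shape (illustrative
// signature, not the workloads.Manager interface): deletion only starts
// after run has fully returned and stopped the server, so the stop and
// delete paths can no longer race on the same workload.
func superviseForeground(ctx context.Context, run func(context.Context) error, deleteWorkload func()) error {
	errCh := make(chan error, 1)
	go func() { errCh <- run(ctx) }()

	err := <-errCh // run has already stopped the server by the time this returns
	deleteWorkload()
	return err
}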

cmd/thv/main.go

Lines changed: 10 additions & 5 deletions

@@ -2,6 +2,7 @@
 package main

 import (
+	"context"
 	"os"
 	"os/signal"
 	"syscall"
@@ -12,7 +13,6 @@ import (
 	"github.com/stacklok/toolhive/cmd/thv/app"
 	"github.com/stacklok/toolhive/pkg/client"
 	"github.com/stacklok/toolhive/pkg/container"
-	"github.com/stacklok/toolhive/pkg/container/runtime"
 	"github.com/stacklok/toolhive/pkg/lockfile"
 	"github.com/stacklok/toolhive/pkg/logger"
 	"github.com/stacklok/toolhive/pkg/migration"
@@ -23,7 +23,7 @@ func main() {
 	logger.Initialize()

 	// Setup signal handling for graceful cleanup
-	setupSignalHandler()
+	ctx := setupSignalHandler()

 	// Clean up stale lock files on startup
 	cleanupStaleLockFiles()
@@ -47,8 +47,10 @@ func main() {
 		migration.CheckAndPerformDefaultGroupMigration()
 	}

+	cmd := app.NewRootCmd(!app.IsCompletionCommand(os.Args))
+
 	// Skip update check for completion command or if we are running in kubernetes
-	if err := app.NewRootCmd(!app.IsCompletionCommand(os.Args) && !runtime.IsKubernetesRuntime()).Execute(); err != nil {
+	if err := cmd.ExecuteContext(ctx); err != nil {
 		// Clean up any remaining lock files on error exit
 		lockfile.CleanupAllLocks()
 		os.Exit(1)
@@ -59,16 +61,19 @@ func main() {
 }

 // setupSignalHandler configures signal handling to ensure lock files are cleaned up
-func setupSignalHandler() {
+func setupSignalHandler() context.Context {
 	sigCh := make(chan os.Signal, 1)
 	signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)

+	ctx, cancel := context.WithCancel(context.Background())
 	go func() {
 		<-sigCh
 		logger.Debugf("Received signal, cleaning up lock files...")
 		lockfile.CleanupAllLocks()
-		os.Exit(0)
+		cancel()
 	}()
+
+	return ctx
 }

 // cleanupStaleLockFiles removes stale lock files from known directories on startup
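
With ExecuteContext, the context produced by setupSignalHandler reaches every cobra handler via cmd.Context(). A hedged sketch of the same wiring using the standard library's signal.NotifyContext (a hypothetical command; the real setupSignalHandler also cleans up lock files before cancelling, which NotifyContext does not do):

package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"syscall"

	"github.com/spf13/cobra"
)

func main() {
	// Same effect as a hand-rolled handler: a context cancelled on SIGINT/SIGTERM.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer stop()

	root := &cobra.Command{
		Use: "example", // hypothetical command, for illustration only
		RunE: func(cmd *cobra.Command, _ []string) error {
			<-cmd.Context().Done() // cobra hands the ExecuteContext context to handlers
			fmt.Println("interrupted, shutting down")
			return nil
		},
	}

	if err := root.ExecuteContext(ctx); err != nil {
		os.Exit(1)
	}
}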

pkg/runner/runner.go

Lines changed: 8 additions & 11 deletions

@@ -8,9 +8,7 @@ import (
 	"fmt"
 	"net/http"
 	"os"
-	"os/signal"
 	"strings"
-	"syscall"
 	"time"

 	"golang.org/x/oauth2"
@@ -317,16 +315,19 @@ func (r *Runner) Run(ctx context.Context) error {

 	// Define a function to stop the MCP server
 	stopMCPServer := func(reason string) {
+		// Use a background context to avoid cancellation of the main context.
+		cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 1*time.Minute)
+		defer cleanupCancel()
 		logger.Infof("Stopping MCP server: %s", reason)

 		// Stop the transport (which also stops the container, monitoring, and handles removal)
 		logger.Infof("Stopping %s transport...", r.Config.Transport)
-		if err := transportHandler.Stop(ctx); err != nil {
+		if err := transportHandler.Stop(cleanupCtx); err != nil {
 			logger.Warnf("Warning: Failed to stop transport: %v", err)
 		}

 		// Cleanup telemetry provider
-		if err := r.Cleanup(ctx); err != nil {
+		if err := r.Cleanup(cleanupCtx); err != nil {
 			logger.Warnf("Warning: Failed to cleanup telemetry: %v", err)
 		}

@@ -335,7 +336,7 @@ func (r *Runner) Run(ctx context.Context) error {
 		if err := process.RemovePIDFile(r.Config.BaseName); err != nil {
 			logger.Warnf("Warning: Failed to remove PID file: %v", err)
 		}
-		if err := r.statusManager.ResetWorkloadPID(ctx, r.Config.BaseName); err != nil {
+		if err := r.statusManager.ResetWorkloadPID(cleanupCtx, r.Config.BaseName); err != nil {
 			logger.Warnf("Warning: Failed to reset workload %s PID: %v", r.Config.ContainerName, err)
 		}

@@ -354,10 +355,6 @@ func (r *Runner) Run(ctx context.Context) error {
 		logger.Info("Press Ctrl+C to stop or wait for container to exit")
 	}

-	// Set up signal handling
-	sigCh := make(chan os.Signal, 1)
-	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
-
 	// Create a done channel to signal when the server has been stopped
 	doneCh := make(chan struct{})

@@ -399,8 +396,8 @@ func (r *Runner) Run(ctx context.Context) error {

 	// Wait for either a signal or the done channel to be closed
 	select {
-	case sig := <-sigCh:
-		stopMCPServer(fmt.Sprintf("Received signal %s", sig))
+	case <-ctx.Done():
+		stopMCPServer("Context cancelled")
 	case <-doneCh:
 		// The transport has already been stopped (likely by the container exit)
 		// Clean up the PID file and state
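
The shutdown closure deliberately avoids the run context, which is already cancelled by the time cleanup starts; a fresh context with its own deadline keeps the teardown calls from failing immediately. A minimal sketch of the pattern (stopTransport is a stand-in, not the transport package API):

package runnerexample

import (
	"context"
	"log"
	"time"
)

// shutdown runs teardown on its own context so that the cancellation which
// triggered the shutdown cannot also abort the cleanup calls.
// stopTransport is a stand-in for the real transport/telemetry teardown.
func shutdown(stopTransport func(context.Context) error) {
	cleanupCtx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
	defer cancel()

	if err := stopTransport(cleanupCtx); err != nil {
		log.Printf("warning: failed to stop transport: %v", err)
	}
}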

pkg/workloads/manager.go

Lines changed: 58 additions & 27 deletions

@@ -872,7 +872,7 @@ func (d *DefaultManager) DeleteWorkloads(_ context.Context, names []string) (*er
 }

 // RestartWorkloads restarts the specified workloads by name.
-func (d *DefaultManager) RestartWorkloads(_ context.Context, names []string, foreground bool) (*errgroup.Group, error) {
+func (d *DefaultManager) RestartWorkloads(ctx context.Context, names []string, foreground bool) (*errgroup.Group, error) {
 	// Validate all workload names to prevent path traversal attacks
 	for _, name := range names {
 		if err := types.ValidateWorkloadName(name); err != nil {
@@ -884,7 +884,7 @@ func (d *DefaultManager) RestartWorkloads(_ context.Context, names []string, for

 	for _, name := range names {
 		group.Go(func() error {
-			return d.restartSingleWorkload(name, foreground)
+			return d.restartSingleWorkload(ctx, name, foreground)
 		})
 	}

@@ -943,39 +943,59 @@ func (d *DefaultManager) updateSingleWorkload(workloadName string, newConfig *ru
 }

 // restartSingleWorkload handles the restart logic for a single workload
-func (d *DefaultManager) restartSingleWorkload(name string, foreground bool) error {
-	// Create a child context with a longer timeout
-	childCtx, cancel := context.WithTimeout(context.Background(), AsyncOperationTimeout)
-	defer cancel()
+func (d *DefaultManager) restartSingleWorkload(ctx context.Context, name string, foreground bool) error {

 	// First, try to load the run configuration to check if it's a remote workload
-	runConfig, err := runner.LoadState(childCtx, name)
+	runConfig, err := runner.LoadState(ctx, name)
 	if err != nil {
 		// If we can't load the state, it might be a container workload or the workload doesn't exist
 		// Try to restart it as a container workload
-		return d.restartContainerWorkload(childCtx, name, foreground)
+		return d.restartContainerWorkload(ctx, name, foreground)
 	}

 	// Check if this is a remote workload
 	if runConfig.RemoteURL != "" {
-		return d.restartRemoteWorkload(childCtx, name, runConfig, foreground)
+		return d.restartRemoteWorkload(ctx, name, runConfig, foreground)
 	}

 	// This is a container-based workload
-	return d.restartContainerWorkload(childCtx, name, foreground)
+	return d.restartContainerWorkload(ctx, name, foreground)
 }

 // restartRemoteWorkload handles restarting a remote workload
+// It blocks until the context is cancelled or there is already a supervisor process running.
 func (d *DefaultManager) restartRemoteWorkload(
 	ctx context.Context,
 	name string,
 	runConfig *runner.RunConfig,
 	foreground bool,
 ) error {
+	mcpRunner, err := d.maybeSetupRemoteWorkload(ctx, name, runConfig)
+	if err != nil {
+		return fmt.Errorf("failed to setup remote workload: %w", err)
+	}
+
+	if mcpRunner == nil {
+		return nil
+	}
+
+	return d.startWorkload(ctx, name, mcpRunner, foreground)
+}
+
+// maybeSetupRemoteWorkload is the startup steps for a remote workload.
+// A runner may not be returned if the workload is already running and supervised.
+func (d *DefaultManager) maybeSetupRemoteWorkload(
+	ctx context.Context,
+	name string,
+	runConfig *runner.RunConfig,
+) (*runner.Runner, error) {
+	ctx, cancel := context.WithTimeout(ctx, AsyncOperationTimeout)
+	defer cancel()
+
 	// Get workload status using the status manager
 	workload, err := d.statuses.GetWorkload(ctx, name)
 	if err != nil && !errors.Is(err, rt.ErrWorkloadNotFound) {
-		return err
+		return nil, err
 	}

 	// If workload is already running, check if the supervisor process is healthy
@@ -986,7 +1006,7 @@ func (d *DefaultManager) restartRemoteWorkload(
 	if supervisorAlive {
 		// Workload is running and healthy - preserve old behavior (no-op)
 		logger.Infof("Remote workload %s is already running", name)
-		return nil
+		return nil, nil
 	}

 	// Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
@@ -1015,7 +1035,7 @@ func (d *DefaultManager) restartRemoteWorkload(
 	// Load runner configuration from state
 	mcpRunner, err := d.loadRunnerFromState(ctx, runConfig.BaseName)
 	if err != nil {
-		return fmt.Errorf("failed to load state for %s: %v", runConfig.BaseName, err)
+		return nil, fmt.Errorf("failed to load state for %s: %v", runConfig.BaseName, err)
 	}

 	// Set status to starting
@@ -1024,16 +1044,31 @@ func (d *DefaultManager) restartRemoteWorkload(
 	}

 	logger.Infof("Loaded configuration from state for %s", runConfig.BaseName)
+	return mcpRunner, nil
+}
+
+// restartContainerWorkload handles restarting a container-based workload.
+// It blocks until the context is cancelled or there is already a supervisor process running.
+func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name string, foreground bool) error {
+	workloadName, mcpRunner, err := d.maybeSetupContainerWorkload(ctx, name)
+	if err != nil {
+		return fmt.Errorf("failed to setup container workload: %w", err)
+	}
+
+	if mcpRunner == nil {
+		return nil
+	}

-	// Start the remote workload using the loaded runner
-	// Use background context to avoid timeout cancellation - same reasoning as container workloads
-	return d.startWorkload(context.Background(), name, mcpRunner, foreground)
+	return d.startWorkload(ctx, workloadName, mcpRunner, foreground)
 }

-// restartContainerWorkload handles restarting a container-based workload
+// maybeSetupContainerWorkload is the startup steps for a container-based workload.
+// A runner may not be returned if the workload is already running and supervised.
 //
 //nolint:gocyclo // Complexity is justified - handles multiple restart scenarios and edge cases
-func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name string, foreground bool) error {
+func (d *DefaultManager) maybeSetupContainerWorkload(ctx context.Context, name string) (string, *runner.Runner, error) {
+	ctx, cancel := context.WithTimeout(ctx, AsyncOperationTimeout)
+	defer cancel()
 	// Get container info to resolve partial names and extract proper workload name
 	var containerName string
 	var workloadName string
@@ -1057,7 +1092,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
 	// Get workload status using the status manager
 	workload, err := d.statuses.GetWorkload(ctx, name)
 	if err != nil && !errors.Is(err, rt.ErrWorkloadNotFound) {
-		return err
+		return "", nil, err
 	}

 	// Check if workload is running and healthy (including supervisor process)
@@ -1068,7 +1103,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
 	if supervisorAlive {
 		// Workload is running and healthy - preserve old behavior (no-op)
 		logger.Infof("Container %s is already running", containerName)
-		return nil
+		return "", nil, nil
 	}

 	// Supervisor is dead/missing - we need to clean up and restart to fix the damaged state
@@ -1107,7 +1142,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
 		if statusErr := d.statuses.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusError, err.Error()); statusErr != nil {
 			logger.Warnf("Failed to set workload %s status to error: %v", workloadName, statusErr)
 		}
-		return fmt.Errorf("failed to stop container %s: %v", containerName, err)
+		return "", nil, fmt.Errorf("failed to stop container %s: %v", containerName, err)
 	}
 	logger.Infof("Container %s stopped", containerName)
 }
@@ -1126,7 +1161,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
 	// Load runner configuration from state
 	mcpRunner, err := d.loadRunnerFromState(ctx, workloadName)
 	if err != nil {
-		return fmt.Errorf("failed to load state for %s: %v", workloadName, err)
+		return "", nil, fmt.Errorf("failed to load state for %s: %v", workloadName, err)
 	}

 	// Set workload status to starting - use the workload name for status operations
@@ -1135,11 +1170,7 @@ func (d *DefaultManager) restartContainerWorkload(ctx context.Context, name stri
 	}
 	logger.Infof("Loaded configuration from state for %s", workloadName)

-	// Start the workload with background context to avoid timeout cancellation
-	// The ctx with AsyncOperationTimeout is only for the restart setup operations,
-	// but the actual workload should run indefinitely with its own lifecycle management
-	// Use workload name for user-facing operations
-	return d.startWorkload(context.Background(), workloadName, mcpRunner, foreground)
+	return workloadName, mcpRunner, nil
 }

 // startWorkload starts the workload in either foreground or background mode
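
The restart path now separates setup from supervision: the maybeSetup* helpers bound only the setup steps with AsyncOperationTimeout, while the long-running start inherits the caller's context so a later SIGINT still cancels it. A hedged sketch of that factoring (setup, start, and the timeout value are illustrative stand-ins):

package workloadsexample

import (
	"context"
	"time"
)

// asyncOperationTimeout mirrors the bounded window applied to restart setup only.
const asyncOperationTimeout = 5 * time.Minute // illustrative value

// restart bounds the setup phase with a timeout, but runs the workload itself
// on the caller's context so post-startup cancellation (e.g. SIGINT) is honored.
func restart(ctx context.Context, setup func(context.Context) error, start func(context.Context) error) error {
	setupCtx, cancel := context.WithTimeout(ctx, asyncOperationTimeout)
	defer cancel()
	if err := setup(setupCtx); err != nil {
		return err
	}
	return start(ctx) // note: the parent ctx, not the timed-out setupCtx
}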
