@@ -80,8 +80,9 @@ type HTTPTransport struct {
8080 shutdownCh chan struct {}
8181
8282 // Container monitor
83- monitor rt.Monitor
84- errorCh <- chan error
83+ monitor rt.Monitor
84+ monitorRuntime rt.Runtime // Stored for monitor reconnection on container restart
85+ errorCh <- chan error
8586
8687 // Container exit error (for determining if restart is needed)
8788 containerExitErr error
@@ -329,6 +330,7 @@ func (t *HTTPTransport) Start(ctx context.Context) error {
329330 if err != nil {
330331 return fmt .Errorf ("failed to create container monitor: %w" , err )
331332 }
333+ t .monitorRuntime = monitorRuntime // Store for reconnection
332334 t .monitor = container .NewMonitor (monitorRuntime , t .containerName )
333335
334336 // Start monitoring the container
@@ -348,8 +350,15 @@ func (t *HTTPTransport) Stop(ctx context.Context) error {
348350 t .mutex .Lock ()
349351 defer t .mutex .Unlock ()
350352
351- // Signal shutdown
352- close (t .shutdownCh )
353+ // Signal shutdown (guard against double-close if Stop is called
354+ // both from handleContainerExit and externally by the runner)
355+ select {
356+ case <- t .shutdownCh :
357+ // Already closed/stopping
358+ return nil
359+ default :
360+ close (t .shutdownCh )
361+ }
353362
354363 // For remote MCP servers, we don't need container monitoring
355364 if t .remoteURL == "" {
@@ -378,39 +387,89 @@ func (t *HTTPTransport) Stop(ctx context.Context) error {
378387}
379388
380389// handleContainerExit handles container exit events.
390+ // It loops to support reconnecting the monitor when a container is restarted
391+ // by Docker (e.g., via restart policy) rather than truly exiting.
381392func (t * HTTPTransport ) handleContainerExit (ctx context.Context ) {
382- select {
383- case <- ctx .Done ():
384- return
385- case err := <- t .errorCh :
386- // Store the exit error so runner can check if restart is needed
387- t .exitErrMutex .Lock ()
388- t .containerExitErr = err
389- t .exitErrMutex .Unlock ()
390-
391- //nolint:gosec // G706: logging container name from config
392- slog .Warn ("container exited" , "container" , t .containerName , "error" , err )
393+ for {
394+ select {
395+ case <- ctx .Done ():
396+ return
397+ case <- t .shutdownCh :
398+ return
399+ case err := <- t .errorCh :
400+ t .exitErrMutex .Lock ()
401+ t .containerExitErr = err
402+ t .exitErrMutex .Unlock ()
403+
404+ if errors .Is (err , rt .ErrContainerRestarted ) {
405+ //nolint:gosec // G706: logging container name from config
406+ slog .Debug ("container was restarted by Docker, reconnecting monitor" ,
407+ "container" , t .containerName )
408+ if reconnectErr := t .reconnectMonitor (ctx ); reconnectErr != nil {
409+ //nolint:gosec // G706: logging container name from config
410+ slog .Error ("failed to reconnect monitor, stopping transport" ,
411+ "container" , t .containerName , "error" , reconnectErr )
412+ } else {
413+ t .exitErrMutex .Lock ()
414+ t .containerExitErr = nil
415+ t .exitErrMutex .Unlock ()
416+ continue
417+ }
418+ }
393419
394- // Check if container was removed (not just exited) using typed error
395- if errors .Is (err , rt .ErrContainerRemoved ) {
396- //nolint:gosec // G706: logging container name from config
397- slog .Debug ("container was removed, stopping proxy and cleaning up" ,
398- "container" , t .containerName )
399- } else {
400420 //nolint:gosec // G706: logging container name from config
401- slog .Debug ("container exited, will attempt automatic restart" ,
402- "container" , t .containerName )
403- }
421+ slog .Warn ("container exited" , "container" , t .containerName , "error" , err )
422+
423+ // Check if container was removed (not just exited) using typed error
424+ if errors .Is (err , rt .ErrContainerRemoved ) {
425+ //nolint:gosec // G706: logging container name from config
426+ slog .Debug ("container was removed, stopping proxy and cleaning up" ,
427+ "container" , t .containerName )
428+ } else {
429+ //nolint:gosec // G706: logging container name from config
430+ slog .Debug ("container exited, will attempt automatic restart" ,
431+ "container" , t .containerName )
432+ }
404433
405- // Stop the transport when the container exits/removed
406- if stopErr := t .Stop (ctx ); stopErr != nil {
407- slog .Error ("error stopping transport after container exit" , "error" , stopErr )
434+ // Stop the transport when the container exits/removed
435+ if stopErr := t .Stop (ctx ); stopErr != nil {
436+ slog .Error ("error stopping transport after container exit" , "error" , stopErr )
437+ }
438+ return
408439 }
409440 }
410441}
411442
443+ // reconnectMonitor stops the current monitor and starts a new one.
444+ // This is used when a container is restarted by Docker -- the proxy keeps running
445+ // but the monitor needs to track the new container start time.
446+ func (t * HTTPTransport ) reconnectMonitor (ctx context.Context ) error {
447+ t .mutex .Lock ()
448+ defer t .mutex .Unlock ()
449+
450+ // Stop the old monitor (safe even if goroutine already returned)
451+ if t .monitor != nil {
452+ t .monitor .StopMonitoring ()
453+ }
454+
455+ // Create a new monitor that records the current (post-restart) start time
456+ t .monitor = container .NewMonitor (t .monitorRuntime , t .containerName )
457+
458+ // Start monitoring — errorCh is reassigned here, which is safe because
459+ // handleContainerExit (the only reader) runs on the same goroutine and
460+ // will see the new channel when it re-enters the select after continue.
461+ var err error
462+ t .errorCh , err = t .monitor .StartMonitoring (ctx )
463+ if err != nil {
464+ return fmt .Errorf ("failed to restart container monitoring: %w" , err )
465+ }
466+
467+ return nil
468+ }
469+
412470// ShouldRestart returns true if the container exited and should be restarted.
413- // Returns false if the container was removed (intentionally deleted).
471+ // Returns false if the container was removed (intentionally deleted) or
472+ // restarted by Docker (already running, no ToolHive restart needed).
414473func (t * HTTPTransport ) ShouldRestart () bool {
415474 t .exitErrMutex .Lock ()
416475 defer t .exitErrMutex .Unlock ()
@@ -419,8 +478,9 @@ func (t *HTTPTransport) ShouldRestart() bool {
419478 return false // No exit error, normal shutdown
420479 }
421480
422- // Don't restart if container was removed (use typed error check)
423- return ! errors .Is (t .containerExitErr , rt .ErrContainerRemoved )
481+ // Don't restart if container was removed or restarted by Docker (use typed error check)
482+ return ! errors .Is (t .containerExitErr , rt .ErrContainerRemoved ) &&
483+ ! errors .Is (t .containerExitErr , rt .ErrContainerRestarted )
424484}
425485
426486// IsRunning checks if the transport is currently running.
0 commit comments