Skip to content

Commit 5fd2eb9

Browse files
authored
Distinguish container restarts from exits in monitor (#3937)
The container monitor sends ErrContainerExited for both true exits and Docker restarts (detected via StartedAt change). The transport layer has no way to tell them apart, so it always tears down the proxy — killing a healthy container that Docker already restarted. Add ErrContainerRestarted sentinel error so the transport can handle restarts differently: reconnect the monitor to track the new start time while keeping the proxy running.

Fixes #3936

Signed-off-by: Greg Katz <gkatz@indeed.com>
1 parent 1b87777 commit 5fd2eb9

File tree

7 files changed

+110
-36
lines changed

7 files changed

+110
-36
lines changed

pkg/container/docker/errors.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ var (
3232
// Deprecated: Use runtime.ErrContainerExited.
3333
ErrContainerExited = runtime.ErrContainerExited
3434

35+
// Deprecated: Use runtime.ErrContainerRestarted.
36+
ErrContainerRestarted = runtime.ErrContainerRestarted
37+
3538
// Deprecated: Use runtime.ErrContainerRemoved.
3639
ErrContainerRemoved = runtime.ErrContainerRemoved
3740

pkg/container/runtime/errors.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ var (
2222
// ErrContainerExited is returned when a container has exited unexpectedly
2323
ErrContainerExited = httperr.WithCode(fmt.Errorf("container exited unexpectedly"), http.StatusBadRequest)
2424

25+
// ErrContainerRestarted is returned when a container has been restarted
26+
// (e.g., by Docker restart policy). The container is still running.
27+
// This is distinct from ErrContainerExited.
28+
ErrContainerRestarted = httperr.WithCode(fmt.Errorf("container restarted"), http.StatusBadRequest)
29+
2530
// ErrContainerRemoved is returned when a container has been removed
2631
ErrContainerRemoved = httperr.WithCode(fmt.Errorf("container removed"), http.StatusBadRequest)
2732
)

pkg/container/runtime/monitor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ func (m *WorkloadMonitor) monitor(ctx context.Context) {
148148
if err == nil && !info.StartedAt.IsZero() && !info.StartedAt.Equal(m.initialStartTime) {
149149
// Container was restarted (has a different start time)
150150
restartErr := NewContainerError(
151-
ErrContainerExited,
151+
ErrContainerRestarted,
152152
m.containerName,
153153
fmt.Sprintf("Container %s was restarted (start time changed from %s to %s)",
154154
m.containerName, m.initialStartTime.Format(time.RFC3339), info.StartedAt.Format(time.RFC3339)),

pkg/container/runtime/monitor_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,8 +262,7 @@ func TestWorkloadMonitor_ContainerRestarted(t *testing.T) {
262262
select {
263263
case exitErr := <-ch:
264264
require.Error(t, exitErr)
265-
assert.ErrorIs(t, exitErr, runtime.ErrContainerExited)
266-
assert.Contains(t, exitErr.Error(), "restarted")
265+
assert.ErrorIs(t, exitErr, runtime.ErrContainerRestarted)
267266
case <-time.After(15 * time.Second):
268267
t.Fatal("timed out waiting for container restart error")
269268
}

pkg/transport/http.go

Lines changed: 90 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,9 @@ type HTTPTransport struct {
8080
shutdownCh chan struct{}
8181

8282
// Container monitor
83-
monitor rt.Monitor
84-
errorCh <-chan error
83+
monitor rt.Monitor
84+
monitorRuntime rt.Runtime // Stored for monitor reconnection on container restart
85+
errorCh <-chan error
8586

8687
// Container exit error (for determining if restart is needed)
8788
containerExitErr error
@@ -329,6 +330,7 @@ func (t *HTTPTransport) Start(ctx context.Context) error {
329330
if err != nil {
330331
return fmt.Errorf("failed to create container monitor: %w", err)
331332
}
333+
t.monitorRuntime = monitorRuntime // Store for reconnection
332334
t.monitor = container.NewMonitor(monitorRuntime, t.containerName)
333335

334336
// Start monitoring the container
@@ -348,8 +350,15 @@ func (t *HTTPTransport) Stop(ctx context.Context) error {
348350
t.mutex.Lock()
349351
defer t.mutex.Unlock()
350352

351-
// Signal shutdown
352-
close(t.shutdownCh)
353+
// Signal shutdown (guard against double-close if Stop is called
354+
// both from handleContainerExit and externally by the runner)
355+
select {
356+
case <-t.shutdownCh:
357+
// Already closed/stopping
358+
return nil
359+
default:
360+
close(t.shutdownCh)
361+
}
353362

354363
// For remote MCP servers, we don't need container monitoring
355364
if t.remoteURL == "" {
@@ -378,39 +387,89 @@ func (t *HTTPTransport) Stop(ctx context.Context) error {
378387
}
379388

380389
// handleContainerExit handles container exit events.
390+
// It loops to support reconnecting the monitor when a container is restarted
391+
// by Docker (e.g., via restart policy) rather than truly exiting.
381392
func (t *HTTPTransport) handleContainerExit(ctx context.Context) {
382-
select {
383-
case <-ctx.Done():
384-
return
385-
case err := <-t.errorCh:
386-
// Store the exit error so runner can check if restart is needed
387-
t.exitErrMutex.Lock()
388-
t.containerExitErr = err
389-
t.exitErrMutex.Unlock()
390-
391-
//nolint:gosec // G706: logging container name from config
392-
slog.Warn("container exited", "container", t.containerName, "error", err)
393+
for {
394+
select {
395+
case <-ctx.Done():
396+
return
397+
case <-t.shutdownCh:
398+
return
399+
case err := <-t.errorCh:
400+
t.exitErrMutex.Lock()
401+
t.containerExitErr = err
402+
t.exitErrMutex.Unlock()
403+
404+
if errors.Is(err, rt.ErrContainerRestarted) {
405+
//nolint:gosec // G706: logging container name from config
406+
slog.Debug("container was restarted by Docker, reconnecting monitor",
407+
"container", t.containerName)
408+
if reconnectErr := t.reconnectMonitor(ctx); reconnectErr != nil {
409+
//nolint:gosec // G706: logging container name from config
410+
slog.Error("failed to reconnect monitor, stopping transport",
411+
"container", t.containerName, "error", reconnectErr)
412+
} else {
413+
t.exitErrMutex.Lock()
414+
t.containerExitErr = nil
415+
t.exitErrMutex.Unlock()
416+
continue
417+
}
418+
}
393419

394-
// Check if container was removed (not just exited) using typed error
395-
if errors.Is(err, rt.ErrContainerRemoved) {
396-
//nolint:gosec // G706: logging container name from config
397-
slog.Debug("container was removed, stopping proxy and cleaning up",
398-
"container", t.containerName)
399-
} else {
400420
//nolint:gosec // G706: logging container name from config
401-
slog.Debug("container exited, will attempt automatic restart",
402-
"container", t.containerName)
403-
}
421+
slog.Warn("container exited", "container", t.containerName, "error", err)
422+
423+
// Check if container was removed (not just exited) using typed error
424+
if errors.Is(err, rt.ErrContainerRemoved) {
425+
//nolint:gosec // G706: logging container name from config
426+
slog.Debug("container was removed, stopping proxy and cleaning up",
427+
"container", t.containerName)
428+
} else {
429+
//nolint:gosec // G706: logging container name from config
430+
slog.Debug("container exited, will attempt automatic restart",
431+
"container", t.containerName)
432+
}
404433

405-
// Stop the transport when the container exits/removed
406-
if stopErr := t.Stop(ctx); stopErr != nil {
407-
slog.Error("error stopping transport after container exit", "error", stopErr)
434+
// Stop the transport when the container exits/removed
435+
if stopErr := t.Stop(ctx); stopErr != nil {
436+
slog.Error("error stopping transport after container exit", "error", stopErr)
437+
}
438+
return
408439
}
409440
}
410441
}
411442

443+
// reconnectMonitor stops the current monitor and starts a new one.
444+
// This is used when a container is restarted by Docker -- the proxy keeps running
445+
// but the monitor needs to track the new container start time.
446+
func (t *HTTPTransport) reconnectMonitor(ctx context.Context) error {
447+
t.mutex.Lock()
448+
defer t.mutex.Unlock()
449+
450+
// Stop the old monitor (safe even if goroutine already returned)
451+
if t.monitor != nil {
452+
t.monitor.StopMonitoring()
453+
}
454+
455+
// Create a new monitor that records the current (post-restart) start time
456+
t.monitor = container.NewMonitor(t.monitorRuntime, t.containerName)
457+
458+
// Start monitoring — errorCh is reassigned here, which is safe because
459+
// handleContainerExit (the only reader) runs on the same goroutine and
460+
// will see the new channel when it re-enters the select after continue.
461+
var err error
462+
t.errorCh, err = t.monitor.StartMonitoring(ctx)
463+
if err != nil {
464+
return fmt.Errorf("failed to restart container monitoring: %w", err)
465+
}
466+
467+
return nil
468+
}
469+
412470
// ShouldRestart returns true if the container exited and should be restarted.
413-
// Returns false if the container was removed (intentionally deleted).
471+
// Returns false if the container was removed (intentionally deleted) or
472+
// restarted by Docker (already running, no ToolHive restart needed).
414473
func (t *HTTPTransport) ShouldRestart() bool {
415474
t.exitErrMutex.Lock()
416475
defer t.exitErrMutex.Unlock()
@@ -419,8 +478,9 @@ func (t *HTTPTransport) ShouldRestart() bool {
419478
return false // No exit error, normal shutdown
420479
}
421480

422-
// Don't restart if container was removed (use typed error check)
423-
return !errors.Is(t.containerExitErr, rt.ErrContainerRemoved)
481+
// Don't restart if container was removed or restarted by Docker (use typed error check)
482+
return !errors.Is(t.containerExitErr, rt.ErrContainerRemoved) &&
483+
!errors.Is(t.containerExitErr, rt.ErrContainerRestarted)
424484
}
425485

426486
// IsRunning checks if the transport is currently running.

pkg/transport/http_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ func TestHTTPTransport_ShouldRestart(t *testing.T) {
3737
exitError: rt.NewContainerError(rt.ErrContainerRemoved, "test", "Container removed"),
3838
expectedResult: false,
3939
},
40+
{
41+
name: "container restarted by Docker - should not restart",
42+
exitError: rt.NewContainerError(rt.ErrContainerRestarted, "test", "Container restarted"),
43+
expectedResult: false,
44+
},
4045
{
4146
name: "no error - should not restart",
4247
exitError: nil,

pkg/transport/stdio.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,8 @@ func (t *StdioTransport) handleContainerExit(ctx context.Context) {
705705
}
706706

707707
// ShouldRestart returns true if the container exited and should be restarted.
708-
// Returns false if the container was removed (intentionally deleted).
708+
// Returns false if the container was removed (intentionally deleted) or
709+
// restarted by Docker (already running, no ToolHive restart needed).
709710
func (t *StdioTransport) ShouldRestart() bool {
710711
t.exitErrMutex.Lock()
711712
defer t.exitErrMutex.Unlock()
@@ -714,6 +715,7 @@ func (t *StdioTransport) ShouldRestart() bool {
714715
return false // No exit error, normal shutdown
715716
}
716717

717-
// Don't restart if container was removed (use typed error check)
718-
return !errors.Is(t.containerExitErr, rt.ErrContainerRemoved)
718+
// Don't restart if container was removed or restarted by Docker (use typed error check)
719+
return !errors.Is(t.containerExitErr, rt.ErrContainerRemoved) &&
720+
!errors.Is(t.containerExitErr, rt.ErrContainerRestarted)
719721
}

0 commit comments

Comments
 (0)