diff --git a/changes/2157.fix.md b/changes/2157.fix.md new file mode 100644 index 00000000000..c3fb0e63f71 --- /dev/null +++ b/changes/2157.fix.md @@ -0,0 +1 @@ +Correctly register and deregister Agent's terminating kernels diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index ea489995482..06989a2a8e7 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -997,7 +997,6 @@ async def _handle_destroy_event(self, ev: ContainerLifecycleEvent) -> None: assert current_task is not None if ev.kernel_id not in self._ongoing_destruction_tasks: self._ongoing_destruction_tasks[ev.kernel_id] = current_task - self.terminating_kernels.add(ev.kernel_id) async with self.registry_lock: kernel_obj = self.kernel_registry.get(ev.kernel_id) if kernel_obj is None: @@ -1023,6 +1022,7 @@ async def _handle_destroy_event(self, ev: ContainerLifecycleEvent) -> None: if kernel_obj.runner is not None: await kernel_obj.runner.close() kernel_obj.clean_event = ev.done_future + self.terminating_kernels.add(ev.kernel_id) try: await self.destroy_kernel(ev.kernel_id, ev.container_id) except Exception as e: @@ -1042,6 +1042,10 @@ async def _handle_destroy_event(self, ev: ContainerLifecycleEvent) -> None: done_future=ev.done_future, ), ) + else: + # Items in `terminating_kernels` are deleted only in _handle_clean_event() + # Should delete the kernel_id that will not be cleaned + self.terminating_kernels.discard(ev.kernel_id) except asyncio.CancelledError: pass except Exception: @@ -1304,7 +1308,7 @@ async def sync_container_lifecycles(self, interval: float) -> None: LifecycleEvent.CLEAN, KernelLifecycleEventReason.SELF_TERMINATED, ) - # Check if: there are containers not spawned by me. + # Check if: there are containers not spawned by me or already deleted from my registry. for kernel_id in alive_kernels.keys() - known_kernels.keys(): if kernel_id in self.restarting_kernels: continue