diff --git a/changes/2263.enhance.md b/changes/2263.enhance.md new file mode 100644 index 00000000000..9d294a283e7 --- /dev/null +++ b/changes/2263.enhance.md @@ -0,0 +1 @@ +Fetch all containers eagerly when matching agent's registry to containers. diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 3b47fae6b7e..f8740dacdee 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1259,10 +1259,17 @@ async def sync_container_lifecycles(self, interval: float) -> None: own_kernels: dict[KernelId, ContainerId] = {} terminated_kernels = {} + _containers = await self.enumerate_containers(DEAD_STATUS_SET | ACTIVE_STATUS_SET) + async with self.registry_lock: try: # Check if: there are dead containers - for kernel_id, container in await self.enumerate_containers(DEAD_STATUS_SET): + dead_containers = [ + (kid, container) + for kid, container in _containers + if container.status in DEAD_STATUS_SET + ] + for kernel_id, container in dead_containers: if ( kernel_id in self.restarting_kernels or kernel_id in self.terminating_kernels @@ -1281,7 +1288,12 @@ async def sync_container_lifecycles(self, interval: float) -> None: LifecycleEvent.CLEAN, KernelLifecycleEventReason.SELF_TERMINATED, ) - for kernel_id, container in await self.enumerate_containers(ACTIVE_STATUS_SET): + alive_containers = [ + (kid, container) + for kid, container in _containers + if container.status in ACTIVE_STATUS_SET + ] + for kernel_id, container in alive_containers: alive_kernels[kernel_id] = container.id session_id = SessionId(UUID(container.labels["ai.backend.session-id"])) kernel_session_map[kernel_id] = session_id