diff --git a/changes/2819.enhance.md b/changes/2819.enhance.md new file mode 100644 index 00000000000..8cb1c05b7cd --- /dev/null +++ b/changes/2819.enhance.md @@ -0,0 +1 @@ +Insert container id to kernel objects whose `container_id` field is missing. diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 79294f08fb5..858fe2e6f4f 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1307,9 +1307,9 @@ async def sync_container_lifecycles(self, interval: float) -> None: for cases when we miss the container lifecycle events from the underlying implementation APIs due to the agent restarts or crashes. """ - known_kernels: Dict[KernelId, ContainerId | None] = {} - alive_kernels: Dict[KernelId, ContainerId] = {} - kernel_session_map: Dict[KernelId, SessionId] = {} + known_kernels: dict[KernelId, ContainerId | None] = {} + alive_kernels: dict[KernelId, ContainerId] = {} + kernel_session_map: dict[KernelId, SessionId] = {} own_kernels: dict[KernelId, ContainerId] = {} terminated_kernels: dict[KernelId, ContainerLifecycleEvent] = {} @@ -1406,6 +1406,22 @@ def _get_session_id(container: Container) -> SessionId | None: LifecycleEvent.DESTROY, KernelLifecycleEventReason.TERMINATED_UNKNOWN_CONTAINER, ) + # Check if: a kernel in my registry does not has a container id + for kernel_id, kernel_obj in self.kernel_registry.items(): + insert_cid = False + if (container_id := alive_kernels.get(kernel_id)) is None: + # kernel has already been registered in terminated_kernels. + # skip. + continue + try: + own_cid = kernel_obj.container_id + except (AttributeError, KeyError): + insert_cid = True + else: + if own_cid != str(container_id): + insert_cid = True + if insert_cid: + kernel_obj.container_id = container_id finally: # Enqueue the events. terminated_kernel_ids = ",".join([ diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 0898eef0180..bd8c570b03c 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -1071,7 +1071,7 @@ async def _rollback_container_creation() -> None: sport["container_ports"] = created_host_ports return { - "container_id": container._id, + "container_id": ContainerId(cid), "kernel_host": advertised_kernel_host or container_bind_host, "repl_in_port": repl_in_port, "repl_out_port": repl_out_port, diff --git a/src/ai/backend/agent/kernel.py b/src/ai/backend/agent/kernel.py index 809586f83c9..bf8973479be 100644 --- a/src/ai/backend/agent/kernel.py +++ b/src/ai/backend/agent/kernel.py @@ -177,7 +177,7 @@ class AbstractKernel(UserDict, aobject, metaclass=ABCMeta): stats_enabled: bool # FIXME: apply TypedDict to data in Python 3.8 environ: Mapping[str, Any] - status: KernelLifecycleStatus + state: KernelLifecycleStatus _tasks: Set[asyncio.Task] diff --git a/src/ai/backend/agent/stats.py b/src/ai/backend/agent/stats.py index 654b21d7d8d..a31a1a96b50 100644 --- a/src/ai/backend/agent/stats.py +++ b/src/ai/backend/agent/stats.py @@ -428,8 +428,13 @@ async def collect_container_stat( kernel_id_map: Dict[ContainerId, KernelId] = {} for kid, info in self.agent.kernel_registry.items(): try: - cid = info["container_id"] - except KeyError: + cid = info.container_id + if cid is None: + log.warning( + f"collect_container_stat(): no container for kernel (kid:{kid}, state:{info.state})" + ) + continue + except (KeyError, AttributeError): log.warning("collect_container_stat(): no container for kernel {}", kid) else: kernel_id_map[ContainerId(cid)] = kid