From 235b95403ccfeb6bb89c9cff8ad414e9ec94df9f Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Mon, 9 Sep 2024 13:08:23 +0900 Subject: [PATCH 1/3] feat: Insert container id to kernel obj of registry --- src/ai/backend/agent/agent.py | 29 +++++++++++++++++++++++++--- src/ai/backend/agent/docker/agent.py | 2 +- src/ai/backend/agent/kernel.py | 2 +- src/ai/backend/agent/stats.py | 9 +++++++-- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 79294f08fb5..0c213197d78 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1307,9 +1307,9 @@ async def sync_container_lifecycles(self, interval: float) -> None: for cases when we miss the container lifecycle events from the underlying implementation APIs due to the agent restarts or crashes. """ - known_kernels: Dict[KernelId, ContainerId | None] = {} - alive_kernels: Dict[KernelId, ContainerId] = {} - kernel_session_map: Dict[KernelId, SessionId] = {} + known_kernels: dict[KernelId, ContainerId | None] = {} + alive_kernels: dict[KernelId, ContainerId] = {} + kernel_session_map: dict[KernelId, SessionId] = {} own_kernels: dict[KernelId, ContainerId] = {} terminated_kernels: dict[KernelId, ContainerLifecycleEvent] = {} @@ -1323,6 +1323,16 @@ def _get_session_id(container: Container) -> SessionId | None: ) return None + def _get_kernel_id(container: Container) -> KernelId | None: + _kernel_id = container.labels.get("ai.backend.kernel-id") + try: + return KernelId(UUID(_kernel_id)) + except ValueError: + log.warning( + f"sync_container_lifecycles() invalid kernel-id (cid: {container.id}, kid:{_kernel_id})" + ) + return None + log.debug("sync_container_lifecycles(): triggered") try: _containers = await self.enumerate_containers(ACTIVE_STATUS_SET | DEAD_STATUS_SET) @@ -1406,6 +1416,19 @@ def _get_session_id(container: Container) -> SessionId | None: LifecycleEvent.DESTROY, KernelLifecycleEventReason.TERMINATED_UNKNOWN_CONTAINER, ) + # Check if: a kernel in my registry does not has a container id + for kernel_id, kernel_obj in self.kernel_registry.items(): + insert_cid = False + container_id = alive_kernels[kernel_id] + try: + own_cid = kernel_obj.container_id + except (AttributeError, KeyError): + insert_cid = True + else: + if own_cid != str(container_id): + insert_cid = True + if insert_cid: + kernel_obj.container_id = container_id finally: # Enqueue the events. terminated_kernel_ids = ",".join([ diff --git a/src/ai/backend/agent/docker/agent.py b/src/ai/backend/agent/docker/agent.py index 0898eef0180..bd8c570b03c 100644 --- a/src/ai/backend/agent/docker/agent.py +++ b/src/ai/backend/agent/docker/agent.py @@ -1071,7 +1071,7 @@ async def _rollback_container_creation() -> None: sport["container_ports"] = created_host_ports return { - "container_id": container._id, + "container_id": ContainerId(cid), "kernel_host": advertised_kernel_host or container_bind_host, "repl_in_port": repl_in_port, "repl_out_port": repl_out_port, diff --git a/src/ai/backend/agent/kernel.py b/src/ai/backend/agent/kernel.py index 809586f83c9..bf8973479be 100644 --- a/src/ai/backend/agent/kernel.py +++ b/src/ai/backend/agent/kernel.py @@ -177,7 +177,7 @@ class AbstractKernel(UserDict, aobject, metaclass=ABCMeta): stats_enabled: bool # FIXME: apply TypedDict to data in Python 3.8 environ: Mapping[str, Any] - status: KernelLifecycleStatus + state: KernelLifecycleStatus _tasks: Set[asyncio.Task] diff --git a/src/ai/backend/agent/stats.py b/src/ai/backend/agent/stats.py index 654b21d7d8d..a31a1a96b50 100644 --- a/src/ai/backend/agent/stats.py +++ b/src/ai/backend/agent/stats.py @@ -428,8 +428,13 @@ async def collect_container_stat( kernel_id_map: Dict[ContainerId, KernelId] = {} for kid, info in self.agent.kernel_registry.items(): try: - cid = info["container_id"] - except KeyError: + cid = info.container_id + if cid is None: + log.warning( + f"collect_container_stat(): no container for kernel (kid:{kid}, state:{info.state})" + ) + continue + except (KeyError, AttributeError): log.warning("collect_container_stat(): no container for kernel {}", kid) else: kernel_id_map[ContainerId(cid)] = kid From 0acb3fc5d15ab4ab4e5e52692e81251737c2f507 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Mon, 9 Sep 2024 13:34:55 +0900 Subject: [PATCH 2/3] do null check --- src/ai/backend/agent/agent.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 0c213197d78..858fe2e6f4f 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1323,16 +1323,6 @@ def _get_session_id(container: Container) -> SessionId | None: ) return None - def _get_kernel_id(container: Container) -> KernelId | None: - _kernel_id = container.labels.get("ai.backend.kernel-id") - try: - return KernelId(UUID(_kernel_id)) - except ValueError: - log.warning( - f"sync_container_lifecycles() invalid kernel-id (cid: {container.id}, kid:{_kernel_id})" - ) - return None - log.debug("sync_container_lifecycles(): triggered") try: _containers = await self.enumerate_containers(ACTIVE_STATUS_SET | DEAD_STATUS_SET) @@ -1419,7 +1409,10 @@ def _get_kernel_id(container: Container) -> KernelId | None: # Check if: a kernel in my registry does not has a container id for kernel_id, kernel_obj in self.kernel_registry.items(): insert_cid = False - container_id = alive_kernels[kernel_id] + if (container_id := alive_kernels.get(kernel_id)) is None: + # kernel has already been registered in terminated_kernels. + # skip. + continue try: own_cid = kernel_obj.container_id except (AttributeError, KeyError): From b9c3b43bbc4bdeb0ce157debddeb845c3ad7dabe Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Mon, 9 Sep 2024 13:40:16 +0900 Subject: [PATCH 3/3] add news fragment --- changes/2819.enhance.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/2819.enhance.md diff --git a/changes/2819.enhance.md b/changes/2819.enhance.md new file mode 100644 index 00000000000..8cb1c05b7cd --- /dev/null +++ b/changes/2819.enhance.md @@ -0,0 +1 @@ +Insert container id to kernel objects whose `container_id` field is missing.