Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/2819.enhance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Insert container id to kernel objects whose `container_id` field is missing.
22 changes: 19 additions & 3 deletions src/ai/backend/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,9 +1307,9 @@ async def sync_container_lifecycles(self, interval: float) -> None:
for cases when we miss the container lifecycle events from the underlying implementation APIs
due to the agent restarts or crashes.
"""
known_kernels: Dict[KernelId, ContainerId | None] = {}
alive_kernels: Dict[KernelId, ContainerId] = {}
kernel_session_map: Dict[KernelId, SessionId] = {}
known_kernels: dict[KernelId, ContainerId | None] = {}
alive_kernels: dict[KernelId, ContainerId] = {}
kernel_session_map: dict[KernelId, SessionId] = {}
own_kernels: dict[KernelId, ContainerId] = {}
terminated_kernels: dict[KernelId, ContainerLifecycleEvent] = {}

Expand Down Expand Up @@ -1406,6 +1406,22 @@ def _get_session_id(container: Container) -> SessionId | None:
LifecycleEvent.DESTROY,
KernelLifecycleEventReason.TERMINATED_UNKNOWN_CONTAINER,
)
# Check if: a kernel in my registry does not has a container id
for kernel_id, kernel_obj in self.kernel_registry.items():
insert_cid = False
if (container_id := alive_kernels.get(kernel_id)) is None:
# kernel has already been registered in terminated_kernels.
# skip.
continue
try:
own_cid = kernel_obj.container_id
except (AttributeError, KeyError):
insert_cid = True
else:
if own_cid != str(container_id):
insert_cid = True
if insert_cid:
kernel_obj.container_id = container_id
finally:
# Enqueue the events.
terminated_kernel_ids = ",".join([
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/agent/docker/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,7 +1071,7 @@ async def _rollback_container_creation() -> None:
sport["container_ports"] = created_host_ports

return {
"container_id": container._id,
"container_id": ContainerId(cid),
"kernel_host": advertised_kernel_host or container_bind_host,
"repl_in_port": repl_in_port,
"repl_out_port": repl_out_port,
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/agent/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ class AbstractKernel(UserDict, aobject, metaclass=ABCMeta):
stats_enabled: bool
# FIXME: apply TypedDict to data in Python 3.8
environ: Mapping[str, Any]
status: KernelLifecycleStatus
state: KernelLifecycleStatus

_tasks: Set[asyncio.Task]

Expand Down
9 changes: 7 additions & 2 deletions src/ai/backend/agent/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,8 +428,13 @@ async def collect_container_stat(
kernel_id_map: Dict[ContainerId, KernelId] = {}
for kid, info in self.agent.kernel_registry.items():
try:
cid = info["container_id"]
except KeyError:
cid = info.container_id
if cid is None:
log.warning(
f"collect_container_stat(): no container for kernel (kid:{kid}, state:{info.state})"
)
continue
except (KeyError, AttributeError):
log.warning("collect_container_stat(): no container for kernel {}", kid)
else:
kernel_id_map[ContainerId(cid)] = kid
Expand Down