diff --git a/changes/5868.fix.md b/changes/5868.fix.md
new file mode 100644
index 00000000000..d9d57954db4
--- /dev/null
+++ b/changes/5868.fix.md
@@ -0,0 +1 @@
+Handle Storage Proxy connection error from Manager side
diff --git a/src/ai/backend/manager/clients/storage_proxy/manager_facing_client.py b/src/ai/backend/manager/clients/storage_proxy/manager_facing_client.py
index 9a9379ad019..54c8bfce1ad 100644
--- a/src/ai/backend/manager/clients/storage_proxy/manager_facing_client.py
+++ b/src/ai/backend/manager/clients/storage_proxy/manager_facing_client.py
@@ -33,7 +33,10 @@
 from ai.backend.manager.clients.storage_proxy.base import StorageProxyHTTPClient
 from ai.backend.manager.decorators.client_decorator import create_layer_aware_client_decorator
 from ai.backend.manager.defs import DEFAULT_CHUNK_SIZE
-from ai.backend.manager.errors.storage import UnexpectedStorageProxyResponseError
+from ai.backend.manager.errors.storage import (
+    StorageProxyConnectionError,
+    UnexpectedStorageProxyResponseError,
+)
 
 client_decorator = create_layer_aware_client_decorator(LayerType.STORAGE_PROXY_CLIENT)
 
@@ -57,7 +60,11 @@ async def get_volumes(self) -> Mapping[str, Any]:
 
         :return: Response containing volume information
+        :raises StorageProxyConnectionError: If the connection to the storage proxy fails
         """
-        return await self._client.request_with_response("GET", "volumes")
+        try:
+            return await self._client.request_with_response("GET", "volumes")
+        except aiohttp.ClientConnectionError as e:
+            raise StorageProxyConnectionError from e
 
     @client_decorator()
     async def create_folder(
diff --git a/src/ai/backend/manager/clients/storage_proxy/session_manager.py b/src/ai/backend/manager/clients/storage_proxy/session_manager.py
index 7d23b9ceab2..29e46920c8c 100644
--- a/src/ai/backend/manager/clients/storage_proxy/session_manager.py
+++ b/src/ai/backend/manager/clients/storage_proxy/session_manager.py
@@ -28,6 +28,7 @@
 )
 from ai.backend.manager.config.unified import VolumesConfig
 from ai.backend.manager.errors.storage import (
+    StorageProxyConnectionError,
     StorageProxyNotFound,
 )
 
@@ -177,7 +178,12 @@ async def _fetch(
             proxy_name: str,
             client: StorageProxyManagerFacingClient,
         ) -> Iterable[tuple[str, VolumeInfo]]:
-            reply = await client.get_volumes()
+            try:
+                reply = await client.get_volumes()
+            except StorageProxyConnectionError:
+                # An unreachable proxy contributes no volumes rather than failing the caller.
+                log.warning("Failed to connect to storage proxy (name: {})", proxy_name)
+                return []
             return ((proxy_name, volume_data) for volume_data in reply["volumes"])
 
         for proxy_name, client in self._manager_facing_clients.items():
diff --git a/src/ai/backend/manager/errors/storage.py b/src/ai/backend/manager/errors/storage.py
index 5c235239bde..48e4ede7665 100644
--- a/src/ai/backend/manager/errors/storage.py
+++ b/src/ai/backend/manager/errors/storage.py
@@ -282,6 +282,21 @@ def error_code(cls) -> ErrorCode:
         )
 
 
+class StorageProxyConnectionError(BackendAIError, web.HTTPServiceUnavailable):
+    """Raised when the manager cannot reach a storage proxy; reported as HTTP 503."""
+
+    error_type = "https://api.backend.ai/probs/storage-proxy-connection-error"
+    error_title = "Failed to connect to the storage proxy."
+
+    @classmethod
+    def error_code(cls) -> ErrorCode:
+        return ErrorCode(
+            domain=ErrorDomain.STORAGE_PROXY,
+            operation=ErrorOperation.REQUEST,
+            error_detail=ErrorDetail.UNREACHABLE,
+        )
+
+
 class UnexpectedStorageProxyResponseError(BackendAIError, web.HTTPInternalServerError):
     error_type = "https://api.backend.ai/probs/unexpected-storage-proxy-response"
     error_title = "Unexpected response from storage proxy."