From 0fd3a3d6427ccb7a0e8a5d9c89ac261943c74095 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 12:43:09 +0100 Subject: [PATCH 1/6] feat: add health check endpoints and service metrics - Added /health, /readiness, /liveness and /metrics endpoints for monitoring service status - Implemented request tracking in ModelRequestProcessor to count requests and record last prediction time - Added service instance ID and startup time tracking for monitoring - Added GPU memory metrics collection using pynvml when available - Enhanced readiness check to verify model loading status and GPU availability - Added detailed metrics endpoint providing --- clearml_serving/serving/main.py | 154 +++++++++++++++++- .../serving/model_request_processor.py | 27 +++ 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 9f36c0f..bc52f1e 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -3,10 +3,12 @@ import traceback import gzip import asyncio +import time +import uuid from fastapi import FastAPI, Request, Response, APIRouter, HTTPException, Depends from fastapi.routing import APIRoute -from fastapi.responses import PlainTextResponse +from fastapi.responses import PlainTextResponse, JSONResponse from grpc.aio import AioRpcError from http import HTTPStatus @@ -57,6 +59,11 @@ async def custom_route_handler(request: Request) -> Response: # create clearml Task and load models serving_service_task_id, session_logger, instance_id = setup_task() + +# Health check tracking variables +startup_time = time.time() +service_instance_id = str(uuid.uuid4())[:8] + # polling frequency model_sync_frequency_secs = 5 try: @@ -180,6 +187,151 @@ async def process_with_exceptions( return return_value +# ============================================================================ +# HEALTH CHECK ENDPOINTS +# ============================================================================ + +@app.get("/health") +async def health_check(): + """ + Basic health check endpoint. + Returns 200 OK when service is running. + """ + return JSONResponse( + status_code=200, + content={ + "status": "healthy", + "service": "clearml-serving", + "version": __version__, + "timestamp": time.time(), + "instance_id": service_instance_id, + }, + ) + + +@app.get("/readiness") +async def readiness_check(): + """ + Readiness check endpoint. + Returns 200 if ready to serve requests, 503 if not ready. + Checks if ModelRequestProcessor is initialized and models are loaded. 
+ """ + global processor + + if not processor: + raise HTTPException( + status_code=503, + detail={ + "status": "not_ready", + "reason": "Processor not initialized", + "timestamp": time.time(), + }, + ) + + try: + # Check if models are loaded + models_loaded = processor.get_loaded_endpoints() + if not models_loaded: + raise HTTPException( + status_code=503, + detail={ + "status": "not_ready", + "reason": "No models loaded", + "timestamp": time.time(), + }, + ) + + # Check GPU availability if applicable + gpu_available = False + try: + import torch + + gpu_available = torch.cuda.is_available() + except (ImportError, ModuleNotFoundError, AttributeError): + # torch not installed or CUDA not available + pass + + return JSONResponse( + status_code=200, + content={ + "status": "ready", + "models_loaded": len(models_loaded), + "gpu_available": gpu_available, + "timestamp": time.time(), + }, + ) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=503, + detail={ + "status": "not_ready", + "reason": f"Error checking readiness: {str(e)}", + "timestamp": time.time(), + }, + ) + + +@app.get("/liveness") +async def liveness_check(): + """ + Liveness check endpoint. + Lightweight check for container orchestration. + Returns 200 OK if process is responsive. + """ + return JSONResponse( + status_code=200, content={"status": "alive", "timestamp": time.time()} + ) + + +@app.get("/metrics") +async def metrics_endpoint(): + """ + Detailed metrics endpoint. + Returns service metrics including uptime, request count, GPU usage, etc. + """ + global processor + + uptime_seconds = time.time() - startup_time + + metrics = { + "uptime_seconds": round(uptime_seconds, 2), + "total_requests": 0, + "last_prediction_timestamp": None, + "models": [], + } + + if processor: + try: + metrics["total_requests"] = processor.get_request_count() + metrics["last_prediction_timestamp"] = processor.get_last_prediction_time() + + # Get loaded models info + loaded_endpoints = processor.get_loaded_endpoints() + for endpoint_name in loaded_endpoints: + metrics["models"].append({"endpoint": endpoint_name, "loaded": True}) + except AttributeError: + # If methods don't exist yet, continue with basic metrics + pass + + # Try to get GPU metrics + try: + import pynvml + + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + metrics["gpu_memory_used_mb"] = round(info.used / 1024 / 1024, 2) + metrics["gpu_memory_total_mb"] = round(info.total / 1024 / 1024, 2) + pynvml.nvmlShutdown() + except (ImportError, ModuleNotFoundError, AttributeError, OSError): + # GPU metrics not available (pynvml not installed, no GPU, or driver issues) + metrics["gpu_memory_used_mb"] = None + metrics["gpu_memory_total_mb"] = None + + return JSONResponse(status_code=200, content=metrics) + router = APIRouter( prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", tags=["models"], diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 60c256d..2b7b27c 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -161,6 +161,9 @@ def __init__( self._metric_log_freq = None self._endpoint_telemetry = {} self._enable_endpoint_telemetry = os.environ.get("CLEARML_ENABLE_ENDPOINT_TELEMETRY", "1") != "0" + # Health check tracking variables + self._request_count = 0 + self._last_prediction_time = None def 
on_request_endpoint_telemetry(self, base_url=None, version=None): try: @@ -255,6 +258,10 @@ async def process_request(self, base_url: str, version: str, request_body: dict, Process request coming in, Raise Value error if url does not match existing endpoints """ + # Track request for health metrics + self._request_count += 1 + self._last_prediction_time = time() + self._request_processing_state.inc() # check if we need to stall if self._update_lock_flag: @@ -1567,3 +1574,23 @@ def _remove_registered_input_model(self, endpoint_url: str) -> bool: return False return True + + def get_loaded_endpoints(self) -> List[str]: + """ + Return list of loaded endpoint names for health checks. + """ + if not hasattr(self, "_endpoints") or not self._endpoints: + return [] + return list(self._endpoints.keys()) + + def get_request_count(self) -> int: + """ + Return total requests processed for health metrics. + """ + return getattr(self, "_request_count", 0) + + def get_last_prediction_time(self) -> Optional[float]: + """ + Return timestamp of last prediction for health metrics. + """ + return getattr(self, "_last_prediction_time", None) From cb3c923881531f633c8a8b0776b38490b069afc4 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 13:25:48 +0100 Subject: [PATCH 2/6] fix: f-string syntax for Python 3.10 compatibility --- clearml_serving/serving/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index bc52f1e..cd11bea 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -333,7 +333,7 @@ async def metrics_endpoint(): return JSONResponse(status_code=200, content=metrics) router = APIRouter( - prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", + prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}", tags=["models"], responses={404: {"description": "Model Serving Endpoint Not found"}}, route_class=GzipRoute, # mark-out to remove support for GZip content encoding From 575cdaedd562e43c47ae30c7bbdd192df39921b1 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 13:32:55 +0100 Subject: [PATCH 3/6] fix: make vllm imports and OpenAI endpoints optional for production compatibility --- clearml_serving/serving/main.py | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index cd11bea..6e23617 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -13,7 +13,13 @@ from http import HTTPStatus -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest +try: + from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest + VLLM_AVAILABLE = True +except ImportError: + ChatCompletionRequest = None + CompletionRequest = None + VLLM_AVAILABLE = False from starlette.background import BackgroundTask @@ -366,20 +372,21 @@ async def validate_json_request(raw_request: Request): detail="Unsupported Media Type: Only 'application/json' is allowed" ) -@router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) -@router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) -async def openai_serve_model( - endpoint_type: str, - request: Union[CompletionRequest, ChatCompletionRequest], - raw_request: Request -): - combined_request = {"request": request, "raw_request": raw_request} - return_value = 
await process_with_exceptions( - base_url=request.model, - version=None, - request=combined_request, - serve_type=endpoint_type - ) - return return_value +if VLLM_AVAILABLE: + @router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) + @router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) + async def openai_serve_model( + endpoint_type: str, + request: Union[CompletionRequest, ChatCompletionRequest], + raw_request: Request + ): + combined_request = {"request": request, "raw_request": raw_request} + return_value = await process_with_exceptions( + base_url=request.model, + version=None, + request=combined_request, + serve_type=endpoint_type + ) + return return_value app.include_router(router) From 173622c07502b24abf6f4b8e7063e1525f837cd4 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 13:52:39 +0100 Subject: [PATCH 4/6] fix indent --- clearml_serving/serving/main.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 6e23617..c2efde3 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -13,13 +13,7 @@ from http import HTTPStatus -try: - from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest - VLLM_AVAILABLE = True -except ImportError: - ChatCompletionRequest = None - CompletionRequest = None - VLLM_AVAILABLE = False +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest from starlette.background import BackgroundTask @@ -339,7 +333,7 @@ async def metrics_endpoint(): return JSONResponse(status_code=200, content=metrics) router = APIRouter( - prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}", + prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", tags=["models"], responses={404: {"description": "Model Serving Endpoint Not found"}}, route_class=GzipRoute, # mark-out to remove support for GZip content encoding From 4877e7e07d74ce4cad19399b42168da7ae484586 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 19:13:36 +0100 Subject: [PATCH 5/6] docs: add health check endpoints documentation --- README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.md b/README.md index ebf39e3..88dce3c 100644 --- a/README.md +++ b/README.md @@ -299,6 +299,41 @@ Grafana model performance example: - Multi-Model ASync Pipeline [example](examples/pipeline/async_preprocess.py) - multiple models - Custom Model [example](examples/custom/readme.md) - custom data +## Health Check Endpoints + +ClearML Serving provides standard health check endpoints for monitoring and orchestration: + +- `GET /health` - Basic service health check + - Returns service status, version, and timestamp + - Response example: + ```json + { + "status": "healthy", + "service": "clearml-serving", + "version": "1.5.0", + "timestamp": 1729700000.0, + "instance_id": "a1b2c3d4" + } + ``` + +- `GET /readiness` - Service readiness check + - Verifies if the service is ready to accept traffic + - Checks model loading status and GPU availability + - Returns 200 OK when ready, 503 Service Unavailable if not + +- `GET /liveness` - Simple liveness check + - Lightweight endpoint for container orchestration + - Returns 200 OK if the service process is responsive + +- `GET /metrics` - Service metrics + - Returns Prometheus-style metrics including: + - Uptime + - Request counts + - Model 
loading status + - GPU memory usage (if available) + +These endpoints are automatically enabled and require no additional configuration. + ### :pray: Status - [x] FastAPI integration for inference service @@ -330,6 +365,7 @@ Grafana model performance example: - [x] Prometheus install instructions - [x] Grafana install instructions - [x] Kubernetes Helm Chart + - [x] Standard health check endpoints (`/health`, `/readiness`, `/liveness`, `/metrics`) - [ ] Intel optimized container (python, numpy, daal, scikit-learn) ## Contributing From c330bcb52037c063bf01f430e9cd3eebcf4a71c4 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 19:37:05 +0100 Subject: [PATCH 6/6] add back how it was --- clearml_serving/serving/main.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index c2efde3..bc52f1e 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -366,21 +366,20 @@ async def validate_json_request(raw_request: Request): detail="Unsupported Media Type: Only 'application/json' is allowed" ) -if VLLM_AVAILABLE: - @router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) - @router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) - async def openai_serve_model( - endpoint_type: str, - request: Union[CompletionRequest, ChatCompletionRequest], - raw_request: Request - ): - combined_request = {"request": request, "raw_request": raw_request} - return_value = await process_with_exceptions( - base_url=request.model, - version=None, - request=combined_request, - serve_type=endpoint_type - ) - return return_value +@router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) +@router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) +async def openai_serve_model( + endpoint_type: str, + request: Union[CompletionRequest, ChatCompletionRequest], + raw_request: Request +): + combined_request = {"request": request, "raw_request": raw_request} + return_value = await process_with_exceptions( + base_url=request.model, + version=None, + request=combined_request, + serve_type=endpoint_type + ) + return return_value app.include_router(router)
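
As a quick way to exercise the new endpoints during review, the sketch below polls each of them and prints the returned JSON. It is a minimal illustration, not part of the patch series: it assumes the serving container is reachable at `http://localhost:8080` (adjust `BASE_URL` to your deployment) and uses only the Python standard library; the paths and status codes follow the handlers added in the first commit.

```python
# Minimal smoke test for the health-check endpoints added in this PR.
# Assumes the clearml-serving inference container is reachable at BASE_URL;
# adjust the host/port to match your deployment.
import json
import sys
import urllib.error
import urllib.request

BASE_URL = "http://localhost:8080"  # assumption: default local deployment


def probe(path: str) -> dict:
    """GET a health endpoint and return its HTTP status and decoded JSON body."""
    try:
        with urllib.request.urlopen(f"{BASE_URL}{path}", timeout=5) as resp:
            return {"status_code": resp.status, "body": json.loads(resp.read())}
    except urllib.error.HTTPError as e:
        # /readiness responds 503 with a JSON "detail" payload when not ready
        return {"status_code": e.code, "body": json.loads(e.read() or b"{}")}


if __name__ == "__main__":
    results = {path: probe(path) for path in ("/health", "/liveness", "/readiness", "/metrics")}
    for path, result in results.items():
        print(f"{path}: HTTP {result['status_code']}")
        print(json.dumps(result["body"], indent=2))
    # Fail the script if the basic health/liveness checks are not 200
    sys.exit(0 if all(results[p]["status_code"] == 200 for p in ("/health", "/liveness")) else 1)
```

In an orchestrated deployment, `/liveness` and `/readiness` map naturally onto container liveness and readiness probes, while `/metrics` can be polled by an external monitor; note that the handler returns a JSON document (uptime, request count, last prediction timestamp, loaded models, GPU memory) rather than the Prometheus text exposition format.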
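
For deployments that need to wait for model loading to finish before switching traffic or running a load test, a small readiness gate can poll `/readiness` until it returns 200. This is again a sketch under the same `BASE_URL` assumption; the 503-with-JSON-detail behaviour it handles matches the readiness handler introduced in the first commit.

```python
# Poll /readiness until the service reports ready, or give up after a deadline.
import json
import time
import urllib.error
import urllib.request

BASE_URL = "http://localhost:8080"  # assumption: default local deployment


def wait_until_ready(timeout_s: float = 120.0, poll_interval_s: float = 2.0) -> bool:
    """Return True once /readiness answers 200, False if the deadline expires."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/readiness", timeout=5) as resp:
                print(f"ready: {json.loads(resp.read())}")
                return True
        except urllib.error.HTTPError as e:
            # Not ready yet: the handler raises 503 with a JSON detail payload
            detail = json.loads(e.read() or b"{}").get("detail", {})
            print(f"not ready ({e.code}): {detail.get('reason', 'unknown')}")
        except urllib.error.URLError as e:
            print(f"service not reachable yet: {e.reason}")
        time.sleep(poll_interval_s)
    return False


if __name__ == "__main__":
    raise SystemExit(0 if wait_until_ready() else 1)
```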