From 0fd3a3d6427ccb7a0e8a5d9c89ac261943c74095 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 12:43:09 +0100 Subject: [PATCH 1/6] feat: add health check endpoints and service metrics - Added /health, /readiness, /liveness and /metrics endpoints for monitoring service status - Implemented request tracking in ModelRequestProcessor to count requests and record last prediction time - Added service instance ID and startup time tracking for monitoring - Added GPU memory metrics collection using pynvml when available - Enhanced readiness check to verify model loading status and GPU availability - Added detailed metrics endpoint providing --- clearml_serving/serving/main.py | 154 +++++++++++++++++- .../serving/model_request_processor.py | 27 +++ 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 9f36c0f..bc52f1e 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -3,10 +3,12 @@ import traceback import gzip import asyncio +import time +import uuid from fastapi import FastAPI, Request, Response, APIRouter, HTTPException, Depends from fastapi.routing import APIRoute -from fastapi.responses import PlainTextResponse +from fastapi.responses import PlainTextResponse, JSONResponse from grpc.aio import AioRpcError from http import HTTPStatus @@ -57,6 +59,11 @@ async def custom_route_handler(request: Request) -> Response: # create clearml Task and load models serving_service_task_id, session_logger, instance_id = setup_task() + +# Health check tracking variables +startup_time = time.time() +service_instance_id = str(uuid.uuid4())[:8] + # polling frequency model_sync_frequency_secs = 5 try: @@ -180,6 +187,151 @@ async def process_with_exceptions( return return_value +# ============================================================================ +# HEALTH CHECK ENDPOINTS +# ============================================================================ + +@app.get("/health") +async def health_check(): + """ + Basic health check endpoint. + Returns 200 OK when service is running. + """ + return JSONResponse( + status_code=200, + content={ + "status": "healthy", + "service": "clearml-serving", + "version": __version__, + "timestamp": time.time(), + "instance_id": service_instance_id, + }, + ) + + +@app.get("/readiness") +async def readiness_check(): + """ + Readiness check endpoint. + Returns 200 if ready to serve requests, 503 if not ready. + Checks if ModelRequestProcessor is initialized and models are loaded. 
+ """ + global processor + + if not processor: + raise HTTPException( + status_code=503, + detail={ + "status": "not_ready", + "reason": "Processor not initialized", + "timestamp": time.time(), + }, + ) + + try: + # Check if models are loaded + models_loaded = processor.get_loaded_endpoints() + if not models_loaded: + raise HTTPException( + status_code=503, + detail={ + "status": "not_ready", + "reason": "No models loaded", + "timestamp": time.time(), + }, + ) + + # Check GPU availability if applicable + gpu_available = False + try: + import torch + + gpu_available = torch.cuda.is_available() + except (ImportError, ModuleNotFoundError, AttributeError): + # torch not installed or CUDA not available + pass + + return JSONResponse( + status_code=200, + content={ + "status": "ready", + "models_loaded": len(models_loaded), + "gpu_available": gpu_available, + "timestamp": time.time(), + }, + ) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=503, + detail={ + "status": "not_ready", + "reason": f"Error checking readiness: {str(e)}", + "timestamp": time.time(), + }, + ) + + +@app.get("/liveness") +async def liveness_check(): + """ + Liveness check endpoint. + Lightweight check for container orchestration. + Returns 200 OK if process is responsive. + """ + return JSONResponse( + status_code=200, content={"status": "alive", "timestamp": time.time()} + ) + + +@app.get("/metrics") +async def metrics_endpoint(): + """ + Detailed metrics endpoint. + Returns service metrics including uptime, request count, GPU usage, etc. + """ + global processor + + uptime_seconds = time.time() - startup_time + + metrics = { + "uptime_seconds": round(uptime_seconds, 2), + "total_requests": 0, + "last_prediction_timestamp": None, + "models": [], + } + + if processor: + try: + metrics["total_requests"] = processor.get_request_count() + metrics["last_prediction_timestamp"] = processor.get_last_prediction_time() + + # Get loaded models info + loaded_endpoints = processor.get_loaded_endpoints() + for endpoint_name in loaded_endpoints: + metrics["models"].append({"endpoint": endpoint_name, "loaded": True}) + except AttributeError: + # If methods don't exist yet, continue with basic metrics + pass + + # Try to get GPU metrics + try: + import pynvml + + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(0) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + metrics["gpu_memory_used_mb"] = round(info.used / 1024 / 1024, 2) + metrics["gpu_memory_total_mb"] = round(info.total / 1024 / 1024, 2) + pynvml.nvmlShutdown() + except (ImportError, ModuleNotFoundError, AttributeError, OSError): + # GPU metrics not available (pynvml not installed, no GPU, or driver issues) + metrics["gpu_memory_used_mb"] = None + metrics["gpu_memory_total_mb"] = None + + return JSONResponse(status_code=200, content=metrics) + router = APIRouter( prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", tags=["models"], diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 60c256d..2b7b27c 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -161,6 +161,9 @@ def __init__( self._metric_log_freq = None self._endpoint_telemetry = {} self._enable_endpoint_telemetry = os.environ.get("CLEARML_ENABLE_ENDPOINT_TELEMETRY", "1") != "0" + # Health check tracking variables + self._request_count = 0 + self._last_prediction_time = None def 
on_request_endpoint_telemetry(self, base_url=None, version=None): try: @@ -255,6 +258,10 @@ async def process_request(self, base_url: str, version: str, request_body: dict, Process request coming in, Raise Value error if url does not match existing endpoints """ + # Track request for health metrics + self._request_count += 1 + self._last_prediction_time = time() + self._request_processing_state.inc() # check if we need to stall if self._update_lock_flag: @@ -1567,3 +1574,23 @@ def _remove_registered_input_model(self, endpoint_url: str) -> bool: return False return True + + def get_loaded_endpoints(self) -> List[str]: + """ + Return list of loaded endpoint names for health checks. + """ + if not hasattr(self, "_endpoints") or not self._endpoints: + return [] + return list(self._endpoints.keys()) + + def get_request_count(self) -> int: + """ + Return total requests processed for health metrics. + """ + return getattr(self, "_request_count", 0) + + def get_last_prediction_time(self) -> Optional[float]: + """ + Return timestamp of last prediction for health metrics. + """ + return getattr(self, "_last_prediction_time", None) From cb3c923881531f633c8a8b0776b38490b069afc4 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 13:25:48 +0100 Subject: [PATCH 2/6] fix: f-string syntax for Python 3.10 compatibility --- clearml_serving/serving/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index bc52f1e..cd11bea 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -333,7 +333,7 @@ async def metrics_endpoint(): return JSONResponse(status_code=200, content=metrics) router = APIRouter( - prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", + prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}", tags=["models"], responses={404: {"description": "Model Serving Endpoint Not found"}}, route_class=GzipRoute, # mark-out to remove support for GZip content encoding From 575cdaedd562e43c47ae30c7bbdd192df39921b1 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 13:32:55 +0100 Subject: [PATCH 3/6] fix: make vllm imports and OpenAI endpoints optional for production compatibility --- clearml_serving/serving/main.py | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index cd11bea..6e23617 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -13,7 +13,13 @@ from http import HTTPStatus -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest +try: + from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest + VLLM_AVAILABLE = True +except ImportError: + ChatCompletionRequest = None + CompletionRequest = None + VLLM_AVAILABLE = False from starlette.background import BackgroundTask @@ -366,20 +372,21 @@ async def validate_json_request(raw_request: Request): detail="Unsupported Media Type: Only 'application/json' is allowed" ) -@router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) -@router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) -async def openai_serve_model( - endpoint_type: str, - request: Union[CompletionRequest, ChatCompletionRequest], - raw_request: Request -): - combined_request = {"request": request, "raw_request": raw_request} - return_value = 
await process_with_exceptions( - base_url=request.model, - version=None, - request=combined_request, - serve_type=endpoint_type - ) - return return_value +if VLLM_AVAILABLE: + @router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) + @router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) + async def openai_serve_model( + endpoint_type: str, + request: Union[CompletionRequest, ChatCompletionRequest], + raw_request: Request + ): + combined_request = {"request": request, "raw_request": raw_request} + return_value = await process_with_exceptions( + base_url=request.model, + version=None, + request=combined_request, + serve_type=endpoint_type + ) + return return_value app.include_router(router) From 173622c07502b24abf6f4b8e7063e1525f837cd4 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 13:52:39 +0100 Subject: [PATCH 4/6] fix indent --- clearml_serving/serving/main.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 6e23617..c2efde3 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -13,13 +13,7 @@ from http import HTTPStatus -try: - from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest - VLLM_AVAILABLE = True -except ImportError: - ChatCompletionRequest = None - CompletionRequest = None - VLLM_AVAILABLE = False +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest from starlette.background import BackgroundTask @@ -339,7 +333,7 @@ async def metrics_endpoint(): return JSONResponse(status_code=200, content=metrics) router = APIRouter( - prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}", + prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}", tags=["models"], responses={404: {"description": "Model Serving Endpoint Not found"}}, route_class=GzipRoute, # mark-out to remove support for GZip content encoding From 4877e7e07d74ce4cad19399b42168da7ae484586 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 19:13:36 +0100 Subject: [PATCH 5/6] docs: add health check endpoints documentation --- README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.md b/README.md index ebf39e3..88dce3c 100644 --- a/README.md +++ b/README.md @@ -299,6 +299,41 @@ Grafana model performance example: - Multi-Model ASync Pipeline [example](examples/pipeline/async_preprocess.py) - multiple models - Custom Model [example](examples/custom/readme.md) - custom data +## Health Check Endpoints + +ClearML Serving provides standard health check endpoints for monitoring and orchestration: + +- `GET /health` - Basic service health check + - Returns service status, version, and timestamp + - Response example: + ```json + { + "status": "healthy", + "service": "clearml-serving", + "version": "1.5.0", + "timestamp": 1729700000.0, + "instance_id": "a1b2c3d4" + } + ``` + +- `GET /readiness` - Service readiness check + - Verifies if the service is ready to accept traffic + - Checks model loading status and GPU availability + - Returns 200 OK when ready, 503 Service Unavailable if not + +- `GET /liveness` - Simple liveness check + - Lightweight endpoint for container orchestration + - Returns 200 OK if the service process is responsive + +- `GET /metrics` - Service metrics + - Returns Prometheus-style metrics including: + - Uptime + - Request counts + - Model 
loading status + - GPU memory usage (if available) + +These endpoints are automatically enabled and require no additional configuration. + ### :pray: Status - [x] FastAPI integration for inference service @@ -330,6 +365,7 @@ Grafana model performance example: - [x] Prometheus install instructions - [x] Grafana install instructions - [x] Kubernetes Helm Chart + - [x] Standard health check endpoints (`/health`, `/readiness`, `/liveness`, `/metrics`) - [ ] Intel optimized container (python, numpy, daal, scikit-learn) ## Contributing From c330bcb52037c063bf01f430e9cd3eebcf4a71c4 Mon Sep 17 00:00:00 2001 From: Daniel Kelly Date: Wed, 29 Oct 2025 19:37:05 +0100 Subject: [PATCH 6/6] add back how it was --- clearml_serving/serving/main.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index c2efde3..bc52f1e 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -366,21 +366,20 @@ async def validate_json_request(raw_request: Request): detail="Unsupported Media Type: Only 'application/json' is allowed" ) -if VLLM_AVAILABLE: - @router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) - @router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) - async def openai_serve_model( - endpoint_type: str, - request: Union[CompletionRequest, ChatCompletionRequest], - raw_request: Request - ): - combined_request = {"request": request, "raw_request": raw_request} - return_value = await process_with_exceptions( - base_url=request.model, - version=None, - request=combined_request, - serve_type=endpoint_type - ) - return return_value +@router.post("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) +@router.get("/openai/{endpoint_type:path}", dependencies=[Depends(validate_json_request)]) +async def openai_serve_model( + endpoint_type: str, + request: Union[CompletionRequest, ChatCompletionRequest], + raw_request: Request +): + combined_request = {"request": request, "raw_request": raw_request} + return_value = await process_with_exceptions( + base_url=request.model, + version=None, + request=combined_request, + serve_type=endpoint_type + ) + return return_value app.include_router(router)
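
As a quick way to exercise the new endpoints during review, the sketch below polls each of them and prints the returned JSON. It is a minimal illustration, not part of the patch series: it assumes the serving container is reachable at `http://localhost:8080` (adjust `BASE_URL` to your deployment) and uses only the Python standard library; the paths and status codes follow the handlers added in the first commit.

```python
# Minimal smoke test for the health-check endpoints added in this PR.
# Assumes the clearml-serving inference container is reachable at BASE_URL;
# adjust the host/port to match your deployment.
import json
import sys
import urllib.error
import urllib.request

BASE_URL = "http://localhost:8080"  # assumption: default local deployment


def probe(path: str) -> dict:
    """GET a health endpoint and return its HTTP status and decoded JSON body."""
    try:
        with urllib.request.urlopen(f"{BASE_URL}{path}", timeout=5) as resp:
            return {"status_code": resp.status, "body": json.loads(resp.read())}
    except urllib.error.HTTPError as e:
        # /readiness responds 503 with a JSON "detail" payload when not ready
        return {"status_code": e.code, "body": json.loads(e.read() or b"{}")}


if __name__ == "__main__":
    results = {path: probe(path) for path in ("/health", "/liveness", "/readiness", "/metrics")}
    for path, result in results.items():
        print(f"{path}: HTTP {result['status_code']}")
        print(json.dumps(result["body"], indent=2))
    # Fail the script if the basic health/liveness checks are not 200
    sys.exit(0 if all(results[p]["status_code"] == 200 for p in ("/health", "/liveness")) else 1)
```

In an orchestrated deployment, `/liveness` and `/readiness` map naturally onto container liveness and readiness probes, while `/metrics` can be polled by an external monitor; note that the handler returns a JSON document (uptime, request count, last prediction timestamp, loaded models, GPU memory) rather than the Prometheus text exposition format.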
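
For deployments that need to wait for model loading to finish before switching traffic or running a load test, a small readiness gate can poll `/readiness` until it returns 200. This is again a sketch under the same `BASE_URL` assumption; the 503-with-JSON-detail behaviour it handles matches the readiness handler introduced in the first commit.

```python
# Poll /readiness until the service reports ready, or give up after a deadline.
import json
import time
import urllib.error
import urllib.request

BASE_URL = "http://localhost:8080"  # assumption: default local deployment


def wait_until_ready(timeout_s: float = 120.0, poll_interval_s: float = 2.0) -> bool:
    """Return True once /readiness answers 200, False if the deadline expires."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(f"{BASE_URL}/readiness", timeout=5) as resp:
                print(f"ready: {json.loads(resp.read())}")
                return True
        except urllib.error.HTTPError as e:
            # Not ready yet: the handler raises 503 with a JSON detail payload
            detail = json.loads(e.read() or b"{}").get("detail", {})
            print(f"not ready ({e.code}): {detail.get('reason', 'unknown')}")
        except urllib.error.URLError as e:
            print(f"service not reachable yet: {e.reason}")
        time.sleep(poll_interval_s)
    return False


if __name__ == "__main__":
    raise SystemExit(0 if wait_until_ready() else 1)
```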