From 49cf51235874175d3901c15f3b86cb328f8d473e Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Mon, 18 Aug 2025 16:53:54 -0700
Subject: [PATCH 1/4] Correct the metric name and labels.

---
 .../grafana_dashboards/grafana-dynamo-dashboard.json | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json b/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
index 31a2e9058d..268734a6da 100644
--- a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
+++ b/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
@@ -695,7 +695,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "rate(dynamo_response_bytes_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_response_bytes_total{dynamo_endpoint=\"generate\"}[1m])",
           "legendFormat": "Response bytes",
           "range": true,
           "refId": "A"
@@ -706,7 +706,7 @@
             "uid": "P1809F7CD0C75ACF3"
           },
           "editorMode": "code",
-          "expr": "rate(dynamo_request_bytes_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_request_bytes_total{dynamo_endpoint=\"generate\"}[1m])",
           "hide": false,
           "instant": false,
           "legendFormat": "Request bytes",
@@ -800,7 +800,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "rate(dynamo_requests_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_requests_total{dynamo_endpoint=\"generate\"}[1m])",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -905,7 +905,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "dynamo_request_duration_seconds_sum / dynamo_request_duration_seconds_count",
+          "expr": "dynamo_component_request_duration_seconds_sum / dynamo_component_request_duration_seconds_count",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -997,7 +997,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "rate(dynamo_errors_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_errors_total{dynamo_endpoint=\"generate\"}[1m])",
           "legendFormat": "{{error_type}}",
           "range": true,
           "refId": "A"
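
Note: all five corrected panels now use the `dynamo_component_*` metric prefix together with the `dynamo_endpoint` label. As a rough sketch of how the errors panel's `{{error_type}}` legend resolves, a query along the following lines keeps one series per error type; the `sum by (error_type)` grouping is illustrative and not part of this patch:

```promql
# Per-error-type error rate on the generate endpoint (grouping added for illustration)
sum by (error_type) (
  rate(dynamo_component_errors_total{dynamo_endpoint="generate"}[1m])
)
```
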
From bc615eb7c4e5ff14ee9ff80661d8a7e079bde155 Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Mon, 18 Aug 2025 20:34:40 -0700
Subject: [PATCH 2/4] Rename dynamo_component_concurrent_requests to
 dynamo_component_inflight_requests so that it matches the corresponding
 frontend metric.

---
 deploy/metrics/README.md                          |  2 +-
 lib/runtime/examples/system_metrics/README.md     |  8 ++++----
 .../system_metrics/tests/integration_test.rs      |  4 ++--
 lib/runtime/src/metrics.rs                        |  6 +-----
 lib/runtime/src/metrics/prometheus_names.rs       |  2 +-
 .../src/pipeline/network/ingress/push_handler.rs  | 16 ++++++++--------
 tests/router/test_router_e2e_with_mockers.py      |  6 +++---
 7 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md
index 42688ea2d7..054981aedd 100644
--- a/deploy/metrics/README.md
+++ b/deploy/metrics/README.md
@@ -40,7 +40,7 @@ As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build container
 
 The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework:
 
-- `dynamo_component_concurrent_requests`: Requests currently being processed (gauge)
+- `dynamo_component_inflight_requests`: Requests currently being processed (gauge)
 - `dynamo_component_request_bytes_total`: Total bytes received in requests (counter)
 - `dynamo_component_request_duration_seconds`: Request processing time (histogram)
 - `dynamo_component_requests_total`: Total requests processed (counter)
diff --git a/lib/runtime/examples/system_metrics/README.md b/lib/runtime/examples/system_metrics/README.md
index 2e1d320902..6ab654da41 100644
--- a/lib/runtime/examples/system_metrics/README.md
+++ b/lib/runtime/examples/system_metrics/README.md
@@ -62,7 +62,7 @@ The `dynamo_component_errors_total` metric includes the following error types:
 - `dynamo_component_request_duration_seconds` - Request processing time
 
 ### Gauges
-- `dynamo_component_concurrent_requests` - Number of requests currently being processed
+- `dynamo_component_inflight_requests` - Number of requests currently being processed
 
 ### Custom Metrics (Optional)
 - `dynamo_component_bytes_processed_total` - Total data bytes processed by system handler (example)
@@ -80,9 +80,9 @@ These labels are prefixed with "dynamo_" to avoid collisions with Kubernetes and
 
 When the system is running, you'll see metrics from http://<host>:<port>/metrics like this:
 
 ```prometheus
-# HELP dynamo_component_concurrent_requests Number of requests currently being processed by component endpoint
-# TYPE dynamo_component_concurrent_requests gauge
-dynamo_component_concurrent_requests{dynamo_component="example_component",dynamo_endpoint="example_endpoint9881",dynamo_namespace="example_namespace"} 0
+# HELP dynamo_component_inflight_requests Number of requests currently being processed by component endpoint
+# TYPE dynamo_component_inflight_requests gauge
+dynamo_component_inflight_requests{dynamo_component="example_component",dynamo_endpoint="example_endpoint9881",dynamo_namespace="example_namespace"} 0
 # HELP dynamo_component_bytes_processed_total Example of a custom metric. Total number of data bytes processed by system handler
 # TYPE dynamo_component_bytes_processed_total counter

diff --git a/lib/runtime/examples/system_metrics/tests/integration_test.rs b/lib/runtime/examples/system_metrics/tests/integration_test.rs
index 030efd6477..d158dffbc9 100644
--- a/lib/runtime/examples/system_metrics/tests/integration_test.rs
+++ b/lib/runtime/examples/system_metrics/tests/integration_test.rs
@@ -118,7 +118,7 @@ async fn test_backend_with_metrics() -> Result<()> {
     println!("{}", metrics_content);
     println!("=== END METRICS CONTENT ===");
 
-    // Parse and verify ingress metrics are greater than 0 (except concurrent_requests)
+    // Parse and verify ingress metrics are greater than 0 (except inflight_requests)
     verify_ingress_metrics_greater_than_0(&metrics_content);
 
     println!("Successfully retrieved and verified metrics!");
@@ -143,7 +143,7 @@ async fn test_backend_with_metrics() -> Result<()> {
 }
 
 fn verify_ingress_metrics_greater_than_0(metrics_content: &str) {
-    // Define the work handler metrics we want to verify (excluding concurrent_requests which can be 0)
+    // Define the work handler metrics we want to verify (excluding inflight_requests which can be 0)
     let metrics_to_verify = [
         "my_custom_bytes_processed_total",
         "requests_total",
diff --git a/lib/runtime/src/metrics.rs b/lib/runtime/src/metrics.rs
index 8d850a4b76..ff0d2a1d97 100644
--- a/lib/runtime/src/metrics.rs
+++ b/lib/runtime/src/metrics.rs
@@ -1624,11 +1624,7 @@ mod test_metricsregistry_nats {
                 18000.0,
                 23000.0,
             ), // ~75-125% of 20660
-            (
-                build_metric_name(work_handler::CONCURRENT_REQUESTS),
-                0.0,
-                1.0,
-            ), // 0 or very low
+            (build_metric_name(work_handler::INFLIGHT_REQUESTS), 0.0, 1.0), // 0 or very low
             // Histograms have _{count,sum} suffixes
             (
                 format!(
diff --git a/lib/runtime/src/metrics/prometheus_names.rs b/lib/runtime/src/metrics/prometheus_names.rs
index 5fd5af78da..4b91ec7edc 100644
--- a/lib/runtime/src/metrics/prometheus_names.rs
+++ b/lib/runtime/src/metrics/prometheus_names.rs
@@ -109,7 +109,7 @@ pub mod work_handler {
     pub const RESPONSE_BYTES_TOTAL: &str = "response_bytes_total";
 
     /// Number of requests currently being processed by work handler
-    pub const CONCURRENT_REQUESTS: &str = "concurrent_requests";
+    pub const INFLIGHT_REQUESTS: &str = "inflight_requests";
 
     /// Time spent processing requests by work handler (histogram)
     pub const REQUEST_DURATION_SECONDS: &str = "request_duration_seconds";
diff --git a/lib/runtime/src/pipeline/network/ingress/push_handler.rs b/lib/runtime/src/pipeline/network/ingress/push_handler.rs
index 7250070208..8594e2a2cd 100644
--- a/lib/runtime/src/pipeline/network/ingress/push_handler.rs
+++ b/lib/runtime/src/pipeline/network/ingress/push_handler.rs
@@ -26,7 +26,7 @@ use tracing::Instrument;
 pub struct WorkHandlerMetrics {
     pub request_counter: IntCounter,
     pub request_duration: Histogram,
-    pub concurrent_requests: IntGauge,
+    pub inflight_requests: IntGauge,
     pub request_bytes: IntCounter,
     pub response_bytes: IntCounter,
     pub error_counter: IntCounterVec,
@@ -36,7 +36,7 @@ impl WorkHandlerMetrics {
     pub fn new(
         request_counter: IntCounter,
         request_duration: Histogram,
-        concurrent_requests: IntGauge,
+        inflight_requests: IntGauge,
         request_bytes: IntCounter,
         response_bytes: IntCounter,
         error_counter: IntCounterVec,
@@ -44,7 +44,7 @@ impl WorkHandlerMetrics {
         Self {
             request_counter,
             request_duration,
-            concurrent_requests,
+            inflight_requests,
             request_bytes,
             response_bytes,
             error_counter,
@@ -68,8 +68,8 @@ impl WorkHandlerMetrics {
             None,
         )?;
 
-        let concurrent_requests = endpoint.create_intgauge(
-            "concurrent_requests",
+        let inflight_requests = endpoint.create_intgauge(
+            "inflight_requests",
             "Number of requests currently being processed by work handler",
             &[],
         )?;
@@ -96,7 +96,7 @@ impl WorkHandlerMetrics {
         Ok(Self::new(
             request_counter,
             request_duration,
-            concurrent_requests,
+            inflight_requests,
             request_bytes,
             response_bytes,
             error_counter,
@@ -121,7 +121,7 @@ where
 
         if let Some(m) = self.metrics() {
             m.request_counter.inc();
-            m.concurrent_requests.inc();
+            m.inflight_requests.inc();
             m.request_bytes.inc_by(payload.len() as u64);
         }
 
@@ -276,7 +276,7 @@ where
         if let Some(m) = self.metrics() {
             let duration = start_time.elapsed();
             m.request_duration.observe(duration.as_secs_f64());
-            m.concurrent_requests.dec();
+            m.inflight_requests.dec();
         }
 
         Ok(())
diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py
index 92e5b4d6c0..8fa16ca75e 100644
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -143,7 +143,7 @@ def test_mocker_kv_router(request, runtime_services):
 
     # Use async to send requests concurrently for better performance
     asyncio.run(
-        send_concurrent_requests(
+        send_inflight_requests(
             [
                 f"http://localhost:{frontend_port}/v1/chat/completions"
             ],  # Pass as list
@@ -229,7 +229,7 @@ def test_mocker_two_kv_router(request, runtime_services):
 
     # Use async to send requests concurrently, alternating between routers
     asyncio.run(
-        send_concurrent_requests(
+        send_inflight_requests(
             router_urls,
             test_payload,
             NUM_REQUESTS,
@@ -281,7 +281,7 @@ async def send_request_with_retry(url: str, payload: dict, max_retries: int = 4)
     return False
 
 
-async def send_concurrent_requests(urls: list, payload: dict, num_requests: int):
+async def send_inflight_requests(urls: list, payload: dict, num_requests: int):
     """Send multiple requests concurrently, alternating between URLs if multiple provided"""
 
     # First, send test requests with retry to ensure all systems are ready

From 5a8c45ca80aebe7fb51cc27d652ba834a8be7802 Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Tue, 19 Aug 2025 14:52:15 -0700
Subject: [PATCH 3/4] Revert the incorrect change from the auto-merge.

---
 tests/router/test_router_e2e_with_mockers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py
index 7fd799d572..e457ad00e6 100644
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -131,7 +131,7 @@ async def send_request_with_retry(url: str, payload: dict, max_retries: int = 4)
     return False
 
 
-async def send_concurrent_requests(urls: list, payload: dict, num_requests: int):
+async def send_inflight_requests(urls: list, payload: dict, num_requests: int):
     """Send multiple requests concurrently, alternating between URLs if multiple provided"""
 
     # First, send test requests with retry to ensure all systems are ready

From 28bff23d0c43775c97e2d2785f93dadd3284163c Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Tue, 19 Aug 2025 14:56:31 -0700
Subject: [PATCH 4/4] Merging main introduced another usage of
 send_concurrent_requests.
---
 tests/router/test_router_e2e_with_mockers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py
index e457ad00e6..843428fd11 100644
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -404,7 +404,7 @@ def test_mocker_kv_router_overload_503(request, runtime_services):
 
     # First, send one request with retry to ensure system is ready
     logger.info("Sending initial request to ensure system is ready...")
-    asyncio.run(send_concurrent_requests([url], test_payload_503, 1))
+    asyncio.run(send_inflight_requests([url], test_payload_503, 1))
 
     # Now send 50 concurrent requests to exhaust resources, then verify 503
     logger.info("Sending 50 concurrent requests to exhaust resources...")
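
Note: after this series, any dashboard or alert that still references `dynamo_component_concurrent_requests` has to switch to `dynamo_component_inflight_requests`. A minimal sketch of such queries, assuming the `dynamo_component`/`dynamo_endpoint` labels shown in the system_metrics README excerpt above; the 5m window and the threshold of 10 are illustrative placeholders, not values taken from these patches:

```promql
# Current in-flight requests per component endpoint
sum by (dynamo_component, dynamo_endpoint) (dynamo_component_inflight_requests)

# Example alert condition: sustained backlog on the generate endpoint (threshold is a placeholder)
max_over_time(dynamo_component_inflight_requests{dynamo_endpoint="generate"}[5m]) > 10
```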