From 49cf51235874175d3901c15f3b86cb328f8d473e Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Mon, 18 Aug 2025 16:53:54 -0700
Subject: [PATCH 1/4] Correct the metric name and labels.

---
 .../grafana_dashboards/grafana-dynamo-dashboard.json | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json b/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
index 31a2e9058d..268734a6da 100644
--- a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
+++ b/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
@@ -695,7 +695,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "rate(dynamo_response_bytes_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_response_bytes_total{dynamo_endpoint=\"generate\"}[1m])",
           "legendFormat": "Response bytes",
           "range": true,
           "refId": "A"
@@ -706,7 +706,7 @@
             "uid": "P1809F7CD0C75ACF3"
           },
           "editorMode": "code",
-          "expr": "rate(dynamo_request_bytes_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_request_bytes_total{dynamo_endpoint=\"generate\"}[1m])",
           "hide": false,
           "instant": false,
           "legendFormat": "Request bytes",
@@ -800,7 +800,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "rate(dynamo_requests_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_requests_total{dynamo_endpoint=\"generate\"}[1m])",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -905,7 +905,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "dynamo_request_duration_seconds_sum / dynamo_request_duration_seconds_count",
+          "expr": "dynamo_component_request_duration_seconds_sum / dynamo_component_request_duration_seconds_count",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -997,7 +997,7 @@
       "targets": [
         {
           "editorMode": "code",
-          "expr": "rate(dynamo_errors_total{endpoint=\"generate\"}[1m])",
+          "expr": "rate(dynamo_component_errors_total{dynamo_endpoint=\"generate\"}[1m])",
           "legendFormat": "{{error_type}}",
           "range": true,
           "refId": "A"
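
Note: all five corrected panels now use the `dynamo_component_*` metric prefix together with the `dynamo_endpoint` label. As a rough sketch of how the errors panel's `{{error_type}}` legend resolves, a query along the following lines keeps one series per error type; the `sum by (error_type)` grouping is illustrative and not part of this patch:

```promql
# Per-error-type error rate on the generate endpoint (grouping added for illustration)
sum by (error_type) (
  rate(dynamo_component_errors_total{dynamo_endpoint="generate"}[1m])
)
```
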
From bc615eb7c4e5ff14ee9ff80661d8a7e079bde155 Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Mon, 18 Aug 2025 20:34:40 -0700
Subject: [PATCH 2/4] Rename dynamo_component_concurrent_requests to
 dynamo_component_inflight_requests so that it matches the corresponding
 frontend metric.

---
 deploy/metrics/README.md                          |  2 +-
 lib/runtime/examples/system_metrics/README.md     |  8 ++++----
 .../system_metrics/tests/integration_test.rs      |  4 ++--
 lib/runtime/src/metrics.rs                        |  6 +-----
 lib/runtime/src/metrics/prometheus_names.rs       |  2 +-
 .../src/pipeline/network/ingress/push_handler.rs  | 16 ++++++++--------
 tests/router/test_router_e2e_with_mockers.py      |  6 +++---
 7 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md
index 42688ea2d7..054981aedd 100644
--- a/deploy/metrics/README.md
+++ b/deploy/metrics/README.md
@@ -40,7 +40,7 @@ As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build container
 
 The core Dynamo backend system automatically exposes metrics with the `dynamo_component_*` prefix for all components that use the `DistributedRuntime` framework:
 
-- `dynamo_component_concurrent_requests`: Requests currently being processed (gauge)
+- `dynamo_component_inflight_requests`: Requests currently being processed (gauge)
 - `dynamo_component_request_bytes_total`: Total bytes received in requests (counter)
 - `dynamo_component_request_duration_seconds`: Request processing time (histogram)
 - `dynamo_component_requests_total`: Total requests processed (counter)
diff --git a/lib/runtime/examples/system_metrics/README.md b/lib/runtime/examples/system_metrics/README.md
index 2e1d320902..6ab654da41 100644
--- a/lib/runtime/examples/system_metrics/README.md
+++ b/lib/runtime/examples/system_metrics/README.md
@@ -62,7 +62,7 @@ The `dynamo_component_errors_total` metric includes the following error types:
 - `dynamo_component_request_duration_seconds` - Request processing time
 
 ### Gauges
-- `dynamo_component_concurrent_requests` - Number of requests currently being processed
+- `dynamo_component_inflight_requests` - Number of requests currently being processed
 
 ### Custom Metrics (Optional)
 - `dynamo_component_bytes_processed_total` - Total data bytes processed by system handler (example)
@@ -80,9 +80,9 @@ These labels are prefixed with "dynamo_" to avoid collisions with Kubernetes and
 
 When the system is running, you'll see metrics from http://<host>:<port>/metrics like this:
 
 ```prometheus
-# HELP dynamo_component_concurrent_requests Number of requests currently being processed by component endpoint
-# TYPE dynamo_component_concurrent_requests gauge
-dynamo_component_concurrent_requests{dynamo_component="example_component",dynamo_endpoint="example_endpoint9881",dynamo_namespace="example_namespace"} 0
+# HELP dynamo_component_inflight_requests Number of requests currently being processed by component endpoint
+# TYPE dynamo_component_inflight_requests gauge
+dynamo_component_inflight_requests{dynamo_component="example_component",dynamo_endpoint="example_endpoint9881",dynamo_namespace="example_namespace"} 0
 # HELP dynamo_component_bytes_processed_total Example of a custom metric. Total number of data bytes processed by system handler
 # TYPE dynamo_component_bytes_processed_total counter

diff --git a/lib/runtime/examples/system_metrics/tests/integration_test.rs b/lib/runtime/examples/system_metrics/tests/integration_test.rs
index 030efd6477..d158dffbc9 100644
--- a/lib/runtime/examples/system_metrics/tests/integration_test.rs
+++ b/lib/runtime/examples/system_metrics/tests/integration_test.rs
@@ -118,7 +118,7 @@ async fn test_backend_with_metrics() -> Result<()> {
     println!("{}", metrics_content);
     println!("=== END METRICS CONTENT ===");
 
-    // Parse and verify ingress metrics are greater than 0 (except concurrent_requests)
+    // Parse and verify ingress metrics are greater than 0 (except inflight_requests)
     verify_ingress_metrics_greater_than_0(&metrics_content);
 
     println!("Successfully retrieved and verified metrics!");
@@ -143,7 +143,7 @@ async fn test_backend_with_metrics() -> Result<()> {
 }
 
 fn verify_ingress_metrics_greater_than_0(metrics_content: &str) {
-    // Define the work handler metrics we want to verify (excluding concurrent_requests which can be 0)
+    // Define the work handler metrics we want to verify (excluding inflight_requests which can be 0)
     let metrics_to_verify = [
         "my_custom_bytes_processed_total",
         "requests_total",
diff --git a/lib/runtime/src/metrics.rs b/lib/runtime/src/metrics.rs
index 8d850a4b76..ff0d2a1d97 100644
--- a/lib/runtime/src/metrics.rs
+++ b/lib/runtime/src/metrics.rs
@@ -1624,11 +1624,7 @@ mod test_metricsregistry_nats {
                 18000.0,
                 23000.0,
             ), // ~75-125% of 20660
-            (
-                build_metric_name(work_handler::CONCURRENT_REQUESTS),
-                0.0,
-                1.0,
-            ), // 0 or very low
+            (build_metric_name(work_handler::INFLIGHT_REQUESTS), 0.0, 1.0), // 0 or very low
             // Histograms have _{count,sum} suffixes
             (
                 format!(
diff --git a/lib/runtime/src/metrics/prometheus_names.rs b/lib/runtime/src/metrics/prometheus_names.rs
index 5fd5af78da..4b91ec7edc 100644
--- a/lib/runtime/src/metrics/prometheus_names.rs
+++ b/lib/runtime/src/metrics/prometheus_names.rs
@@ -109,7 +109,7 @@ pub mod work_handler {
     pub const RESPONSE_BYTES_TOTAL: &str = "response_bytes_total";
 
     /// Number of requests currently being processed by work handler
-    pub const CONCURRENT_REQUESTS: &str = "concurrent_requests";
+    pub const INFLIGHT_REQUESTS: &str = "inflight_requests";
 
     /// Time spent processing requests by work handler (histogram)
     pub const REQUEST_DURATION_SECONDS: &str = "request_duration_seconds";
diff --git a/lib/runtime/src/pipeline/network/ingress/push_handler.rs b/lib/runtime/src/pipeline/network/ingress/push_handler.rs
index 7250070208..8594e2a2cd 100644
--- a/lib/runtime/src/pipeline/network/ingress/push_handler.rs
+++ b/lib/runtime/src/pipeline/network/ingress/push_handler.rs
@@ -26,7 +26,7 @@ use tracing::Instrument;
 pub struct WorkHandlerMetrics {
     pub request_counter: IntCounter,
     pub request_duration: Histogram,
-    pub concurrent_requests: IntGauge,
+    pub inflight_requests: IntGauge,
     pub request_bytes: IntCounter,
     pub response_bytes: IntCounter,
     pub error_counter: IntCounterVec,
@@ -36,7 +36,7 @@ impl WorkHandlerMetrics {
     pub fn new(
         request_counter: IntCounter,
         request_duration: Histogram,
-        concurrent_requests: IntGauge,
+        inflight_requests: IntGauge,
         request_bytes: IntCounter,
         response_bytes: IntCounter,
         error_counter: IntCounterVec,
@@ -44,7 +44,7 @@ impl WorkHandlerMetrics {
         Self {
             request_counter,
             request_duration,
-            concurrent_requests,
+            inflight_requests,
             request_bytes,
             response_bytes,
             error_counter,
@@ -68,8 +68,8 @@ impl WorkHandlerMetrics {
             None,
         )?;
 
-        let concurrent_requests = endpoint.create_intgauge(
-            "concurrent_requests",
+        let inflight_requests = endpoint.create_intgauge(
+            "inflight_requests",
             "Number of requests currently being processed by work handler",
             &[],
         )?;
@@ -96,7 +96,7 @@ impl WorkHandlerMetrics {
         Ok(Self::new(
             request_counter,
             request_duration,
-            concurrent_requests,
+            inflight_requests,
             request_bytes,
             response_bytes,
             error_counter,
@@ -121,7 +121,7 @@ where
 
         if let Some(m) = self.metrics() {
             m.request_counter.inc();
-            m.concurrent_requests.inc();
+            m.inflight_requests.inc();
             m.request_bytes.inc_by(payload.len() as u64);
         }
 
@@ -276,7 +276,7 @@ where
         if let Some(m) = self.metrics() {
             let duration = start_time.elapsed();
             m.request_duration.observe(duration.as_secs_f64());
-            m.concurrent_requests.dec();
+            m.inflight_requests.dec();
         }
 
         Ok(())
diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py
index 92e5b4d6c0..8fa16ca75e 100644
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -143,7 +143,7 @@ def test_mocker_kv_router(request, runtime_services):
 
     # Use async to send requests concurrently for better performance
     asyncio.run(
-        send_concurrent_requests(
+        send_inflight_requests(
             [
                 f"http://localhost:{frontend_port}/v1/chat/completions"
             ],  # Pass as list
@@ -229,7 +229,7 @@ def test_mocker_two_kv_router(request, runtime_services):
 
     # Use async to send requests concurrently, alternating between routers
     asyncio.run(
-        send_concurrent_requests(
+        send_inflight_requests(
             router_urls,
             test_payload,
             NUM_REQUESTS,
@@ -281,7 +281,7 @@ async def send_request_with_retry(url: str, payload: dict, max_retries: int = 4)
     return False
 
 
-async def send_concurrent_requests(urls: list, payload: dict, num_requests: int):
+async def send_inflight_requests(urls: list, payload: dict, num_requests: int):
     """Send multiple requests concurrently, alternating between URLs if multiple provided"""
 
     # First, send test requests with retry to ensure all systems are ready

From 5a8c45ca80aebe7fb51cc27d652ba834a8be7802 Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Tue, 19 Aug 2025 14:52:15 -0700
Subject: [PATCH 3/4] Revert the incorrect change from the auto-merge.

---
 tests/router/test_router_e2e_with_mockers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py
index 7fd799d572..e457ad00e6 100644
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -131,7 +131,7 @@ async def send_request_with_retry(url: str, payload: dict, max_retries: int = 4)
     return False
 
 
-async def send_concurrent_requests(urls: list, payload: dict, num_requests: int):
+async def send_inflight_requests(urls: list, payload: dict, num_requests: int):
     """Send multiple requests concurrently, alternating between URLs if multiple provided"""
 
     # First, send test requests with retry to ensure all systems are ready

From 28bff23d0c43775c97e2d2785f93dadd3284163c Mon Sep 17 00:00:00 2001
From: "tzulingk@nvidia.com" <tzulingk@nvidia.com>
Date: Tue, 19 Aug 2025 14:56:31 -0700
Subject: [PATCH 4/4] Merging main introduced another usage of
 send_concurrent_requests.
---
 tests/router/test_router_e2e_with_mockers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/router/test_router_e2e_with_mockers.py b/tests/router/test_router_e2e_with_mockers.py
index e457ad00e6..843428fd11 100644
--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -404,7 +404,7 @@ def test_mocker_kv_router_overload_503(request, runtime_services):
 
     # First, send one request with retry to ensure system is ready
     logger.info("Sending initial request to ensure system is ready...")
-    asyncio.run(send_concurrent_requests([url], test_payload_503, 1))
+    asyncio.run(send_inflight_requests([url], test_payload_503, 1))
 
     # Now send 50 concurrent requests to exhaust resources, then verify 503
     logger.info("Sending 50 concurrent requests to exhaust resources...")
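
Note: after this series, any dashboard or alert that still references `dynamo_component_concurrent_requests` has to switch to `dynamo_component_inflight_requests`. A minimal sketch of such queries, assuming the `dynamo_component`/`dynamo_endpoint` labels shown in the system_metrics README excerpt above; the 5m window and the threshold of 10 are illustrative placeholders, not values taken from these patches:

```promql
# Current in-flight requests per component endpoint
sum by (dynamo_component, dynamo_endpoint) (dynamo_component_inflight_requests)

# Example alert condition: sustained backlog on the generate endpoint (threshold is a placeholder)
max_over_time(dynamo_component_inflight_requests{dynamo_endpoint="generate"}[5m]) > 10
```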