refactor: make Prometheus metric naming conventions more consistent

keivenchang · keivenchang · commit 7a707c770b33 · 2025-09-19T18:04:40.000Z
- Rename connections_total to current_connections (gauge for active connections)
- Rename client_disconnects_total to disconnected_clients_total (better ordering)
- Rename PROCESSING_TIME_MS_TOTAL to PROCESSING_MS_TOTAL (more concise)
- Apply unit_aggregation pattern: AVG_PROCESSING_MS -> PROCESSING_MS_AVG
- Sync ComponentNatsServerPrometheusMetrics variable names with metric constants
- Update documentation with comprehensive naming transformation rules
- Add units _messages and _connections to naming conventions
- Update all code references, documentation, and test comments consistently

These changes follow Prometheus best practices by distinguishing gauge vs
counter metrics and using consistent {unit}_{aggregation} naming patterns.

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
diff --git a/deploy/metrics/README.md b/deploy/metrics/README.md
@@ -70,7 +70,7 @@ Some components expose additional metrics specific to their functionality:
 
 When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), these metrics are automatically exposed with the `dynamo_frontend_*` prefix and include `model` labels containing the model name:
 
-- `dynamo_frontend_inflight_requests_total`: Inflight requests (gauge)
+- `dynamo_frontend_inflight_requests`: Inflight requests (gauge)
 - `dynamo_frontend_queued_requests_total`: Number of requests in HTTP processing queue (gauge)
 - `dynamo_frontend_input_sequence_tokens`: Input sequence length (histogram)
 - `dynamo_frontend_inter_token_latency_seconds`: Inter-token latency (histogram)
@@ -79,7 +79,7 @@ When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), th
 - `dynamo_frontend_requests_total`: Total LLM requests (counter)
 - `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram)
 
-**Note**: The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
+**Note**: The `dynamo_frontend_inflight_requests` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
 
 #### Request Processing Flow
 
@@ -125,10 +125,10 @@ Try launching a frontend and a Mocker backend that allows 3 concurrent requests:
 $ python -m dynamo.frontend --http-port 8000
 $ python -m dynamo.mocker --model-path Qwen/Qwen3-0.6B --max-num-seqs 3
 # Launch your 10 concurrent clients here
-# Then check the queued_requests_total and inflight_requests_total metrics from the frontend:
+# Then check the queued_requests_total and inflight_requests metrics from the frontend:
 $ curl -s localhost:8000/metrics|grep -v '^#'|grep -E 'queue|inflight'
 dynamo_frontend_queued_requests_total{model="qwen/qwen3-0.6b"} 7
-dynamo_frontend_inflight_requests_total{model="qwen/qwen3-0.6b"} 10
+dynamo_frontend_inflight_requests{model="qwen/qwen3-0.6b"} 10
 ```
 
 **Real setup using vLLM (instead of Mocker):**
@@ -271,8 +271,8 @@ let component = namespace.component("my_component")?;
 let endpoint = component.endpoint("my_endpoint")?;
 
 // Create endpoint-level counters (this is a Prometheus Counter type)
-let total_requests = endpoint.create_counter(
-    "total_requests",
+let requests_total = endpoint.create_counter(
+    "requests_total",
     "Total requests across all namespaces",
     &[]
 )?;
@@ -449,8 +449,8 @@ let latency = endpoint.create_histogram(
 
 ```rust
 // Aggregate metrics across multiple endpoints
-let total_requests = namespace.create_counter(
-    "total_requests",
+let requests_total = namespace.create_counter(
+    "requests_total",
     "Total requests across all endpoints",
     &[]
 )?;
diff --git a/lib/llm/src/http/service/metrics.rs b/lib/llm/src/http/service/metrics.rs
@@ -117,7 +117,8 @@ impl Metrics {
     ///
     /// The following metrics will be created with the configured prefix:
     /// - `{prefix}_requests_total` - IntCounterVec for the total number of requests processed
-    /// - `{prefix}_inflight_requests` - IntGaugeVec for the number of inflight requests
+    /// - `{prefix}_inflight_requests` - IntGaugeVec for the number of inflight/concurrent requests
+    /// - `{prefix}_disconnected_clients_total` - IntGauge for the number of connections dropped by clients
     /// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
     /// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
     /// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
@@ -148,15 +149,15 @@ impl Metrics {
 
         let inflight_gauge = IntGaugeVec::new(
             Opts::new(
-                frontend_metric_name(frontend_service::INFLIGHT_REQUESTS_TOTAL),
+                frontend_metric_name(frontend_service::INFLIGHT_REQUESTS),
                 "Number of inflight requests",
             ),
             &["model"],
         )
         .unwrap();
 
         let client_disconnect_gauge = prometheus::IntGauge::new(
-            frontend_metric_name("client_disconnects"),
+                frontend_metric_name(frontend_service::DISCONNECTED_CLIENTS_TOTAL),
             "Number of connections dropped by clients",
         )
         .unwrap();
diff --git a/lib/runtime/examples/system_metrics/tests/integration_test.rs b/lib/runtime/examples/system_metrics/tests/integration_test.rs
@@ -106,7 +106,7 @@ async fn test_backend_with_metrics() -> Result<()> {
                 println!("{}", metrics_content);
                 println!("=== END METRICS CONTENT ===");
 
-                // Parse and verify ingress metrics are greater than 0 (except inflight_requests)
+                // Parse and verify ingress metrics are greater than 0 (except the inflight_requests gauge)
                 verify_ingress_metrics_greater_than_0(&metrics_content);
 
                 println!("Successfully retrieved and verified metrics!");
@@ -131,7 +131,7 @@ async fn test_backend_with_metrics() -> Result<()> {
 }
 
 fn verify_ingress_metrics_greater_than_0(metrics_content: &str) {
-    // Define the work handler metrics we want to verify (excluding inflight_requests which can be 0)
+    // Define the work handler metrics we want to verify (excluding the inflight_requests gauge, which can be 0)
     let metrics_to_verify = [
         "my_custom_bytes_processed_total",
         "requests_total",
diff --git a/lib/runtime/src/metrics.rs b/lib/runtime/src/metrics.rs
@@ -1176,8 +1176,8 @@ dynamo_component_nats_client_connection_state 1
 # TYPE dynamo_component_latency histogram
 dynamo_component_latency_bucket{le="0.1"} 10
 dynamo_component_latency_bucket{le="0.5"} 25
-dynamo_component_nats_service_total_requests 100
-dynamo_component_nats_service_total_errors 5"#;
+dynamo_component_nats_service_requests_total 100
+dynamo_component_nats_service_errors_total 5"#;
 
         // Test remove_nats_lines (excludes NATS lines but keeps help/type)
         let filtered_out = super::test_helpers::remove_nats_lines(test_input);
@@ -1421,7 +1421,7 @@ mod test_metricsregistry_nats {
                 1.0,
                 1.0,
             ), // Should be connected
-            (build_component_metric_name(nats_client::CONNECTS), 1.0, 1.0), // Should have 1 connection
+            (build_component_metric_name(nats_client::CURRENT_CONNECTIONS), 1.0, 1.0), // Should have 1 connection
             (
                 build_component_metric_name(nats_client::IN_TOTAL_BYTES),
                 800.0,
@@ -1449,17 +1449,17 @@ mod test_metricsregistry_nats {
                 0.0,
             ), // No processing yet
             (
-                build_component_metric_name(nats_service::TOTAL_ERRORS),
+                build_component_metric_name(nats_service::ERRORS_TOTAL),
                 0.0,
                 0.0,
             ), // No errors yet
             (
-                build_component_metric_name(nats_service::TOTAL_REQUESTS),
+                build_component_metric_name(nats_service::REQUESTS_TOTAL),
                 0.0,
                 0.0,
             ), // No requests yet
             (
-                build_component_metric_name(nats_service::TOTAL_PROCESSING_MS),
+                build_component_metric_name(nats_service::PROCESSING_MS_TOTAL),
                 0.0,
                 0.0,
             ), // No processing yet
@@ -1550,7 +1550,7 @@ mod test_metricsregistry_nats {
                 1.0,
                 1.0,
             ), // Connected
-            (build_component_metric_name(nats_client::CONNECTS), 1.0, 1.0), // 1 connection
+            (build_component_metric_name(nats_client::CURRENT_CONNECTIONS), 1.0, 1.0), // 1 connection
             (
                 build_component_metric_name(nats_client::IN_TOTAL_BYTES),
                 20000.0,
@@ -1578,17 +1578,17 @@ mod test_metricsregistry_nats {
                 1.0,
             ), // Low processing time
             (
-                build_component_metric_name(nats_service::TOTAL_ERRORS),
+                build_component_metric_name(nats_service::ERRORS_TOTAL),
                 0.0,
                 0.0,
             ), // No errors
             (
-                build_component_metric_name(nats_service::TOTAL_REQUESTS),
+                build_component_metric_name(nats_service::REQUESTS_TOTAL),
                 0.0,
                 0.0,
             ), // No work handler requests
             (
-                build_component_metric_name(nats_service::TOTAL_PROCESSING_MS),
+                build_component_metric_name(nats_service::PROCESSING_MS_TOTAL),
                 0.0,
                 5.0,
             ), // Low total processing time
diff --git a/lib/runtime/src/metrics/prometheus_names.rs b/lib/runtime/src/metrics/prometheus_names.rs
@@ -13,26 +13,37 @@
 //! **Prefix**: Component identifier (`dynamo_component_`, `dynamo_frontend_`, etc.)
 //! **Name**: Descriptive snake_case name indicating what is measured
 //! **Suffix**:
-//!   - Units: `_seconds`, `_bytes`, `_ms`, `_percent`
+//!   - Units: `_seconds`, `_bytes`, `_ms`, `_percent`, `_messages`, `_connections`
 //!   - Counters: `_total` (not `total_` prefix)
 //!   - Note: Do not use `_counter`, `_gauge`, `_time`, or `_size` in Prometheus names (too vague)
 //!
 //! **Common Transformations**:
 //! - ❌ `_counter` → ✅ `_total`
+//! - ❌ `_sum` → ✅ `_total`
+//! - ❌ `_gauge` → ✅ (no suffix needed for current values)
 //! - ❌ `_time` → ✅ `_seconds`, `_ms`, `_hours`, `_duration_seconds`
+//! - ❌ `_time_total` → ✅ `_seconds_total`, `_ms_total`, `_hours_total`
+//! - ❌ `_total_time` → ✅ `_seconds_total`, `_ms_total`, `_hours_total`
+//! - ❌ `_total_time_seconds` → ✅ `_seconds_total`
+//! - ❌ `_average_time` → ✅ `_seconds_avg`, `_ms_avg`
 //! - ❌ `_size` → ✅ `_bytes`, `_total`, `_length`
-//! - ❌ `_gauge` → ✅ (no suffix needed for current values)
+//! - ❌ `_some_request_size` → ✅ `_some_request_bytes_avg`
 //! - ❌ `_rate` → ✅ `_per_second`, `_per_minute`
+//! - ❌ `client_disconnects_total` → ✅ `disconnected_clients_total`
+//! - ❌ `inflight_requests_total` → ✅ `inflight_requests` (gauge, not counter)
+//! - ❌ `connections_total` → ✅ `current_connections` (gauge, not counter)
 //!
 //! **Examples**:
 //! - ✅ `dynamo_frontend_requests_total` - Total request counter (not `incoming_requests`)
 //! - ✅ `dynamo_frontend_request_duration_seconds` - Request duration histogram (not `response_time`)
 //! - ✅ `dynamo_component_errors_total` - Total error counter (not `total_errors`)
 //! - ✅ `dynamo_component_memory_usage_bytes` - Memory usage gauge
-//! - ✅ `dynamo_frontend_inflight_requests_total` - Current inflight requests gauge
+//! - ✅ `dynamo_frontend_inflight_requests` - Current inflight requests gauge
 //! - ✅ `nats_client_connection_duration_ms` - Connection time in milliseconds
 //! - ✅ `dynamo_component_cpu_usage_percent` - CPU usage percentage
 //! - ✅ `dynamo_frontend_tokens_per_second` - Token generation rate
+//! - ✅ `nats_client_current_connections` - Current active connections gauge
+//! - ✅ `nats_client_in_messages` - Total messages received counter
 //!
 //! ## Key Differences: Prometheus Metric Names vs Prometheus Label Names
 //!
@@ -76,8 +87,12 @@ pub mod frontend_service {
     /// Number of requests waiting in HTTP queue before receiving the first response.
     pub const QUEUED_REQUESTS_TOTAL: &str = "queued_requests_total";
 
-    /// Number of inflight requests going to the engine (vLLM, SGLang, ...)
-    pub const INFLIGHT_REQUESTS_TOTAL: &str = "inflight_requests_total";
+    /// Number of inflight/concurrent requests going to the engine (vLLM, SGLang, ...)
+    /// Note: Gauge metric measuring current requests, not cumulative total
+    pub const INFLIGHT_REQUESTS: &str = "inflight_requests";
+
+    /// Number of connections dropped by clients
+    pub const DISCONNECTED_CLIENTS_TOTAL: &str = "disconnected_clients_total";
 
     /// Duration of LLM requests
     pub const REQUEST_DURATION_SECONDS: &str = "request_duration_seconds";
@@ -124,7 +139,8 @@ pub mod work_handler {
     /// Total number of bytes sent in responses by work handler
     pub const RESPONSE_BYTES_TOTAL: &str = "response_bytes_total";
 
-    /// Number of requests currently being processed by work handler
+    /// Number of inflight/concurrent requests currently being processed by work handler
+    /// Note: Gauge metric measuring current requests, not cumulative total
     pub const INFLIGHT_REQUESTS: &str = "inflight_requests";
 
     /// Time spent processing requests by work handler (histogram)
@@ -182,8 +198,9 @@ pub mod nats_client {
     /// Total number of messages sent by NATS client
     pub const OUT_MESSAGES: &str = nats_client_name!("out_messages");
 
-    /// Total number of connections established by NATS client
-    pub const CONNECTS: &str = nats_client_name!("connects");
+    /// Current number of active connections for NATS client
+    /// Note: Gauge metric measuring current connections, not cumulative total
+    pub const CURRENT_CONNECTIONS: &str = nats_client_name!("current_connections");
 
     /// Current connection state of NATS client (0=disconnected, 1=connected, 2=reconnecting)
     pub const CONNECTION_STATE: &str = nats_client_name!("connection_state");
@@ -202,16 +219,16 @@ pub mod nats_service {
     pub const PREFIX: &str = nats_service_name!("");
 
     /// Average processing time in milliseconds (maps to: average_processing_time in ms)
-    pub const AVG_PROCESSING_MS: &str = nats_service_name!("avg_processing_time_ms");
+    pub const PROCESSING_MS_AVG: &str = nats_service_name!("processing_ms_avg");
 
     /// Total errors across all endpoints (maps to: num_errors)
-    pub const TOTAL_ERRORS: &str = nats_service_name!("total_errors");
+    pub const ERRORS_TOTAL: &str = nats_service_name!("errors_total");
 
     /// Total requests across all endpoints (maps to: num_requests)
-    pub const TOTAL_REQUESTS: &str = nats_service_name!("total_requests");
+    pub const REQUESTS_TOTAL: &str = nats_service_name!("requests_total");
 
     /// Total processing time in milliseconds (maps to: processing_time in ms)
-    pub const TOTAL_PROCESSING_MS: &str = nats_service_name!("total_processing_time_ms");
+    pub const PROCESSING_MS_TOTAL: &str = nats_service_name!("processing_ms_total");
 
     /// Number of active services (derived from ServiceSet.services)
     pub const ACTIVE_SERVICES: &str = nats_service_name!("active_services");
@@ -223,7 +240,7 @@ pub mod nats_service {
 /// All NATS client Prometheus metric names as an array for iteration/validation
 pub const DRT_NATS_METRICS: &[&str] = &[
     nats_client::CONNECTION_STATE,
-    nats_client::CONNECTS,
+    nats_client::CURRENT_CONNECTIONS,
     nats_client::IN_TOTAL_BYTES,
     nats_client::IN_MESSAGES,
     nats_client::OUT_OVERHEAD_BYTES,
@@ -233,10 +250,10 @@ pub const DRT_NATS_METRICS: &[&str] = &[
 /// All component service Prometheus metric names as an array for iteration/validation
 /// (ordered to match NatsStatsMetrics fields)
 pub const COMPONENT_NATS_METRICS: &[&str] = &[
-    nats_service::AVG_PROCESSING_MS, // maps to: average_processing_time (nanoseconds)
-    nats_service::TOTAL_ERRORS,      // maps to: num_errors
-    nats_service::TOTAL_REQUESTS,    // maps to: num_requests
-    nats_service::TOTAL_PROCESSING_MS, // maps to: processing_time (nanoseconds)
+    nats_service::PROCESSING_MS_AVG, // maps to: average_processing_time (nanoseconds)
+    nats_service::ERRORS_TOTAL,      // maps to: num_errors
+    nats_service::REQUESTS_TOTAL,    // maps to: num_requests
+    nats_service::PROCESSING_MS_TOTAL, // maps to: processing_time (nanoseconds)
     nats_service::ACTIVE_SERVICES,   // derived from ServiceSet.services
     nats_service::ACTIVE_ENDPOINTS,  // derived from ServiceInfo.endpoints
 ];
diff --git a/lib/runtime/src/service.rs b/lib/runtime/src/service.rs
diff --git a/lib/runtime/src/transports/nats.rs b/lib/runtime/src/transports/nats.rs