ai-dynamo · ishandhanani · Oct 2, 2025 · Oct 3, 2025 · Oct 3, 2025 · Oct 3, 2025
diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
@@ -117,9 +117,7 @@ uv pip install maturin
 cd $DYNAMO_HOME/lib/bindings/python
 maturin develop --uv
 cd $DYNAMO_HOME
-# installs sglang supported version along with dynamo
-# include the prerelease flag to install flashinfer rc versions
-uv pip install --prerelease=allow -e .[sglang]
+uv pip install -e .[sglang]
 ```
 
 </details>

@@ -26,6 +26,7 @@
     MultimodalPrefillWorkerHandler,
     MultimodalProcessorHandler,
     MultimodalWorkerHandler,
+    NativeApiHandler,
     PrefillWorkerHandler,
 )
 
@@ -74,7 +75,13 @@ async def init(runtime: DistributedRuntime, config: Config):
 
     generate_endpoint = component.endpoint(dynamo_args.endpoint)
 
+    # publisher instantiates the metrics and kv event publishers
+    publisher, metrics_task, metrics_labels = await setup_sgl_metrics(
+        engine, config, component, generate_endpoint
+    )
+
     prefill_client = None
+    native_api_tasks = []
     if config.serving_mode == DisaggregationMode.DECODE:
         logging.info("Initializing prefill client")
         prefill_client = (
@@ -83,11 +90,11 @@ async def init(runtime: DistributedRuntime, config: Config):
             .endpoint("generate")
             .client()
         )
-
-    # publisher instantiates the metrics and kv event publishers
-    publisher, metrics_task, metrics_labels = await setup_sgl_metrics(
-        engine, config, component, generate_endpoint
-    )
+    # TODO: implement the remaining native APIs behind a clean layer that covers agg/disagg/etc.
+    if config.serving_mode == DisaggregationMode.AGGREGATED:
+        native_api_tasks = await NativeApiHandler(
+            component, engine, metrics_labels
+        ).init_native_apis()
 
     # Readiness gate: requests wait until model is registered
     ready_event = asyncio.Event()
@@ -97,7 +104,6 @@ async def init(runtime: DistributedRuntime, config: Config):
     health_check_payload = SglangHealthCheckPayload(engine).to_dict()
 
     try:
-        # Start endpoint immediately and register model concurrently
         # Requests queue until ready_event is set
         await asyncio.gather(
             generate_endpoint.serve_endpoint(
@@ -113,6 +119,7 @@ async def init(runtime: DistributedRuntime, config: Config):
                 dynamo_args,
                 readiness_gate=ready_event,
             ),
+            *native_api_tasks,
         )
     except Exception as e:
         logging.error(f"Failed to serve endpoints: {e}")

@@ -17,6 +17,7 @@
     MultimodalProcessorHandler,
     MultimodalWorkerHandler,
 )
+from .native_api_handler import NativeApiHandler
 
 __all__ = [
     "BaseWorkerHandler",
@@ -28,6 +29,7 @@
     # Multimodal handlers
     "MultimodalEncodeWorkerHandler",
     "MultimodalPrefillWorkerHandler",
+    "NativeApiHandler",
     "MultimodalProcessorHandler",
     "MultimodalWorkerHandler",
 ]
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# SGLang Native APIs: https://docs.sglang.ai/basic_usage/native_api.html
+# Code: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py
+
+import asyncio
+import logging
+from typing import List, Optional, Tuple
+
+import sglang as sgl
+
+from dynamo._core import Component
+
+
+class NativeApiHandler:
+    """Adds SGLang native API endpoints (currently `get_model_info`) to a worker."""
+
+    def __init__(
+        self,
+        component: Component,
+        engine: sgl.Engine,
+        metrics_labels: Optional[List[Tuple[str, str]]] = None,
+    ):
+        self.component = component
+        self.engine = engine
+        self.metrics_labels = metrics_labels
+        self.native_api_tasks = []
+        # Resolved up front so get_model_info never runs without `tm` being set.
+        self.tm = engine.tokenizer_manager
+
+    async def init_native_apis(
+        self,
+    ) -> List[asyncio.Task]:
+        """
+        Create the serving awaitable for each native API endpoint.
+
+        Nothing is awaited here; the caller is expected to gather the returned list.
+        """
+        logging.info("Initializing native SGLang API endpoints")
+
+        model_info_ep = self.component.endpoint("get_model_info")
+        # One entry per native API; each is the awaitable returned by serve_endpoint().
+        tasks = [
+            model_info_ep.serve_endpoint(
+                self.get_model_info,
+                graceful_shutdown=True,
+                metrics_labels=self.metrics_labels,
+                http_endpoint_path="/get_model_info",
+            ),
+        ]
+
+        self.native_api_tasks = tasks
+        logging.info(f"Registered {len(tasks)} native API endpoints")
+        return tasks
+
+    async def get_model_info(self, request: dict):
+        """Yield one response with model metadata taken from the server args."""
+        _ = request  # the request payload is intentionally unused
+        server_args = self.tm.server_args
+        result = {
+            "model_path": server_args.model_path,
+            "tokenizer_path": server_args.tokenizer_path,
+            "preferred_sampling_params": server_args.preferred_sampling_params,
+            "weight_version": server_args.weight_version,
+        }
+
+        yield {"data": [result]}
@@ -643,14 +643,15 @@ impl Component {
 
 #[pymethods]
 impl Endpoint {
-    #[pyo3(signature = (generator, graceful_shutdown = true, metrics_labels = None, health_check_payload = None))]
+    #[pyo3(signature = (generator, graceful_shutdown = true, metrics_labels = None, health_check_payload = None, http_endpoint_path = None))]
     fn serve_endpoint<'p>(
         &self,
         py: Python<'p>,
         generator: PyObject,
         graceful_shutdown: Option<bool>,
         metrics_labels: Option<Vec<(String, String)>>,
         health_check_payload: Option<&Bound<'p, PyDict>>,
+        http_endpoint_path: Option<&str>,
     ) -> PyResult<Bound<'p, PyAny>> {
         let engine = Arc::new(engine::PythonAsyncEngine::new(
             generator,
@@ -688,6 +689,10 @@ impl Endpoint {
             builder = builder.health_check_payload(payload);
         }
 
+        if let Some(http_endpoint_path) = http_endpoint_path {
+            builder = builder.http_endpoint_path(http_endpoint_path);
+        }
+
         let graceful_shutdown = graceful_shutdown.unwrap_or(true);
         pyo3_async_runtimes::tokio::future_into_py(py, async move {
             builder

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
@@ -115,7 +115,7 @@ class Endpoint:
 
     ...
 
-    async def serve_endpoint(self, handler: RequestHandler, graceful_shutdown: bool = True, metrics_labels: Optional[List[Tuple[str, str]]] = None, health_check_payload: Optional[Dict[str, Any]] = None) -> None:
+    async def serve_endpoint(self, handler: RequestHandler, graceful_shutdown: bool = True, metrics_labels: Optional[List[Tuple[str, str]]] = None, health_check_payload: Optional[Dict[str, Any]] = None, http_endpoint_path: Optional[str] = None) -> None:
         """
         Serve an endpoint discoverable by all connected clients at
         `{{ namespace }}/components/{{ component_name }}/endpoints/{{ endpoint_name }}`

@@ -17,6 +17,7 @@ use crate::{
         completions::{NvCreateCompletionRequest, NvCreateCompletionResponse},
     },
 };
+use dynamo_runtime::component::INSTANCE_ROOT_PATH;
 use dynamo_runtime::transports::etcd;
 use dynamo_runtime::{DistributedRuntime, Runtime};
 use dynamo_runtime::{distributed::DistributedConfig, pipeline::RouterMode};
@@ -55,11 +56,10 @@ pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Resul
     let http_service = match engine_config {
         EngineConfig::Dynamic(_) => {
             let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
-            let etcd_client = distributed_runtime.etcd_client();
             // This allows the /health endpoint to query etcd for active instances
-            http_service_builder = http_service_builder.with_etcd_client(etcd_client.clone());
+            http_service_builder = http_service_builder.with_drt(Some(distributed_runtime.clone()));
             let http_service = http_service_builder.build()?;
-            match etcd_client {
+            match distributed_runtime.etcd_client() {
                 Some(ref etcd_client) => {
                     let router_config = engine_config.local_model().router_config();
                     // Listen for models registering themselves in etcd, add them to HTTP service
@@ -71,7 +71,7 @@ pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Resul
                     } else {
                         Some(namespace.to_string())
                     };
-                    run_watcher(
+                    run_model_watcher(
                         distributed_runtime,
                         http_service.state().manager_clone(),
                         etcd_client.clone(),
@@ -84,6 +84,10 @@ pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Resul
                         http_service.state().metrics_clone(),
                     )
                     .await?;
+
+                    // Start dynamic HTTP endpoint watcher
+                    run_endpoint_watcher(etcd_client.clone(), Arc::new(http_service.clone()))
+                        .await?;
                 }
                 None => {
                     // Static endpoints don't need discovery
@@ -221,7 +225,7 @@ pub async fn run(runtime: Runtime, engine_config: EngineConfig) -> anyhow::Resul
 /// Spawns a task that watches for new models in etcd at network_prefix,
 /// and registers them with the ModelManager so that the HTTP service can use them.
 #[allow(clippy::too_many_arguments)]
-async fn run_watcher(
+async fn run_model_watcher(
     runtime: DistributedRuntime,
     model_manager: Arc<ModelManager>,
     etcd_client: etcd::Client,
@@ -265,6 +269,24 @@ async fn run_watcher(
     Ok(())
 }
 
+/// Starts a background task that streams etcd instance records under `INSTANCE_ROOT_PATH`
+/// into the HTTP service's dynamic registry; does nothing when the service has no registry.
+async fn run_endpoint_watcher(
+    etcd_client: etcd::Client,
+    http_service: Arc<HttpService>,
+) -> anyhow::Result<()> {
+    if let Some(registry) = http_service.state().dynamic_registry() {
+        let (_prefix, _watcher, instance_events) = etcd_client
+            .kv_get_and_watch_prefix(INSTANCE_ROOT_PATH)
+            .await?
+            .dissolve();
+        tokio::spawn(async move {
+            registry.watch(instance_events).await;
+        });
+    }
+    Ok(())
+}
+
 /// Updates HTTP service endpoints based on available model types
 fn update_http_endpoints(service: Arc<HttpService>, model_type: ModelUpdate) {
     tracing::debug!(

@@ -21,6 +21,8 @@
 mod openai;
 
 pub mod disconnect;
+pub mod dynamic_endpoint;
+pub mod dynamic_registry;
 pub mod error;
 pub mod health;
 pub mod metrics;