diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml
index 2b19741f7b..16ad395046 100644
--- a/deploy/docker-compose.yml
+++ b/deploy/docker-compose.yml
@@ -15,10 +15,8 @@
 # IMPORTANT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
 networks:
-  server:
-    driver: bridge
-  monitoring:
-    driver: bridge
+  dynamo:
+    driver: bridge
 
 # Note that the images are pinned to specific versions to avoid breaking changes.
 services:
@@ -30,8 +28,7 @@ services:
       - 6222:6222
       - 8222:8222  # the endpoints include /varz, /healthz, ...
     networks:
-      - server
-      - monitoring
+      - dynamo
 
   etcd-server:
     image: bitnamilegacy/etcd:3.6.1
@@ -41,9 +38,7 @@ services:
       - 2379:2379  # this port exposes the /metrics endpoint
       - 2380:2380
     networks:
-      - server
-      - monitoring
-
+      - dynamo
 
 # All the services below are part of the metrics profile and monitoring network.
 # The exporter translates from /varz and other stats to Prometheus metrics
@@ -53,7 +48,7 @@ services:
     ports:
       - 7777:7777
     networks:
-      - monitoring
+      - dynamo
     profiles: [metrics]
     depends_on:
      - nats-server
@@ -84,7 +79,7 @@ services:
       - DCGM_EXPORTER_LISTEN=:9401
     runtime: nvidia  # Specify the NVIDIA runtime
     networks:
-      - monitoring
+      - dynamo
 
 # To access Prometheus from another machine, you may need to disable the firewall on your host. On Ubuntu:
 # sudo ufw allow 9090/tcp
@@ -104,7 +99,7 @@ services:
     # Example to pull from the /query endpoint:
     # {__name__=~"DCGM.*", job="dcgm-exporter"}
     networks:
-      - monitoring
+      - dynamo
     ports:
       - "9090:9090"
     profiles: [metrics]
@@ -143,7 +138,7 @@ services:
     ports:
       - "3001:3001"
     networks:
-      - monitoring
+      - dynamo
     profiles: [metrics]
     depends_on:
       - prometheus
diff --git a/examples/multimodal/components/processor.py b/examples/multimodal/components/processor.py
index b972220f5c..718aa83c23 100644
--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -15,6 +15,7 @@
 import argparse
 import asyncio
+import copy
 import json
 import logging
 import os
@@ -22,7 +23,7 @@
 import sys
 import uuid
 from enum import Enum
-from typing import AsyncIterator, Tuple, Union
+from typing import AsyncIterator, Optional, Tuple, Union
 
 import uvloop
 from transformers import AutoTokenizer
@@ -32,11 +33,11 @@
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import FlexibleArgumentParser
 
-from dynamo.llm import ModelInput, ModelType, register_llm
+from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import Client, DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
+from dynamo._core import parse_tool_calls_py
 
-# To import example local module
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
 from utils.args import Config, base_parse_args, parse_endpoint
 from utils.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
@@ -65,7 +66,7 @@ class Processor(ProcessMixIn):
     def parse_args(cls) -> Tuple[argparse.Namespace, Config]:
         DYN_NAMESPACE = os.environ.get("DYN_NAMESPACE", "dynamo")
         DEFAULT_ENDPOINT = f"dyn://{DYN_NAMESPACE}.processor.generate"
-        DEFAULT_DOWNSTREAM_ENDPOINT = f"dyn://{DYN_NAMESPACE}.encoder.generate"
+        DEFAULT_DOWNSTREAM_ENDPOINT = f"dyn://{DYN_NAMESPACE}.llm.generate"
 
         parser = FlexibleArgumentParser(
             description="vLLM based processor for Dynamo LLM."
@@ -93,7 +94,7 @@ def parse_args(cls) -> Tuple[argparse.Namespace, Config]:
             "--downstream-endpoint",
             type=str,
             default=DEFAULT_DOWNSTREAM_ENDPOINT,
-            help=f"The endpoint string of the downstream encoder in 'dyn://namespace.component.endpoint' format. Default: '{DEFAULT_DOWNSTREAM_ENDPOINT}'",
+            help=f"The endpoint string of the downstream LLM worker in 'dyn://namespace.component.endpoint' format. Default: '{DEFAULT_DOWNSTREAM_ENDPOINT}'",
         )
 
         args, config = base_parse_args(parser)
@@ -104,23 +105,26 @@ def __init__(
         self,
         args: argparse.Namespace,
         engine_args: AsyncEngineArgs,
-        encode_worker_client: Client,
+        llm_worker_client: Client,
+        custom_template_path: Optional[str] = None,
+        tool_call_parser: Optional[str] = None,
     ):
-        self.encode_worker_client = encode_worker_client
+        self.llm_worker_client = llm_worker_client
         self.prompt_template = args.prompt_template
         self.engine_args = engine_args
         self.model_config = self.engine_args.create_model_config()
         self.default_sampling_params = self.model_config.get_diff_sampling_param()
-        self.tokenizer = self._create_tokenizer(self.engine_args)
+        self.tokenizer = self._create_tokenizer(self.engine_args, custom_template_path)
         self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
         self.completions_processor = CompletionsProcessor(
             self.tokenizer, self.model_config
         )
+        self.tool_call_parser = tool_call_parser
 
     def cleanup(self):
         pass
 
-    def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
+    def _create_tokenizer(self, engine_args: AsyncEngineArgs, custom_template_path: Optional[str] = None) -> AnyTokenizer:
         """Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
         model_path = engine_args.model
@@ -132,6 +136,16 @@ def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
             truncation_side="left",
             use_fast=True,  # VLLM might use the fast tokenizer for efficiency
         )
+        # Store custom template path but DON'T set it as default on the tokenizer
+        # We'll apply it conditionally using shallow copy (thread-safe)
+        # The template itself handles whether tools are present or not with {% if tools %} logic
+        if custom_template_path:
+            logger.info(f"Custom chat template path: {custom_template_path}")
+            with open(custom_template_path, 'r') as f:
+                self.custom_tool_template = f.read()
+            logger.info("Custom chat template loaded (will be used for all requests when specified)")
+        else:
+            self.custom_tool_template = None
         return base_tokenizer
 
     # Main method to parse the request and send the request to the vllm worker.
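For reference, the shallow-copy pattern relied on above works because `chat_template` is a plain instance attribute: mutating it on a `copy.copy()` of the tokenizer leaves the shared instance untouched and skips deep-copying large state such as the vocabulary. A minimal sketch, not part of the patch, using a hypothetical `DummyTokenizer` stand-in rather than a real Hugging Face tokenizer:

```python
import copy


class DummyTokenizer:
    """Hypothetical stand-in for a Hugging Face tokenizer."""

    def __init__(self):
        self.vocab = {f"tok{i}": i for i in range(100_000)}  # large shared state
        self.chat_template = None  # default template used for ordinary requests


base = DummyTokenizer()

# Per-request override: shallow copy, then mutate only the copy's chat_template.
per_request = copy.copy(base)
per_request.chat_template = "{% if tools %}...{% endif %}"

assert per_request.vocab is base.vocab  # heavy state is shared, not duplicated
assert base.chat_template is None       # concurrent requests still see the default
```

Real tokenizers behave the same way, which is what keeps the per-request override in `generate` safe under concurrency as long as the shared tokenizer itself is never mutated.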
@@ -142,7 +156,7 @@ async def _generate(
         request_type: RequestType,
     ):
         request_id = str(uuid.uuid4().hex)
-        logger.debug(f"Got raw request: {raw_request}")
+        logger.info(f"Got raw request: {raw_request}")
         (
             request,
             conversation,
@@ -162,7 +176,7 @@ async def _generate(
         # This API could accept Pydantic class, but SamplingParams
         # in vLLMMultimodalRequest is not a Pydantic class and will
         # cause TypeError: unsupported type SamplingParams
-        response_generator = await self.encode_worker_client.round_robin(
+        response_generator = await self.llm_worker_client.round_robin(
             worker_request.model_dump_json()
         )
@@ -211,35 +225,143 @@ async def generate(self, raw_request: MultiModalRequest):
             # If the request is not MultiModalRequest, convert it to MultiModalRequest
             raw_request = MultiModalRequest.model_validate(raw_request)
 
-        # Ensure the configured template includes the placeholder
-        template = self.prompt_template
-        if "<prompt>" not in template:
-            raise ValueError("prompt_template must contain '<prompt>' placeholder")
-
-        # Safely extract user text
-        try:
-            user_text = raw_request.messages[0].content[0].text
-        except (IndexError, AttributeError) as e:
-            raise ValueError(f"Invalid message structure: {e}")
-
-        prompt = template.replace("<prompt>", user_text)
-
-        msg = {
-            "role": "user",
-            "content": prompt,
-        }
-
-        # Set stream=True - the http frontend will handle aggregation of
-        # streamed chunks into a single http response, or stream them
-        # back as SSE responses based on the stream flag in the request.
-        chat_request = ChatCompletionRequest(
-            model=raw_request.model,
-            messages=[msg],
-            stream=True,
-            max_tokens=raw_request.max_tokens,
-            temperature=raw_request.temperature,
-            request_id=str(uuid.uuid4()),
-        )
+        # If tools are provided, apply the chat template with tools
+        # We need to apply the Jinja template but NOT process images (keep them as URLs)
+        if raw_request.tools and len(raw_request.tools) > 0:
+            # Convert messages and tools to dicts for template rendering
+            messages_for_template = []
+            for msg in raw_request.messages:
+                # Flatten multi-part content into a single text string for template
+                content_parts = []
+                for content in msg.content:
+                    if content.type == "text":
+                        content_parts.append(content.text)
+                    elif content.type == "image_url":
+                        # Use Qwen's vision tokens
+                        content_parts.append("<|vision_start|><|image_pad|><|vision_end|>")
+                    elif content.type == "video_url":
+                        # Use similar format for video if needed
+                        content_parts.append("<|vision_start|><|video_pad|><|vision_end|>")
+
+                messages_for_template.append({
+                    "role": msg.role,
+                    "content": "".join(content_parts)  # Join without spaces to keep tokens together
+                })
+
+            # Convert tools to dicts
+            tools_dicts = [tool.model_dump() if hasattr(tool, 'model_dump') else tool for tool in raw_request.tools]
+
+            # Apply the custom tool calling template
+            if self.custom_tool_template:
+                # THREAD-SAFE: Create a shallow copy to avoid race conditions when mutating chat_template
+                # Shallow copy is fast (~1-5 µs) and only copies attribute references, not large objects like vocab
+                temp_tokenizer = copy.copy(self.tokenizer)
+                temp_tokenizer.chat_template = self.custom_tool_template
+                prompt = temp_tokenizer.apply_chat_template(
+                    messages_for_template,
+                    tools=tools_dicts,
+                    add_generation_prompt=True,
+                    tokenize=False
+                )
+            else:
+                # Use default tokenizer template with tools
+                prompt = self.tokenizer.apply_chat_template(
+                    messages_for_template,
+                    tools=tools_dicts,
+                    add_generation_prompt=True,
+                    tokenize=False
+                )
+
+            # Create a simple message with the formatted prompt (like the original path)
+            msg = {
+                "role": "user",
+                "content": prompt,
+            }
+
+            chat_request = ChatCompletionRequest(
+                model=raw_request.model,
+                messages=[msg],
+                stream=raw_request.stream,
+                max_tokens=raw_request.max_tokens,
+                temperature=raw_request.temperature,
+                request_id=str(uuid.uuid4()),
+                tools=raw_request.tools,
+                tool_choice=raw_request.tool_choice,
+            )
+        else:
+            # Check if this is a multimodal request (has images/video)
+            has_multimodal = any(
+                item.type in ["image_url", "video_url"]
+                for msg in raw_request.messages
+                for item in msg.content
+            )
+
+            if has_multimodal:
+                # Original path: manual template replacement for multimodal non-tool calls
+                # Ensure the configured template includes the placeholder
+                template = self.prompt_template
+                if "<prompt>" not in template:
+                    raise ValueError("prompt_template must contain '<prompt>' placeholder")
+
+                # Extract all text from content items (handles image-only, text-only, or mixed)
+                text_parts = []
+                for item in raw_request.messages[0].content:
+                    if item.type == "text":
+                        text_parts.append(item.text)
+
+                # Use empty string if no text (image-only case)
+                user_text = " ".join(text_parts) if text_parts else ""
+
+                prompt = template.replace("<prompt>", user_text)
+
+                msg = {
+                    "role": "user",
+                    "content": prompt,
+                }
+
+                chat_request = ChatCompletionRequest(
+                    model=raw_request.model,
+                    messages=[msg],
+                    stream=raw_request.stream,
+                    max_tokens=raw_request.max_tokens,
+                    temperature=raw_request.temperature,
+                    request_id=str(uuid.uuid4()),
+                    tools=raw_request.tools,
+                    tool_choice=raw_request.tool_choice,
+                )
+            else:
+                # Text-only chat: use tokenizer's chat template
+                messages_for_template = []
+                for msg in raw_request.messages:
+                    # Flatten content to string
+                    content_text = " ".join([item.text for item in msg.content if item.type == "text"])
+                    messages_for_template.append({
+                        "role": msg.role,
+                        "content": content_text
+                    })
+
+                # Apply chat template
+                prompt = self.tokenizer.apply_chat_template(
+                    messages_for_template,
+                    add_generation_prompt=True,
+                    tokenize=False
+                )
+
+                msg = {
+                    "role": "user",
+                    "content": prompt,
+                }
+
+                chat_request = ChatCompletionRequest(
+                    model=raw_request.model,
+                    messages=[msg],
+                    stream=raw_request.stream,
+                    max_tokens=raw_request.max_tokens,
+                    temperature=raw_request.temperature,
+                    request_id=str(uuid.uuid4()),
+                    tools=raw_request.tools,
+                    tool_choice=raw_request.tool_choice,
+                )
 
         multimodal_input = MultiModalInput()
         for message in raw_request.messages:
@@ -251,9 +373,13 @@ async def generate(self, raw_request: MultiModalRequest):
                     raise ValueError("Cannot provide both image and video URLs")
                 multimodal_input.video_url = item.video_url.url
 
-        if multimodal_input.image_url is None and multimodal_input.video_url is None:
-            raise ValueError("Either image URL or video URL is required")
+        # Allow text-only messages (no image/video required)
+        # This enables both pure text chat and multimodal use cases
 
+        # Buffer chunks when tool calling is enabled to clear content after parsing
+        accumulated_content = ""
+        buffered_chunks = []
+
         async for response in self._generate(
             chat_request, multimodal_input, RequestType.CHAT
         ):
@@ -263,8 +389,86 @@ async def generate(self, raw_request: MultiModalRequest):
             # reconstructing back the OpenAI chat response as dynamo egress expects it
             if response.startswith("data: [DONE]"):
                 break
-            response = json.loads(response.lstrip("data: "))
-            yield response
+
+            # Handle both streaming (with "data: " prefix) and non-streaming responses
+            if response.startswith("data: "):
+                response = json.loads(response.lstrip("data: "))
+            else:
+                response = json.loads(response)
+                # Convert non-streaming format (message) to streaming format (delta)
+                if "choices" in response and "message" in response["choices"][0]:
+                    message_content = response["choices"][0]["message"]["content"]
+                    response["choices"][0]["delta"] = {"content": message_content, "role": "assistant"}
+                    del response["choices"][0]["message"]
+                    response["object"] = "chat.completion.chunk"
+
+            # Buffer chunks and accumulate content when tool calling is configured
+            if (
+                self.tool_call_parser
+                and raw_request.tools
+                and "choices" in response
+                and len(response["choices"]) > 0
+            ):
+                choice = response["choices"][0]
+
+                # Buffer this chunk
+                buffered_chunks.append(response)
+
+                # Accumulate delta content
+                if "delta" in choice and choice["delta"].get("content"):
+                    accumulated_content += choice["delta"]["content"]
+
+                # Parse when we hit the end (finish_reason is set)
+                finish_reason = choice.get("finish_reason")
+                if finish_reason == "stop":
+                    if accumulated_content:
+                        logger.info(f"Attempting to parse accumulated tool calls (length={len(accumulated_content)}) with parser: {self.tool_call_parser}")
+                        try:
+                            tool_calls, normal_text = parse_tool_calls_py(accumulated_content, self.tool_call_parser)
+                            logger.info(f"Parse result: {len(tool_calls) if tool_calls else 0} tool calls found")
+
+                            if tool_calls:
+                                # Convert tool calls to OpenAI format
+                                tool_call_chunks = []
+                                for idx, tc in enumerate(tool_calls):
+                                    tool_call_chunks.append({
+                                        "index": idx,
+                                        "id": tc["id"],
+                                        "type": tc["type"],
+                                        "function": {
+                                            "name": tc["function"]["name"],
+                                            "arguments": tc["function"]["arguments"]
+                                        }
+                                    })
+
+                                # Clear content from ALL buffered chunks (per OpenAI spec)
+                                for buffered_chunk in buffered_chunks:
+                                    if "choices" in buffered_chunk and len(buffered_chunk["choices"]) > 0:
+                                        buffered_choice = buffered_chunk["choices"][0]
+                                        if "delta" in buffered_choice:
+                                            buffered_choice["delta"]["content"] = ""
+                                        elif "message" in buffered_choice:
+                                            buffered_choice["message"]["content"] = ""
+
+                                # Add tool_calls to the final chunk
+                                if "delta" in choice:
+                                    choice["delta"]["tool_calls"] = tool_call_chunks
+                                elif "message" in choice:
+                                    choice["message"]["tool_calls"] = tool_call_chunks
+
+                                choice["finish_reason"] = "tool_calls"
+                                logger.info(f"Cleared content from {len(buffered_chunks)} chunks and added {len(tool_calls)} tool call(s) to final chunk")
+                        except Exception as e:
+                            logger.warning(f"Failed to parse tool calls: {e}", exc_info=True)
+                            # Continue with original response if parsing fails
+
+                    # Yield all buffered chunks now that we've processed them
+                    for chunk in buffered_chunks:
+                        yield chunk
+                    buffered_chunks = []
+            else:
+                # No tool calling, yield immediately
+                yield response
 
 
 async def graceful_shutdown(runtime):
@@ -311,26 +515,39 @@ async def init(runtime: DistributedRuntime, args: argparse.Namespace, config: Co
     parsed_namespace, parsed_component_name, parsed_endpoint_name = parse_endpoint(
         args.downstream_endpoint
     )
-    encode_worker_client = (
+    llm_worker_client = (
         await runtime.namespace(parsed_namespace)
         .component(parsed_component_name)
         .endpoint(parsed_endpoint_name)
        .client()
     )
-    handler = Processor(args, config.engine_args, encode_worker_client)
+    handler = Processor(
+        args,
+        config.engine_args,
+        llm_worker_client,
+        config.custom_jinja_template,
+        config.tool_call_parser,
+    )
 
-    logger.info("Waiting for Encoder Worker Instances ...")
-    await encode_worker_client.wait_for_instances()
+    logger.info("Waiting for LLM Worker Instances ...")
+    await llm_worker_client.wait_for_instances()
 
     # Register the endpoint as entrypoint to a model
+    logger.info(f"Config: {config.tool_call_parser}, {config.reasoning_parser}, {config.custom_jinja_template}")
+    runtime_config = ModelRuntimeConfig()
+    runtime_config.tool_call_parser = config.tool_call_parser
+    runtime_config.reasoning_parser = config.reasoning_parser
+
     await register_llm(
-        ModelInput.Text,  # Custom processor is used and this type bypasses SDK processor
+        ModelInput.Text,
         ModelType.Chat,
         generate_endpoint,
         config.model,
         config.served_model_name,
         kv_cache_block_size=config.engine_args.block_size,
+        runtime_config=runtime_config,
+        custom_template_path=config.custom_jinja_template,
     )
 
     logger.info(f"Starting to serve the {args.endpoint} endpoint...")
diff --git a/examples/multimodal/components/worker.py b/examples/multimodal/components/worker.py
index c3258fdbfb..6e17311ab2 100644
--- a/examples/multimodal/components/worker.py
+++ b/examples/multimodal/components/worker.py
@@ -44,7 +44,7 @@ class VllmBaseWorker:
     @classmethod
     def parse_args(cls) -> Tuple[argparse.Namespace, Config]:
         parser = FlexibleArgumentParser(
-            description="vLLM based encoder for Dynamo LLM."
+            description="vLLM based worker for Dynamo LLM."
         )
         parser.add_argument(
             "--endpoint",
@@ -270,38 +270,42 @@ async def generate(self, request: vLLMMultimodalRequest):
             request.multimodal_input.image_url is None
             and request.multimodal_input.video_url is None
         ):
-            # Process embeddings using the connector
-            # Create a descriptor based on the embedding shape.
-            embeddings = torch.empty(
-                request.embeddings_shape,
-                dtype=self.EMBEDDINGS_DTYPE,
-                device=self.EMBEDDINGS_DEVICE,
-            )
-            descriptor = connect.Descriptor(embeddings)
-
-            if descriptor is None:
-                raise RuntimeError(
-                    "Descriptor is None in PD worker - cannot process embeddings"
+            # Check if embeddings are provided via connector (for disaggregated serving)
+            if request.embeddings_shape is not None:
+                # Process embeddings using the connector
+                # Create a descriptor based on the embedding shape.
+                embeddings = torch.empty(
+                    request.embeddings_shape,
+                    dtype=self.EMBEDDINGS_DTYPE,
+                    device=self.EMBEDDINGS_DEVICE,
                 )
-
-            read_op = await self._connector.begin_read(
-                request.serialized_request, descriptor
-            )
-            await read_op.wait_for_completion()
-            if "video" in self.engine_args.model.lower():
-                video_numpy = embeddings.numpy()
-                multi_modal_data = construct_mm_data(
-                    self.engine_args.model,
-                    self.EMBEDDINGS_DTYPE,
-                    video_numpy=video_numpy,
+                descriptor = connect.Descriptor(embeddings)
+                if descriptor is None:
+                    raise RuntimeError(
+                        "Descriptor is None in PD worker - cannot process embeddings"
+                    )
+
+                read_op = await self._connector.begin_read(
+                    request.serialized_request, descriptor
                 )
+                await read_op.wait_for_completion()
+                if "video" in self.engine_args.model.lower():
+                    video_numpy = embeddings.numpy()
+                    multi_modal_data = construct_mm_data(
+                        self.engine_args.model,
+                        self.EMBEDDINGS_DTYPE,
+                        video_numpy=video_numpy,
+                    )
+                else:
+                    multi_modal_data = construct_mm_data(
+                        self.engine_args.model,
+                        self.EMBEDDINGS_DTYPE,
+                        image_embeds=embeddings,
+                        image_grid_thw=request.image_grid_thw,
+                    )
             else:
-                multi_modal_data = construct_mm_data(
-                    self.engine_args.model,
-                    self.EMBEDDINGS_DTYPE,
-                    image_embeds=embeddings,
-                    image_grid_thw=request.image_grid_thw,
-                )
+                # Text-only request: no multimodal data
+                multi_modal_data = None
         else:
             # Use PIL image instead of image embeddings
             multi_modal_data = {
diff --git a/examples/multimodal/launch/agg.sh b/examples/multimodal/launch/agg.sh
index 8a5a908142..d48afeb996 100755
--- a/examples/multimodal/launch/agg.sh
+++ b/examples/multimodal/launch/agg.sh
@@ -8,6 +8,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
 MODEL_NAME="llava-hf/llava-1.5-7b-hf"
 PROMPT_TEMPLATE="USER: <image>\n<prompt> ASSISTANT:"
 PROVIDED_PROMPT_TEMPLATE=""
+TOOL_CALL_PARSER=""
+CUSTOM_TEMPLATE=""
 
 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
@@ -20,12 +22,30 @@ while [[ $# -gt 0 ]]; do
             PROVIDED_PROMPT_TEMPLATE=$2
             shift 2
             ;;
+        --dyn-tool-call-parser)
+            TOOL_CALL_PARSER=$2
+            shift 2
+            ;;
+        --custom-jinja-template)
+            CUSTOM_TEMPLATE=$2
+            shift 2
+            ;;
         -h|--help)
             echo "Usage: $0 [OPTIONS]"
             echo "Options:"
-            echo "  --model              Specify the model to use (default: $MODEL_NAME)"
-            echo "  --prompt-template
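The diff is truncated above, mid help text. Purely for orientation (not part of the patch), a hypothetical invocation of the updated launch script: the two new flags come from the argument parsing added above, while the model name, parser name, and template path are placeholders.

```bash
# Hypothetical usage sketch: flag names are from the patch, values are placeholders.
cd examples/multimodal
./launch/agg.sh \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dyn-tool-call-parser hermes \
  --custom-jinja-template /path/to/tool_chat_template.jinja
```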