Commit 5fa97b5

Add max_output_tokens as argument to Response API
The OpenAI Responses and Completions APIs both support a max_output_tokens field, but it was missing from both the create request and the response object in the Responses API. This PR adds it.

Fixes: #3562

Signed-off-by: Abhishek Bongale <[email protected]>
1 parent 92219fd commit 5fa97b5
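
With this change, max_output_tokens can be passed straight through the OpenAI-compatible Responses endpoint. A minimal sketch using the openai Python client; the base_url, api_key, and model id below are placeholder assumptions, not taken from this commit:

```python
# Minimal sketch: passing max_output_tokens through the OpenAI-compatible
# Responses endpoint. base_url, api_key, and model id are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="not-needed")

response = client.responses.create(
    model="llama3.2:3b",  # hypothetical model id
    input="Summarize the Responses API in one sentence.",
    max_output_tokens=64,  # upper bound on generated tokens
)
print(response.output_text)
```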

14 files changed: +127 −20 lines

docs/docs/providers/agents/index.mdx

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 ---
 description: "Agents

-  APIs for creating and interacting with agentic systems."
+  APIs for creating and interacting with agentic systems."
 sidebar_label: Agents
 title: Agents
 ---
@@ -12,6 +12,6 @@ title: Agents

 Agents

-APIs for creating and interacting with agentic systems.
+APIs for creating and interacting with agentic systems.

 This section contains documentation for all available providers for the **agents** API.

(In this and the two docs diffs below, removed and added lines appear identical; the changes to these generated docs are whitespace-only.)
Lines changed: 12 additions & 12 deletions

@@ -1,14 +1,14 @@
 ---
 description: "The Batches API enables efficient processing of multiple requests in a single operation,
-  particularly useful for processing large datasets, batch evaluation workflows, and
-  cost-effective inference at scale.
+  particularly useful for processing large datasets, batch evaluation workflows, and
+  cost-effective inference at scale.

-  The API is designed to allow use of openai client libraries for seamless integration.
+  The API is designed to allow use of openai client libraries for seamless integration.

-  This API provides the following extensions:
-  - idempotent batch creation
+  This API provides the following extensions:
+  - idempotent batch creation

-  Note: This API is currently under active development and may undergo changes."
+  Note: This API is currently under active development and may undergo changes."
 sidebar_label: Batches
 title: Batches
 ---
@@ -18,14 +18,14 @@ title: Batches
 ## Overview

 The Batches API enables efficient processing of multiple requests in a single operation,
-particularly useful for processing large datasets, batch evaluation workflows, and
-cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.

-The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.

-This API provides the following extensions:
-- idempotent batch creation
+This API provides the following extensions:
+- idempotent batch creation

-Note: This API is currently under active development and may undergo changes.
+Note: This API is currently under active development and may undergo changes.

 This section contains documentation for all available providers for the **batches** API.
Lines changed: 6 additions & 6 deletions

@@ -1,9 +1,9 @@
 ---
 description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.

-  This API provides the raw interface to the underlying models. Two kinds of models are supported:
-  - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
-  - Embedding models: these models generate embeddings to be used for semantic search."
+  This API provides the raw interface to the underlying models. Two kinds of models are supported:
+  - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+  - Embedding models: these models generate embeddings to be used for semantic search."
 sidebar_label: Inference
 title: Inference
 ---
@@ -14,8 +14,8 @@ title: Inference

 Llama Stack Inference API for generating completions, chat completions, and embeddings.

-This API provides the raw interface to the underlying models. Two kinds of models are supported:
-- LLM models: these models generate "raw" and "chat" (conversational) completions.
-- Embedding models: these models generate embeddings to be used for semantic search.
+This API provides the raw interface to the underlying models. Two kinds of models are supported:
+- LLM models: these models generate "raw" and "chat" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.

 This section contains documentation for all available providers for the **inference** API.

docs/static/deprecated-llama-stack-spec.html

Lines changed: 11 additions & 0 deletions

@@ -9096,6 +9096,10 @@
         "type": "string",
         "description": "(Optional) Truncation strategy applied to the response"
       },
+      "max_output_tokens": {
+        "type": "integer",
+        "description": "(Optional) Upper bound for response tokens generation"
+      },
       "input": {
         "type": "array",
         "items": {
@@ -9914,6 +9918,9 @@
       },
       "max_infer_iters": {
         "type": "integer"
+      },
+      "max_output_tokens": {
+        "type": "integer"
       }
     },
     "additionalProperties": false,
@@ -9983,6 +9990,10 @@
       "truncation": {
         "type": "string",
         "description": "(Optional) Truncation strategy applied to the response"
+      },
+      "max_output_tokens": {
+        "type": "integer",
+        "description": "(Optional) Upper bound for response tokens generation"
       }
     },
     "additionalProperties": false,

docs/static/deprecated-llama-stack-spec.yaml

Lines changed: 10 additions & 0 deletions

@@ -6740,6 +6740,10 @@ components:
       type: string
       description: >-
         (Optional) Truncation strategy applied to the response
+    max_output_tokens:
+      type: integer
+      description: >-
+        (Optional) Upper bound for response tokens generation
     input:
       type: array
       items:
@@ -7351,6 +7355,8 @@ components:
         (Optional) Additional fields to include in the response.
     max_infer_iters:
       type: integer
+    max_output_tokens:
+      type: integer
   additionalProperties: false
   required:
     - input
@@ -7414,6 +7420,10 @@ components:
       type: string
       description: >-
         (Optional) Truncation strategy applied to the response
+    max_output_tokens:
+      type: integer
+      description: >-
+        (Optional) Upper bound for response tokens generation
   additionalProperties: false
   required:
     - created_at

docs/static/llama-stack-spec.html

Lines changed: 11 additions & 0 deletions

@@ -7503,6 +7503,10 @@
         "type": "string",
         "description": "(Optional) Truncation strategy applied to the response"
       },
+      "max_output_tokens": {
+        "type": "integer",
+        "description": "(Optional) Upper bound for response tokens generation"
+      },
       "input": {
         "type": "array",
         "items": {
@@ -8009,6 +8013,9 @@
       },
       "max_infer_iters": {
         "type": "integer"
+      },
+      "max_output_tokens": {
+        "type": "integer"
       }
     },
     "additionalProperties": false,
@@ -8078,6 +8085,10 @@
       "truncation": {
         "type": "string",
         "description": "(Optional) Truncation strategy applied to the response"
+      },
+      "max_output_tokens": {
+        "type": "integer",
+        "description": "(Optional) Upper bound for response tokens generation"
       }
     },
     "additionalProperties": false,

docs/static/llama-stack-spec.yaml

Lines changed: 10 additions & 0 deletions

@@ -5660,6 +5660,10 @@ components:
       type: string
       description: >-
         (Optional) Truncation strategy applied to the response
+    max_output_tokens:
+      type: integer
+      description: >-
+        (Optional) Upper bound for response tokens generation
     input:
       type: array
       items:
@@ -6014,6 +6018,8 @@ components:
         (Optional) Additional fields to include in the response.
     max_infer_iters:
       type: integer
+    max_output_tokens:
+      type: integer
   additionalProperties: false
   required:
     - input
@@ -6077,6 +6083,10 @@ components:
       type: string
      description: >-
         (Optional) Truncation strategy applied to the response
+    max_output_tokens:
+      type: integer
+      description: >-
+        (Optional) Upper bound for response tokens generation
   additionalProperties: false
   required:
     - created_at

docs/static/stainless-llama-stack-spec.html

Lines changed: 11 additions & 0 deletions

@@ -9512,6 +9512,10 @@
         "type": "string",
         "description": "(Optional) Truncation strategy applied to the response"
       },
+      "max_output_tokens": {
+        "type": "integer",
+        "description": "(Optional) Upper bound for response tokens generation"
+      },
       "input": {
         "type": "array",
         "items": {
@@ -10018,6 +10022,9 @@
       },
       "max_infer_iters": {
         "type": "integer"
+      },
+      "max_output_tokens": {
+        "type": "integer"
       }
     },
     "additionalProperties": false,
@@ -10087,6 +10094,10 @@
      "truncation": {
         "type": "string",
         "description": "(Optional) Truncation strategy applied to the response"
+      },
+      "max_output_tokens": {
+        "type": "integer",
+        "description": "(Optional) Upper bound for response tokens generation"
       }
     },
     "additionalProperties": false,

docs/static/stainless-llama-stack-spec.yaml

Lines changed: 10 additions & 0 deletions

@@ -7105,6 +7105,10 @@ components:
       type: string
       description: >-
         (Optional) Truncation strategy applied to the response
+    max_output_tokens:
+      type: integer
+      description: >-
+        (Optional) Upper bound for response tokens generation
     input:
       type: array
       items:
@@ -7459,6 +7463,8 @@ components:
         (Optional) Additional fields to include in the response.
     max_infer_iters:
       type: integer
+    max_output_tokens:
+      type: integer
   additionalProperties: false
   required:
     - input
@@ -7522,6 +7528,10 @@ components:
       type: string
      description: >-
         (Optional) Truncation strategy applied to the response
+    max_output_tokens:
+      type: integer
+      description: >-
+        (Optional) Upper bound for response tokens generation
   additionalProperties: false
   required:
     - created_at

llama_stack/apis/agents/agents.py

Lines changed: 1 addition & 0 deletions

@@ -825,6 +825,7 @@ async def create_openai_response(
                 "List of shields to apply during response generation. Shields provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
    ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
        """Create a new OpenAI response.

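On the Python side, the new keyword flows through create_openai_response. A minimal sketch of a caller, where agents_impl stands in for whatever concrete Agents implementation the stack wires up at runtime and the model id is a placeholder:

```python
# Sketch: calling the extended method from async code. `agents_impl` is a
# stand-in for a concrete Agents implementation; the model id is a placeholder.
async def demo(agents_impl):
    response = await agents_impl.create_openai_response(
        input="Write a haiku about token limits.",
        model="llama3.2:3b",
        max_output_tokens=50,  # upper bound on generated output tokens
    )
    print(response.max_output_tokens)
```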