Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions kiro/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,13 @@ def _warn_timeout_configuration():
# Default: 4000 tokens
FAKE_REASONING_MAX_TOKENS: int = int(os.getenv("FAKE_REASONING_MAX_TOKENS", "4000"))

# Maximum budget cap for fake reasoning when client sends thinking.budget_tokens.
# Fake reasoning uses output tokens (not separate thinking tokens), so large budgets
# can cause the model to spend all output tokens on reasoning with nothing left for actual content.
# This caps the client's budget_tokens to prevent that. Set to 0 to disable capping.
# Default: 10000 tokens
FAKE_REASONING_BUDGET_CAP: int = int(os.getenv("FAKE_REASONING_BUDGET_CAP", "10000"))

# How to handle the thinking block in responses:
# - "as_reasoning_content": Extract to reasoning_content field (OpenAI-compatible, recommended)
# - "remove": Remove thinking block completely, return only final answer
Expand Down
20 changes: 15 additions & 5 deletions kiro/converters_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
TOOL_DESCRIPTION_MAX_LENGTH,
FAKE_REASONING_ENABLED,
FAKE_REASONING_MAX_TOKENS,
FAKE_REASONING_BUDGET_CAP,
)


Expand Down Expand Up @@ -325,7 +326,7 @@ def get_truncation_recovery_system_addition() -> str:
)


def inject_thinking_tags(content: str) -> str:
def inject_thinking_tags(content: str, max_tokens: Optional[int] = None) -> str:
"""
Inject fake reasoning tags into content.

Expand All @@ -335,13 +336,21 @@ def inject_thinking_tags(content: str) -> str:

Args:
content: Original content string
max_tokens: Override for max thinking tokens (from client request).
If None, uses FAKE_REASONING_MAX_TOKENS from config.

Returns:
Content with thinking tags prepended (if enabled) or original content
"""
if not FAKE_REASONING_ENABLED:
return content

effective_max_tokens = max_tokens if max_tokens is not None else FAKE_REASONING_MAX_TOKENS
# Cap client budget to prevent fake reasoning from consuming all output tokens
if FAKE_REASONING_BUDGET_CAP > 0 and effective_max_tokens > FAKE_REASONING_BUDGET_CAP:
logger.debug(f"Capping fake reasoning budget from {effective_max_tokens} to {FAKE_REASONING_BUDGET_CAP}")
effective_max_tokens = FAKE_REASONING_BUDGET_CAP

# Thinking instruction to improve reasoning quality
thinking_instruction = (
"Think in English for better reasoning quality.\n\n"
Expand All @@ -357,11 +366,11 @@ def inject_thinking_tags(content: str) -> str:

thinking_prefix = (
f"<thinking_mode>enabled</thinking_mode>\n"
f"<max_thinking_length>{FAKE_REASONING_MAX_TOKENS}</max_thinking_length>\n"
f"<max_thinking_length>{effective_max_tokens}</max_thinking_length>\n"
f"<thinking_instruction>{thinking_instruction}</thinking_instruction>\n\n"
)

logger.debug(f"Injecting fake reasoning tags with max_tokens={FAKE_REASONING_MAX_TOKENS}")
logger.debug(f"Injecting fake reasoning tags with max_tokens={effective_max_tokens}")

return thinking_prefix + content

Expand Down Expand Up @@ -1344,7 +1353,8 @@ def build_kiro_payload(
tools: Optional[List[UnifiedTool]],
conversation_id: str,
profile_arn: str,
inject_thinking: bool = True
inject_thinking: bool = True,
thinking_budget: Optional[int] = None
) -> KiroPayloadResult:
"""
Builds complete payload for Kiro API from unified data.
Expand Down Expand Up @@ -1483,7 +1493,7 @@ def build_kiro_payload(

# Inject thinking tags if enabled (only for the current/last user message)
if inject_thinking and current_message.role == "user":
current_content = inject_thinking_tags(current_content)
current_content = inject_thinking_tags(current_content, max_tokens=thinking_budget)

# Build userInputMessage
user_input_message = {
Expand Down
10 changes: 9 additions & 1 deletion kiro/converters_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,13 @@ def build_kiro_payload(
f"system_prompt_length={len(system_prompt)}"
)

# Extract thinking budget from request if provided
thinking_budget = None
if request_data.thinking and isinstance(request_data.thinking, dict):
thinking_budget = request_data.thinking.get("budget_tokens")
if thinking_budget:
logger.debug(f"Client requested thinking budget: {thinking_budget}")

# Use core function to build payload
result = core_build_kiro_payload(
messages=unified_messages,
Expand All @@ -342,7 +349,8 @@ def build_kiro_payload(
tools=unified_tools,
conversation_id=conversation_id,
profile_arn=profile_arn,
inject_thinking=True
inject_thinking=True,
thinking_budget=thinking_budget
)

return result.payload
3 changes: 3 additions & 0 deletions kiro/models_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ class ChatCompletionRequest(BaseModel):
presence_penalty: Optional[float] = None
frequency_penalty: Optional[float] = None

# Extended thinking (Anthropic-style `{"budget_tokens": ...}` dict, accepted on the OpenAI-compatible endpoint)
thinking: Optional[Dict[str, Any]] = None

# Tools (function calling)
tools: Optional[List[Tool]] = None
tool_choice: Optional[Union[str, Dict]] = None
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_converters_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3576,7 +3576,8 @@ def test_uses_configured_max_tokens(self):
print("Action: Inject thinking tags with FAKE_REASONING_MAX_TOKENS=16000...")
with patch('kiro.converters_core.FAKE_REASONING_ENABLED', True):
with patch('kiro.converters_core.FAKE_REASONING_MAX_TOKENS', 16000):
result = inject_thinking_tags(content)
with patch('kiro.converters_core.FAKE_REASONING_BUDGET_CAP', 0):
result = inject_thinking_tags(content)

print(f"Result: {result[:300]}...")
print("Checking that max_thinking_length uses configured value...")
Expand Down
Loading