diff --git a/kiro/config.py b/kiro/config.py
index 9f1f6ce0..3014ae2d 100644
--- a/kiro/config.py
+++ b/kiro/config.py
@@ -416,6 +416,13 @@ def _warn_timeout_configuration():
# Default: 4000 tokens
FAKE_REASONING_MAX_TOKENS: int = int(os.getenv("FAKE_REASONING_MAX_TOKENS", "4000"))
+# Maximum budget cap for fake reasoning when client sends thinking.budget_tokens.
+# Fake reasoning uses output tokens (not separate thinking tokens), so large budgets
+# can cause the model to spend all output tokens on reasoning with nothing left for actual content.
+# This caps the client's budget_tokens to prevent that. Set to 0 to disable capping.
+# Default: 10000 tokens
+FAKE_REASONING_BUDGET_CAP: int = int(os.getenv("FAKE_REASONING_BUDGET_CAP", "10000"))
+
# How to handle the thinking block in responses:
# - "as_reasoning_content": Extract to reasoning_content field (OpenAI-compatible, recommended)
# - "remove": Remove thinking block completely, return only final answer
diff --git a/kiro/converters_core.py b/kiro/converters_core.py
index 21cf758b..4a053a09 100644
--- a/kiro/converters_core.py
+++ b/kiro/converters_core.py
@@ -40,6 +40,7 @@
TOOL_DESCRIPTION_MAX_LENGTH,
FAKE_REASONING_ENABLED,
FAKE_REASONING_MAX_TOKENS,
+ FAKE_REASONING_BUDGET_CAP,
)
@@ -325,7 +326,7 @@ def get_truncation_recovery_system_addition() -> str:
)
-def inject_thinking_tags(content: str) -> str:
+def inject_thinking_tags(content: str, max_tokens: Optional[int] = None) -> str:
"""
Inject fake reasoning tags into content.
@@ -335,6 +336,8 @@ def inject_thinking_tags(content: str) -> str:
Args:
content: Original content string
+ max_tokens: Override for max thinking tokens (from client request).
+ If None, uses FAKE_REASONING_MAX_TOKENS from config.
Returns:
Content with thinking tags prepended (if enabled) or original content
@@ -342,6 +345,12 @@ def inject_thinking_tags(content: str) -> str:
if not FAKE_REASONING_ENABLED:
return content
+ effective_max_tokens = max_tokens if max_tokens is not None else FAKE_REASONING_MAX_TOKENS
+ # Cap client budget to prevent fake reasoning from consuming all output tokens
+ if FAKE_REASONING_BUDGET_CAP > 0 and effective_max_tokens > FAKE_REASONING_BUDGET_CAP:
+ logger.debug(f"Capping fake reasoning budget from {effective_max_tokens} to {FAKE_REASONING_BUDGET_CAP}")
+ effective_max_tokens = FAKE_REASONING_BUDGET_CAP
+
# Thinking instruction to improve reasoning quality
thinking_instruction = (
"Think in English for better reasoning quality.\n\n"
@@ -357,11 +366,11 @@ def inject_thinking_tags(content: str) -> str:
thinking_prefix = (
f"enabled\n"
- f"{FAKE_REASONING_MAX_TOKENS}\n"
+ f"{effective_max_tokens}\n"
f"{thinking_instruction}\n\n"
)
- logger.debug(f"Injecting fake reasoning tags with max_tokens={FAKE_REASONING_MAX_TOKENS}")
+ logger.debug(f"Injecting fake reasoning tags with max_tokens={effective_max_tokens}")
return thinking_prefix + content
@@ -1344,7 +1353,8 @@ def build_kiro_payload(
tools: Optional[List[UnifiedTool]],
conversation_id: str,
profile_arn: str,
- inject_thinking: bool = True
+ inject_thinking: bool = True,
+ thinking_budget: Optional[int] = None
) -> KiroPayloadResult:
"""
Builds complete payload for Kiro API from unified data.
@@ -1483,7 +1493,7 @@ def build_kiro_payload(
# Inject thinking tags if enabled (only for the current/last user message)
if inject_thinking and current_message.role == "user":
- current_content = inject_thinking_tags(current_content)
+ current_content = inject_thinking_tags(current_content, max_tokens=thinking_budget)
# Build userInputMessage
user_input_message = {
diff --git a/kiro/converters_openai.py b/kiro/converters_openai.py
index aad3b83c..c6eaf273 100644
--- a/kiro/converters_openai.py
+++ b/kiro/converters_openai.py
@@ -334,6 +334,13 @@ def build_kiro_payload(
f"system_prompt_length={len(system_prompt)}"
)
+ # Extract thinking budget from request if provided
+ thinking_budget = None
+ if request_data.thinking and isinstance(request_data.thinking, dict):
+ thinking_budget = request_data.thinking.get("budget_tokens")
+ if thinking_budget:
+ logger.debug(f"Client requested thinking budget: {thinking_budget}")
+
# Use core function to build payload
result = core_build_kiro_payload(
messages=unified_messages,
@@ -342,7 +349,8 @@ def build_kiro_payload(
tools=unified_tools,
conversation_id=conversation_id,
profile_arn=profile_arn,
- inject_thinking=True
+ inject_thinking=True,
+ thinking_budget=thinking_budget
)
return result.payload
\ No newline at end of file
diff --git a/kiro/models_openai.py b/kiro/models_openai.py
index 46167c1b..cf8e1491 100644
--- a/kiro/models_openai.py
+++ b/kiro/models_openai.py
@@ -164,6 +164,9 @@ class ChatCompletionRequest(BaseModel):
presence_penalty: Optional[float] = None
frequency_penalty: Optional[float] = None
+    # Extended thinking (Anthropic-style "thinking" param, e.g. {"budget_tokens": N}, accepted on the OpenAI-compatible endpoint)
+ thinking: Optional[Dict[str, Any]] = None
+
# Tools (function calling)
tools: Optional[List[Tool]] = None
tool_choice: Optional[Union[str, Dict]] = None
diff --git a/tests/unit/test_converters_core.py b/tests/unit/test_converters_core.py
index c042dd8b..f800a183 100644
--- a/tests/unit/test_converters_core.py
+++ b/tests/unit/test_converters_core.py
@@ -3576,7 +3576,8 @@ def test_uses_configured_max_tokens(self):
print("Action: Inject thinking tags with FAKE_REASONING_MAX_TOKENS=16000...")
with patch('kiro.converters_core.FAKE_REASONING_ENABLED', True):
with patch('kiro.converters_core.FAKE_REASONING_MAX_TOKENS', 16000):
- result = inject_thinking_tags(content)
+ with patch('kiro.converters_core.FAKE_REASONING_BUDGET_CAP', 0):
+ result = inject_thinking_tags(content)
print(f"Result: {result[:300]}...")
print("Checking that max_thinking_length uses configured value...")