diff --git a/kiro/config.py b/kiro/config.py index 9f1f6ce0..3014ae2d 100644 --- a/kiro/config.py +++ b/kiro/config.py @@ -416,6 +416,13 @@ def _warn_timeout_configuration(): # Default: 4000 tokens FAKE_REASONING_MAX_TOKENS: int = int(os.getenv("FAKE_REASONING_MAX_TOKENS", "4000")) +# Maximum budget cap for fake reasoning when client sends thinking.budget_tokens. +# Fake reasoning uses output tokens (not separate thinking tokens), so large budgets +# can cause the model to spend all output tokens on reasoning with nothing left for actual content. +# This caps the client's budget_tokens to prevent that. Set to 0 to disable capping. +# Default: 10000 tokens +FAKE_REASONING_BUDGET_CAP: int = int(os.getenv("FAKE_REASONING_BUDGET_CAP", "10000")) + # How to handle the thinking block in responses: # - "as_reasoning_content": Extract to reasoning_content field (OpenAI-compatible, recommended) # - "remove": Remove thinking block completely, return only final answer diff --git a/kiro/converters_core.py b/kiro/converters_core.py index 21cf758b..4a053a09 100644 --- a/kiro/converters_core.py +++ b/kiro/converters_core.py @@ -40,6 +40,7 @@ TOOL_DESCRIPTION_MAX_LENGTH, FAKE_REASONING_ENABLED, FAKE_REASONING_MAX_TOKENS, + FAKE_REASONING_BUDGET_CAP, ) @@ -325,7 +326,7 @@ def get_truncation_recovery_system_addition() -> str: ) -def inject_thinking_tags(content: str) -> str: +def inject_thinking_tags(content: str, max_tokens: Optional[int] = None) -> str: """ Inject fake reasoning tags into content. @@ -335,6 +336,8 @@ def inject_thinking_tags(content: str) -> str: Args: content: Original content string + max_tokens: Override for max thinking tokens (from client request). + If None, uses FAKE_REASONING_MAX_TOKENS from config. Returns: Content with thinking tags prepended (if enabled) or original content @@ -342,6 +345,12 @@ def inject_thinking_tags(content: str) -> str: if not FAKE_REASONING_ENABLED: return content + effective_max_tokens = max_tokens if max_tokens is not None else FAKE_REASONING_MAX_TOKENS + # Cap client budget to prevent fake reasoning from consuming all output tokens + if FAKE_REASONING_BUDGET_CAP > 0 and effective_max_tokens > FAKE_REASONING_BUDGET_CAP: + logger.debug(f"Capping fake reasoning budget from {effective_max_tokens} to {FAKE_REASONING_BUDGET_CAP}") + effective_max_tokens = FAKE_REASONING_BUDGET_CAP + # Thinking instruction to improve reasoning quality thinking_instruction = ( "Think in English for better reasoning quality.\n\n" @@ -357,11 +366,11 @@ def inject_thinking_tags(content: str) -> str: thinking_prefix = ( f"enabled\n" - f"{FAKE_REASONING_MAX_TOKENS}\n" + f"{effective_max_tokens}\n" f"{thinking_instruction}\n\n" ) - logger.debug(f"Injecting fake reasoning tags with max_tokens={FAKE_REASONING_MAX_TOKENS}") + logger.debug(f"Injecting fake reasoning tags with max_tokens={effective_max_tokens}") return thinking_prefix + content @@ -1344,7 +1353,8 @@ def build_kiro_payload( tools: Optional[List[UnifiedTool]], conversation_id: str, profile_arn: str, - inject_thinking: bool = True + inject_thinking: bool = True, + thinking_budget: Optional[int] = None ) -> KiroPayloadResult: """ Builds complete payload for Kiro API from unified data. @@ -1483,7 +1493,7 @@ def build_kiro_payload( # Inject thinking tags if enabled (only for the current/last user message) if inject_thinking and current_message.role == "user": - current_content = inject_thinking_tags(current_content) + current_content = inject_thinking_tags(current_content, max_tokens=thinking_budget) # Build userInputMessage user_input_message = { diff --git a/kiro/converters_openai.py b/kiro/converters_openai.py index aad3b83c..c6eaf273 100644 --- a/kiro/converters_openai.py +++ b/kiro/converters_openai.py @@ -334,6 +334,13 @@ def build_kiro_payload( f"system_prompt_length={len(system_prompt)}" ) + # Extract thinking budget from request if provided + thinking_budget = None + if request_data.thinking and isinstance(request_data.thinking, dict): + thinking_budget = request_data.thinking.get("budget_tokens") + if thinking_budget: + logger.debug(f"Client requested thinking budget: {thinking_budget}") + # Use core function to build payload result = core_build_kiro_payload( messages=unified_messages, @@ -342,7 +349,8 @@ def build_kiro_payload( tools=unified_tools, conversation_id=conversation_id, profile_arn=profile_arn, - inject_thinking=True + inject_thinking=True, + thinking_budget=thinking_budget ) return result.payload \ No newline at end of file diff --git a/kiro/models_openai.py b/kiro/models_openai.py index 46167c1b..cf8e1491 100644 --- a/kiro/models_openai.py +++ b/kiro/models_openai.py @@ -164,6 +164,9 @@ class ChatCompletionRequest(BaseModel): presence_penalty: Optional[float] = None frequency_penalty: Optional[float] = None + # Extended thinking (OpenAI-compatible) + thinking: Optional[Dict[str, Any]] = None + # Tools (function calling) tools: Optional[List[Tool]] = None tool_choice: Optional[Union[str, Dict]] = None diff --git a/tests/unit/test_converters_core.py b/tests/unit/test_converters_core.py index c042dd8b..f800a183 100644 --- a/tests/unit/test_converters_core.py +++ b/tests/unit/test_converters_core.py @@ -3576,7 +3576,8 @@ def test_uses_configured_max_tokens(self): print("Action: Inject thinking tags with FAKE_REASONING_MAX_TOKENS=16000...") with patch('kiro.converters_core.FAKE_REASONING_ENABLED', True): with patch('kiro.converters_core.FAKE_REASONING_MAX_TOKENS', 16000): - result = inject_thinking_tags(content) + with patch('kiro.converters_core.FAKE_REASONING_BUDGET_CAP', 0): + result = inject_thinking_tags(content) print(f"Result: {result[:300]}...") print("Checking that max_thinking_length uses configured value...")