Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions kiro/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,13 @@ def _warn_timeout_configuration():
# Default: 4000 tokens
FAKE_REASONING_MAX_TOKENS: int = int(os.getenv("FAKE_REASONING_MAX_TOKENS", "4000"))

# Maximum budget cap for fake reasoning when client sends thinking.budget_tokens.
# Fake reasoning uses output tokens (not separate thinking tokens), so large budgets
# can cause the model to spend all output tokens on reasoning with nothing left for actual content.
# This caps the client's budget_tokens to prevent that. Set to 0 to disable capping.
# Default: 10000 tokens
FAKE_REASONING_BUDGET_CAP: int = int(os.getenv("FAKE_REASONING_BUDGET_CAP", "10000"))

# How to handle the thinking block in responses:
# - "as_reasoning_content": Extract to reasoning_content field (OpenAI-compatible, recommended)
# - "remove": Remove thinking block completely, return only final answer
Expand Down
20 changes: 15 additions & 5 deletions kiro/converters_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
TOOL_DESCRIPTION_MAX_LENGTH,
FAKE_REASONING_ENABLED,
FAKE_REASONING_MAX_TOKENS,
FAKE_REASONING_BUDGET_CAP,
)


Expand Down Expand Up @@ -325,7 +326,7 @@ def get_truncation_recovery_system_addition() -> str:
)


def inject_thinking_tags(content: str) -> str:
def inject_thinking_tags(content: str, max_tokens: Optional[int] = None) -> str:
"""
Inject fake reasoning tags into content.

Expand All @@ -335,13 +336,21 @@ def inject_thinking_tags(content: str) -> str:

Args:
content: Original content string
max_tokens: Override for max thinking tokens (from client request).
If None, uses FAKE_REASONING_MAX_TOKENS from config.

Returns:
Content with thinking tags prepended (if enabled) or original content
"""
if not FAKE_REASONING_ENABLED:
return content

effective_max_tokens = max_tokens if max_tokens is not None else FAKE_REASONING_MAX_TOKENS
# Cap client budget to prevent fake reasoning from consuming all output tokens
if FAKE_REASONING_BUDGET_CAP > 0 and effective_max_tokens > FAKE_REASONING_BUDGET_CAP:
logger.debug(f"Capping fake reasoning budget from {effective_max_tokens} to {FAKE_REASONING_BUDGET_CAP}")
effective_max_tokens = FAKE_REASONING_BUDGET_CAP

# Thinking instruction to improve reasoning quality
thinking_instruction = (
"Think in English for better reasoning quality.\n\n"
Expand All @@ -357,11 +366,11 @@ def inject_thinking_tags(content: str) -> str:

thinking_prefix = (
f"<thinking_mode>enabled</thinking_mode>\n"
f"<max_thinking_length>{FAKE_REASONING_MAX_TOKENS}</max_thinking_length>\n"
f"<max_thinking_length>{effective_max_tokens}</max_thinking_length>\n"
f"<thinking_instruction>{thinking_instruction}</thinking_instruction>\n\n"
)

logger.debug(f"Injecting fake reasoning tags with max_tokens={FAKE_REASONING_MAX_TOKENS}")
logger.debug(f"Injecting fake reasoning tags with max_tokens={effective_max_tokens}")

return thinking_prefix + content

Expand Down Expand Up @@ -1344,7 +1353,8 @@ def build_kiro_payload(
tools: Optional[List[UnifiedTool]],
conversation_id: str,
profile_arn: str,
inject_thinking: bool = True
inject_thinking: bool = True,
thinking_budget: Optional[int] = None
) -> KiroPayloadResult:
"""
Builds complete payload for Kiro API from unified data.
Expand Down Expand Up @@ -1483,7 +1493,7 @@ def build_kiro_payload(

# Inject thinking tags if enabled (only for the current/last user message)
if inject_thinking and current_message.role == "user":
current_content = inject_thinking_tags(current_content)
current_content = inject_thinking_tags(current_content, max_tokens=thinking_budget)

# Build userInputMessage
user_input_message = {
Expand Down
10 changes: 9 additions & 1 deletion kiro/converters_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,13 @@ def build_kiro_payload(
f"system_prompt_length={len(system_prompt)}"
)

# Extract thinking budget from request if provided
thinking_budget = None
if request_data.thinking and isinstance(request_data.thinking, dict):
thinking_budget = request_data.thinking.get("budget_tokens")
if thinking_budget:
logger.debug(f"Client requested thinking budget: {thinking_budget}")

# Use core function to build payload
result = core_build_kiro_payload(
messages=unified_messages,
Expand All @@ -342,7 +349,8 @@ def build_kiro_payload(
tools=unified_tools,
conversation_id=conversation_id,
profile_arn=profile_arn,
inject_thinking=True
inject_thinking=True,
thinking_budget=thinking_budget
)

return result.payload
3 changes: 3 additions & 0 deletions kiro/models_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ class ChatCompletionRequest(BaseModel):
presence_penalty: Optional[float] = None
frequency_penalty: Optional[float] = None

# Extended thinking (Anthropic-style `{"budget_tokens": ...}` dict, accepted on the OpenAI-compatible endpoint)
thinking: Optional[Dict[str, Any]] = None

# Tools (function calling)
tools: Optional[List[Tool]] = None
tool_choice: Optional[Union[str, Dict]] = None
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_converters_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3576,7 +3576,8 @@ def test_uses_configured_max_tokens(self):
print("Action: Inject thinking tags with FAKE_REASONING_MAX_TOKENS=16000...")
with patch('kiro.converters_core.FAKE_REASONING_ENABLED', True):
with patch('kiro.converters_core.FAKE_REASONING_MAX_TOKENS', 16000):
result = inject_thinking_tags(content)
with patch('kiro.converters_core.FAKE_REASONING_BUDGET_CAP', 0):
result = inject_thinking_tags(content)

print(f"Result: {result[:300]}...")
print("Checking that max_thinking_length uses configured value...")
Expand Down
Loading