Delay llm calls #222

Draft · wants to merge 4 commits into main
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -35,8 +35,7 @@ dependencies = [
     "google-genai",
     "opentelemetry-instrumentation-google-genai>=0.2b0",
     "tensorzero>=2025.4.7",
-    "google-genai",
-    "opentelemetry-instrumentation-google-genai>=0.2b0",
+    "deprecated"
 ]

 [project.optional-dependencies]
2 changes: 2 additions & 0 deletions src/mcp_agent/agents/base_agent.py
@@ -106,6 +106,8 @@ def __init__(
         # Initialize the LLM to None (will be set by attach_llm)
         self._llm: Optional[AugmentedLLMProtocol] = None

+        self._last_call_timestamp: float | None = None
+
         # Map function names to tools
         self._function_tool_map: Dict[str, Any] = {}
8 changes: 8 additions & 0 deletions src/mcp_agent/core/request_params.py
@@ -52,3 +52,11 @@ class RequestParams(CreateMessageRequestParams):
     """
     Optional dictionary of template variables for dynamic templates. Currently only works for TensorZero inference backend
     """
+
+    delay_between_calls: float | None = None
+    """
+    Optional delay between LLM calls, in seconds. Useful for rate limiting, and for working with tool calls that have delayed effects.
+
+    Examples where this helps are tools with asynchronous effects, such as sending emails or driving a web browser: a browser
+    tool may report completion before all Ajax requests have finished, causing problems if the LLM continues processing too quickly.
+    """
27 changes: 26 additions & 1 deletion src/mcp_agent/llm/augmented_llm.py
@@ -1,3 +1,5 @@
+import asyncio
+import time
 from abc import abstractmethod
 from typing import (
     TYPE_CHECKING,
@@ -158,6 +160,8 @@ def __init__(
         # Initialize default parameters
         self.default_request_params = self._initialize_default_params(kwargs)

+        self._last_call_timestamp: float = 0.0
+
         # Apply model override if provided
         if model:
             self.default_request_params.model = model
@@ -171,6 +175,7 @@ def __init__(
         self.type_converter = type_converter
         self.verb = kwargs.get("verb")

+
     def _initialize_default_params(self, kwargs: dict) -> RequestParams:
         """Initialize default parameters for the LLM.
         Should be overridden by provider implementations to set provider-specific defaults."""
@@ -195,6 +200,9 @@ async def generate(
         # We never expect this for structured() calls - this is for interactive use - developers
         # can do this programatically
         # TODO -- create a "fast-agent" control role rather than magic strings
+
+        final_params = self.get_request_params(request_params)
+        await self._apply_delay(final_params)

         if multipart_messages[-1].first_text().startswith("***SAVE_HISTORY"):
             parts: list[str] = multipart_messages[-1].first_text().split(" ", 1)
@@ -246,9 +254,12 @@ async def structured(
     ) -> Tuple[ModelT | None, PromptMessageMultipart]:
         """Return a structured response from the LLM using the provided messages."""

+        final_params = self.get_request_params(request_params)
+        await self._apply_delay(final_params)
+
         self._precall(multipart_messages)
         result, assistant_response = await self._apply_prompt_provider_specific_structured(
-            multipart_messages, model, request_params
+            multipart_messages, model, final_params
         )

         self._message_history.append(assistant_response)
@@ -337,6 +348,20 @@ def _precall(self, multipart_messages: List[PromptMessageMultipart]) -> None:
             chat_turn=self.chat_turn(),
         )

+    async def _apply_delay(self, request_params: RequestParams) -> None:
+        """Checks and applies a delay if configured in request_params."""
+        if request_params.delay_between_calls and self._last_call_timestamp > 0:
+            required_delay = request_params.delay_between_calls
+            time_since_last_call = time.monotonic() - self._last_call_timestamp
+
+            if time_since_last_call < required_delay:
+                wait_time = required_delay - time_since_last_call
+                self.logger.debug(f"Applying delay: waiting for {wait_time:.2f} seconds.")
+                await asyncio.sleep(wait_time)
+
+        # Always update the timestamp for the next call
+        self._last_call_timestamp = time.monotonic()
+
     def chat_turn(self) -> int:
         """Return the current chat turn number"""
         return 1 + sum(1 for message in self._message_history if message.role == "assistant")
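
To make the timing behavior concrete, here is a self-contained sketch of the same monotonic-clock throttle that `_apply_delay` implements. The `Throttle` class and its names are illustrative, not part of the PR:

```python
import asyncio
import time


class Throttle:
    """Enforce a minimum interval between calls, mirroring _apply_delay."""

    def __init__(self, min_interval: float) -> None:
        self.min_interval = min_interval
        self._last = 0.0  # 0.0 means "no call made yet", matching the diff

    async def wait(self) -> None:
        # Sleep only if a previous call exists and happened too recently.
        if self._last > 0:
            elapsed = time.monotonic() - self._last
            if elapsed < self.min_interval:
                await asyncio.sleep(self.min_interval - elapsed)
        # Always record this call's time for the next check.
        self._last = time.monotonic()


async def main() -> None:
    throttle = Throttle(min_interval=1.0)
    for i in range(3):
        await throttle.wait()  # first call passes immediately; later calls sleep
        print(f"call {i} at t={time.monotonic():.2f}")


if __name__ == "__main__":
    asyncio.run(main())
```

`time.monotonic()` is the right clock here: unlike `time.time()`, it cannot jump with system clock adjustments, so the enforced interval stays accurate.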
4 changes: 4 additions & 0 deletions src/mcp_agent/llm/providers/google_converter.py
@@ -166,6 +166,10 @@ def convert_from_google_content(
         fast_agent_parts: List[
             TextContent | ImageContent | EmbeddedResource | CallToolRequestParams
         ] = []
+
+        if content is None or not hasattr(content, "parts") or content.parts is None:
+            return []  # Google response content can be None or lack parts; nothing to extract.
+
         for part in content.parts:
             if part.text:
                 fast_agent_parts.append(TextContent(type="text", text=part.text))
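
This guard covers responses where the Google API returns a candidate whose `content` is `None` or carries no `parts`, which can happen for empty or blocked responses. A self-contained illustration of the same defensive pattern; `extract_texts` and the `SimpleNamespace` stand-ins are hypothetical, not the library's types:

```python
from types import SimpleNamespace


def extract_texts(content) -> list[str]:
    # Same three-way check as the diff: a missing object, a missing
    # attribute, or parts set to None all yield an empty result.
    if content is None or not hasattr(content, "parts") or content.parts is None:
        return []
    return [part.text for part in content.parts if part.text]


print(extract_texts(None))                                                 # []
print(extract_texts(SimpleNamespace(parts=None)))                          # []
print(extract_texts(SimpleNamespace(parts=[SimpleNamespace(text="hi")])))  # ['hi']
```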