fix: collapse multiplicative 429 retry explosion (45→15)

Cell · Cell · commit 5acff18c958e · 2026-05-03T11:42:39.000-07:00
When a RetryingAsyncClient is passed as http_client to the OpenAI SDK,
the SDK's built-in retries (max_retries=2, i.e. up to 3 attempts) and
our own HTTP-level retries (max_retries=5) each retry 429s
independently. Nested inside the streaming retry layer (3 attempts),
these layers multiply to produce up to 45 retries per rate-limit event.

Add a disable_openai_sdk_retries() helper. When it detects a
RetryingAsyncClient, it constructs an AsyncOpenAI(max_retries=0) around
that client and returns it as the openai_client provider kwarg,
collapsing the 3-layer nesting to 2:

  Before: 3 streaming × 3 SDK × 5 HTTP = 45 retries
  After:  3 streaming × 5 HTTP = 15 retries

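Applies to the custom_openai, cerebras, and copilot_auth providers.
If AsyncOpenAI construction fails (e.g. missing api_key), the helper
falls back to http_client mode: it returns the original client plus the
supplied kwargs as separate keys, so SDK retries stay enabled in that
case. Expected failures (TypeError, ValueError, OpenAIError) emit a
warning; a missing openai package falls back silently, and any other
error is logged at debug level.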
diff --git a/code_puppy/http_utils.py b/code_puppy/http_utils.py
@@ -5,6 +5,7 @@
 """
 
 import asyncio
+import logging
 import os
 import socket
 import time
@@ -17,6 +18,8 @@
     import requests
 from code_puppy.config import get_http2
 
+logger = logging.getLogger(__name__)
+
 
 @dataclass
 class ProxyConfig:
@@ -340,6 +343,63 @@ def create_reopenable_async_client(
             return httpx.AsyncClient(**base_kwargs)
 
 
+def disable_openai_sdk_retries(
+    http_client: httpx.AsyncClient,
+    **openai_kwargs: Any,
+) -> dict:
+    """When a RetryingAsyncClient is used as http_client for the OpenAI SDK,
+    disable the SDK's own retries to avoid multiplicative retry explosion.
+
+    The OpenAI SDK defaults to max_retries=2, and RetryingAsyncClient has 5.
+    Together with 3 streaming retries, a 429 can trigger up to
+    3 x 3 x 5 = 45 retries. Disabling SDK retries caps this at 3 x 5 = 15.
+
+    Returns provider kwargs. If the client is NOT a RetryingAsyncClient,
+    returns {"http_client": client} (+ any openai_kwargs as separate keys).
+    If it IS a RetryingAsyncClient, returns {"openai_client": AsyncOpenAI(...)}
+    with max_retries=0 and the provided openai_kwargs folded in.
+    Falls back to {"http_client": client} if AsyncOpenAI construction fails
+    (e.g. missing api_key).
+
+    Args:
+        http_client: The httpx client (possibly RetryingAsyncClient).
+        **openai_kwargs: Extra kwargs for AsyncOpenAI (api_key, base_url, etc).
+            Only used when creating an openai_client to bypass SDK retries.
+    """
+    if isinstance(http_client, RetryingAsyncClient):
+        try:
+            from openai import AsyncOpenAI
+
+            openai_client = AsyncOpenAI(
+                http_client=http_client,
+                max_retries=0,
+                **openai_kwargs,
+            )
+            return {"openai_client": openai_client}
+        except ImportError:
+            # openai package not installed; fall through
+            pass
+        except Exception as exc:
+            # Missing api_key (OpenAIError), wrong kwargs (TypeError),
+            # or other construction failures — fall back gracefully.
+            try:
+                from openai import OpenAIError as _OpenAIError
+
+                _warnable = (TypeError, ValueError, _OpenAIError)
+            except ImportError:
+                _warnable = (TypeError, ValueError)
+            if isinstance(exc, _warnable):
+                emit_warning(
+                    f"Could not disable OpenAI SDK retries ({exc}). "
+                    f"Falling back to http_client mode — multiplicative retries possible."
+                )
+            else:  # pragma: no cover
+                logger.debug("Unexpected error disabling OpenAI SDK retries: %s", exc)
+    result = {"http_client": http_client}
+    result.update(openai_kwargs)
+    return result
+
+
 def is_cert_bundle_available() -> bool:
     cert_path = get_cert_bundle_path()
     if cert_path is None:
diff --git a/code_puppy/model_factory.py b/code_puppy/model_factory.py
@@ -25,7 +25,12 @@
 from . import callbacks
 from .claude_cache_client import ClaudeCacheAsyncClient, patch_anthropic_client_messages
 from .config import EXTRA_MODELS_FILE, get_value, get_yolo_mode
-from .http_utils import create_async_client, get_cert_bundle_path, get_http2
+from .http_utils import (
+    create_async_client,
+    disable_openai_sdk_retries,
+    get_cert_bundle_path,
+    get_http2,
+)
 from .provider_identity import (
     make_anthropic_provider,
     make_openai_provider,
@@ -700,11 +705,20 @@ def get_model(model_name: str, config: Dict[str, Any]) -> Any:
                 verify=verify,
                 timeout=timeout if timeout is not None else 180,
             )
-            provider_args = {"base_url": url}
             if isinstance(client, httpx.AsyncClient):
-                provider_args["http_client"] = client
-            if api_key:
-                provider_args["api_key"] = api_key
+                # Disable OpenAI SDK retries when using our own
+                # RetryingAsyncClient to avoid multiplicative retry explosion
+                # (3 streaming x 3 SDK x 5 HTTP = 45 retries on 429)
+                openai_kwargs = {}
+                if url:
+                    openai_kwargs["base_url"] = url
+                if api_key:
+                    openai_kwargs["api_key"] = api_key
+                provider_args = disable_openai_sdk_retries(client, **openai_kwargs)
+            else:
+                provider_args = {"base_url": url}
+                if api_key:
+                    provider_args["api_key"] = api_key
             provider = make_openai_provider(provider_identity, **provider_args)
             model = OpenAIChatModel(model_name=model_config["name"], provider=provider)
             if model_name == "chatgpt-gpt-5-codex":
@@ -791,10 +805,10 @@ def model_profile(self, model_name: str) -> ModelProfile | None:
                 model_name="cerebras",
                 timeout=timeout if timeout is not None else 180,
             )
-            provider_args = dict(
-                api_key=api_key,
-                http_client=client,
-            )
+            # Disable OpenAI SDK retries when using our own
+            # RetryingAsyncClient to avoid multiplicative retry explosion
+            # (3 streaming x 3 SDK x 5 HTTP = 45 retries on 429)
+            provider_args = disable_openai_sdk_retries(client, api_key=api_key)
             provider = ZaiCerebrasProvider(**provider_args)
 
             return OpenAIChatModel(model_name=model_config["name"], provider=provider)
diff --git a/code_puppy/plugins/copilot_auth/register_callbacks.py b/code_puppy/plugins/copilot_auth/register_callbacks.py
@@ -358,7 +358,7 @@ def _create_copilot_model(model_name: str, model_config: Dict, config: Dict) ->
     from pydantic_ai.models.openai import OpenAIChatModel
     from pydantic_ai.providers.openai import OpenAIProvider
 
-    from code_puppy.http_utils import create_async_client
+    from code_puppy.http_utils import create_async_client, disable_openai_sdk_retries
 
     # Discover token — match against the host stored in the model config
     host = model_config.get("copilot_host", "github.com")
@@ -414,12 +414,14 @@ def auth_flow(self, request: httpx.Request):
         if config_url:
             base_url = config_url
 
-    # Use a placeholder API key — the actual token is injected by _CopilotAuth
-    provider = OpenAIProvider(
+    # Disable OpenAI SDK retries when using our own RetryingAsyncClient
+    # to avoid multiplicative retry explosion (3 streaming x 3 SDK x 5 HTTP = 45)
+    provider_kwargs = disable_openai_sdk_retries(
+        client,
         api_key="copilot-session-managed",
         base_url=base_url,
-        http_client=client,
     )
+    provider = OpenAIProvider(**provider_kwargs)
 
     # Build a model profile that tells pydantic-ai how to handle thinking.
     # Claude models behind the Copilot API return thinking in a custom field
diff --git a/tests/test_http_utils.py b/tests/test_http_utils.py
@@ -12,6 +12,8 @@
 import os
 from unittest.mock import patch
 
+import httpx
+
 from code_puppy.http_utils import ProxyConfig
 
 
@@ -370,3 +372,60 @@ def test_find_available_port_multiple_calls(self):
         # Both should be valid ports
         assert isinstance(port1, int) and isinstance(port2, int)
         assert port1 > 0 and port2 > 0
+
+
+class TestDisableOpenAISdkRetries:
+    """Test disable_openai_sdk_retries helper."""
+
+    def test_plain_client_returns_http_client(self):
+        """Plain httpx.AsyncClient should just pass through."""
+        from code_puppy.http_utils import disable_openai_sdk_retries
+
+        client = httpx.AsyncClient()
+        result = disable_openai_sdk_retries(client)
+        assert result == {"http_client": client}
+
+    def test_plain_client_passes_openai_kwargs(self):
+        """openai_kwargs should be added as separate keys for plain clients."""
+        from code_puppy.http_utils import disable_openai_sdk_retries
+
+        client = httpx.AsyncClient()
+        result = disable_openai_sdk_retries(
+            client, api_key="test-key", base_url="https://example.com"
+        )
+        assert result["http_client"] is client
+        assert result["api_key"] == "test-key"
+        assert result["base_url"] == "https://example.com"
+
+    def test_retrying_client_creates_openai_client(self):
+        """RetryingAsyncClient should produce an openai_client with max_retries=0."""
+        from code_puppy.http_utils import (
+            RetryingAsyncClient,
+            disable_openai_sdk_retries,
+        )
+
+        client = RetryingAsyncClient(max_retries=5)
+        result = disable_openai_sdk_retries(
+            client, api_key="test-key", base_url="https://example.com"
+        )
+        assert "openai_client" in result
+        assert "http_client" not in result  # replaced by openai_client
+        assert result["openai_client"].max_retries == 0
+
+    def test_retrying_client_falls_back_on_missing_api_key(self):
+        """If AsyncOpenAI creation fails, fall back to http_client."""
+        from code_puppy.http_utils import (
+            RetryingAsyncClient,
+            disable_openai_sdk_retries,
+        )
+
+        client = RetryingAsyncClient(max_retries=5)
+        # No api_key and no OPENAI_API_KEY env var → AsyncOpenAI will fail
+        with patch.dict(os.environ, {}, clear=True):
+            with patch("code_puppy.http_utils.emit_warning") as mock_warn:
+                result = disable_openai_sdk_retries(client)
+        assert "http_client" in result
+        assert result["http_client"] is client
+        # Should have warned about falling back
+        mock_warn.assert_called_once()
+        assert "multiplicative" in mock_warn.call_args[0][0].lower()