From 0c372274cdb25094df9e2acc24059d1ece9c90a1 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 19:05:48 -0700 Subject: [PATCH] fix(agent): disable OpenAI SDK auto-retry that double-fires inside the rate-limit loop Same bug class as the Anthropic fix (#26293): the OpenAI/aggregator client is built without max_retries, so the SDK default of 2 applies. The SDK's own 1-2s backoff ignores Retry-After and retries inside hermes's outer conversation loop, burning request slots against a rate-limited bucket. Set max_retries=0 at the single create_openai_client chokepoint (covers init, switch_model, recovery, restore, request-scoped). auxiliary_client builds its own clients and is not wrapped by the loop, so it keeps SDK retries. --- agent/agent_runtime_helpers.py | 9 +++ ...eate_openai_client_disables_sdk_retries.py | 78 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tests/run_agent/test_create_openai_client_disables_sdk_retries.py diff --git a/agent/agent_runtime_helpers.py b/agent/agent_runtime_helpers.py index e52f9dd7642..33d2620bec3 100644 --- a/agent/agent_runtime_helpers.py +++ b/agent/agent_runtime_helpers.py @@ -1452,6 +1452,15 @@ def create_openai_client(agent, client_kwargs: dict, *, reason: str, shared: boo keepalive_http = agent._build_keepalive_http_client(client_kwargs.get("base_url", "")) if keepalive_http is not None: client_kwargs["http_client"] = keepalive_http + # Delegate all rate-limit / 5xx retry to hermes's outer conversation loop, + # which honors Retry-After and applies adaptive/jittered backoff. The OpenAI + # SDK default (max_retries=2) uses its own 1-2s backoff that ignores + # Retry-After and double-retries inside our loop — the same deadlock the + # Anthropic clients hit (#26293). This is the single chokepoint every primary + # OpenAI/aggregator client passes through (init, switch_model, recovery, + # restore, request-scoped); auxiliary_client builds its own clients and keeps + # SDK retries because it is NOT wrapped by the conversation loop. + client_kwargs.setdefault("max_retries", 0) # Uses the module-level `OpenAI` name, resolved lazily on first # access via __getattr__ below. Tests patch via `run_agent.OpenAI`. client = _ra().OpenAI(**client_kwargs) diff --git a/tests/run_agent/test_create_openai_client_disables_sdk_retries.py b/tests/run_agent/test_create_openai_client_disables_sdk_retries.py new file mode 100644 index 00000000000..77335e17484 --- /dev/null +++ b/tests/run_agent/test_create_openai_client_disables_sdk_retries.py @@ -0,0 +1,78 @@ +"""Regression guard: _create_openai_client must disable SDK-level retries. + +#26293: the OpenAI SDK default ``max_retries=2`` uses its own 1-2s backoff that +ignores ``Retry-After`` and double-retries *inside* hermes's outer conversation +loop — burning request slots against a rate-limited bucket that won't refill for +minutes. The outer loop already owns rate-limit backoff (honors Retry-After, +adaptive + jittered), so every primary OpenAI/aggregator client must be built +with ``max_retries=0``. This is the OpenAI-path twin of the Anthropic adapter +fix in tests/agent/test_anthropic_adapter.py. +""" +from unittest.mock import MagicMock, patch + +from run_agent import AIAgent + + +@patch("run_agent.OpenAI") +def test_create_openai_client_disables_sdk_retries(mock_openai): + mock_openai.return_value = MagicMock() + agent = AIAgent( + api_key="test-key", + base_url="https://openrouter.ai/api/v1", + model="test/model", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + + agent._create_openai_client( + {"api_key": "test-key", "base_url": "https://openrouter.ai/api/v1"}, + reason="test", + shared=False, + ) + + # Find the construction call that carries our base_url (AIAgent() also + # builds a client during init; the assertion targets the explicit call). + matching = [ + c for c in mock_openai.call_args_list + if c.kwargs.get("base_url") == "https://openrouter.ai/api/v1" + ] + assert matching, "OpenAI was never constructed with the expected base_url" + for call in matching: + assert call.kwargs.get("max_retries") == 0, ( + "_create_openai_client must set max_retries=0 so the SDK does not " + "double-retry inside the outer rate-limit loop (#26293); got " + f"{call.kwargs.get('max_retries')!r}" + ) + + +@patch("run_agent.OpenAI") +def test_create_openai_client_honors_explicit_max_retries(mock_openai): + """An explicit max_retries in client_kwargs is respected (setdefault, not + clobber) — future callers can opt back into SDK retries if needed.""" + mock_openai.return_value = MagicMock() + agent = AIAgent( + api_key="test-key", + base_url="https://openrouter.ai/api/v1", + model="test/model", + quiet_mode=True, + skip_context_files=True, + skip_memory=True, + ) + + agent._create_openai_client( + { + "api_key": "test-key", + "base_url": "https://explicit.example.com/v1", + "max_retries": 5, + }, + reason="test", + shared=False, + ) + + matching = [ + c for c in mock_openai.call_args_list + if c.kwargs.get("base_url") == "https://explicit.example.com/v1" + ] + assert matching, "OpenAI was never constructed with the explicit base_url" + assert matching[-1].kwargs.get("max_retries") == 5