From 1ab35ba25d4a65748182678376bbc8f19d51bfc8 Mon Sep 17 00:00:00 2001 From: konsisumer Date: Sun, 31 May 2026 18:13:27 +0200 Subject: [PATCH] fix(anthropic): stop SDK auto-retry double-firing and raise Retry-After cap to 600s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Anthropic SDK clients were built without max_retries, so the SDK default (max_retries=2) retried 429/5xx with its own backoff that ignores Retry-After — double-retrying inside hermes's outer loop and burning request slots against a bucket that won't refill for minutes. Set max_retries=0 on all Anthropic/AnthropicBedrock client constructions so the outer conversation loop (which already honors Retry-After) owns retry. Also raise the Retry-After cap in the conversation loop from 120s to 600s. Anthropic Tier 1 input-token buckets reset in ~171s, so the 120s cap made hermes retry before the reset window and re-trip the limit. Refs #26293 --- agent/anthropic_adapter.py | 12 +++++++ agent/conversation_loop.py | 7 +++- tests/agent/test_anthropic_adapter.py | 22 ++++++++++++ tests/run_agent/test_run_agent.py | 50 +++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 1 deletion(-) diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py index 0dbdbfa3bfb..e4d1d5ac125 100644 --- a/agent/anthropic_adapter.py +++ b/agent/anthropic_adapter.py @@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook( kwargs = { "timeout": timeout_obj, "http_client": http_client, + # Delegate retry to hermes's outer loop (honors Retry-After); the SDK + # default max_retries=2 ignores it and double-retries. (#26293) + "max_retries": 0, # The SDK requires *something* for api_key/auth_token. Our # event hook overrides Authorization per request so this value # is never sent. The sentinel string makes accidental leaks @@ -757,6 +760,12 @@ def build_anthropic_client( _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0 kwargs = { "timeout": Timeout(timeout=float(_read_timeout), connect=10.0), + # Delegate all rate-limit / 5xx retry to hermes's outer conversation + # loop, which honors Retry-After. The SDK default (max_retries=2) uses + # its own 1-2s backoff that ignores Retry-After and double-retries + # inside our loop — burning request slots against a bucket that won't + # refill for minutes. (#26293) + "max_retries": 0, } if normalized_base_url: # Azure Anthropic endpoints require an ``api-version`` query parameter. @@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str): return _anthropic_sdk.AnthropicBedrock( aws_region=region, timeout=Timeout(timeout=900.0, connect=10.0), + # Delegate retry to hermes's outer loop (honors Retry-After); the SDK + # default max_retries=2 ignores it and double-retries. (#26293) + max_retries=0, default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])}, ) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index 328aaee0474..983eec6df85 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -3688,7 +3688,12 @@ def run_conversation( _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After") if _ra_raw: try: - _retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes + # Cap at 10 minutes. Anthropic Tier 1 input-token + # buckets reset in ~171s, so a 120s cap caused us to + # retry before the actual reset window and re-trip the + # limit. 600s covers all realistic provider reset + # windows while still rejecting pathological values. (#26293) + _retry_after = min(float(_ra_raw), 600) except (TypeError, ValueError): pass wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0) diff --git a/tests/agent/test_anthropic_adapter.py b/tests/agent/test_anthropic_adapter.py index 109793d2719..abf3e7e3ff6 100644 --- a/tests/agent/test_anthropic_adapter.py +++ b/tests/agent/test_anthropic_adapter.py @@ -201,6 +201,28 @@ class TestBuildAnthropicClient: betas = kwargs["default_headers"]["anthropic-beta"] assert "context-1m-2025-08-07" in betas + def test_disables_sdk_retries_for_api_key(self): + """#26293: the SDK's default max_retries=2 ignores Retry-After and + double-retries inside hermes's outer loop. We delegate retry entirely + to the outer loop, so the client must be built with max_retries=0.""" + with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk: + build_anthropic_client("sk-ant-api03-something") + kwargs = mock_sdk.Anthropic.call_args[1] + assert kwargs["max_retries"] == 0 + + def test_disables_sdk_retries_for_oauth_token(self): + with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk: + build_anthropic_client("sk-ant-oat01-" + "x" * 60) + kwargs = mock_sdk.Anthropic.call_args[1] + assert kwargs["max_retries"] == 0 + + def test_bedrock_disables_sdk_retries(self): + with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk: + mock_sdk.AnthropicBedrock = MagicMock() + build_anthropic_bedrock_client("us-east-1") + kwargs = mock_sdk.AnthropicBedrock.call_args[1] + assert kwargs["max_retries"] == 0 + class TestReadClaudeCodeCredentials: @pytest.fixture(autouse=True) diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 93475392d35..d22b539af6e 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -2293,6 +2293,56 @@ class TestExecuteToolCalls: assert "Rate limit reached" not in output +class TestRetryAfterCap: + """#26293: the conversation loop owns rate-limit backoff and honors the + Retry-After header up to a 600s ceiling (was 120s, which retried before + Tier-1 reset windows of ~171s and re-tripped the limit).""" + + def _drive_once(self, agent, retry_after_value): + """Raise one 429 carrying ``Retry-After`` and capture the wait the loop + chose. Interrupt during the backoff sleep so the test doesn't actually + wait, and return the status string that reports the wait time.""" + + class _RateLimitError(Exception): + status_code = 429 + response = SimpleNamespace(headers={"retry-after": str(retry_after_value)}) + + def __str__(self): + return "Error code: 429 - Rate limit exceeded." + + def _fake_api_call(api_kwargs): + raise _RateLimitError() + + agent._interruptible_api_call = _fake_api_call + agent._persist_session = lambda *args, **kwargs: None + agent._save_trajectory = lambda *args, **kwargs: None + + captured = [] + original_buffer = agent._buffer_status + + def _capture_status(msg, *args, **kwargs): + captured.append(msg) + # Break out of the incremental backoff sleep immediately rather + # than blocking for the full Retry-After window. + if "Waiting" in msg: + agent._interrupt_requested = True + return original_buffer(msg, *args, **kwargs) + + agent._buffer_status = _capture_status + agent.run_conversation("hello") + return next((m for m in captured if "Waiting" in m), "") + + def test_retry_after_under_cap_is_honored(self, agent): + # 300s > old 120s cap but < new 600s cap → used verbatim. + status = self._drive_once(agent, 300) + assert "Waiting 300.0s" in status + + def test_retry_after_above_cap_is_clamped_to_600s(self, agent): + # 900s exceeds the ceiling → clamped to 600s, not the old 120s. + status = self._drive_once(agent, 900) + assert "Waiting 600.0s" in status + + class TestConcurrentToolExecution: """Tests for _execute_tool_calls_concurrent and dispatch logic."""