fix(anthropic): stop SDK auto-retry double-firing and raise Retry-After cap to 600s

The Anthropic SDK clients were built without max_retries, so the SDK default (max_retries=2) retried 429/5xx with its own backoff that ignores Retry-After — double-retrying inside hermes's outer loop and burning request slots against a bucket that won't refill for minutes. Set max_retries=0 on all Anthropic/AnthropicBedrock client constructions so the outer conversation loop (which already honors Retry-After) owns retry. Also raise the Retry-After cap in the conversation loop from 120s to 600s. Anthropic Tier 1 input-token buckets reset in ~171s, so the 120s cap made hermes retry before the reset window and re-trip the limit. Refs #26293
2026-06-30 11:52:04 +00:00 · 2026-05-31 18:13:27 +02:00 · 2026-05-31 18:13:27 +02:00 · 1ab35ba25d
commit 1ab35ba25d
parent 32732a8f83
4 changed files with 90 additions and 1 deletions
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook(
    kwargs = {
        "timeout": timeout_obj,
        "http_client": http_client,
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        "max_retries": 0,
        # The SDK requires *something* for api_key/auth_token. Our
        # event hook overrides Authorization per request so this value
        # is never sent. The sentinel string makes accidental leaks
@ -757,6 +760,12 @@ def build_anthropic_client(
    _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
    kwargs = {
        "timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
+        # Delegate all rate-limit / 5xx retry to hermes's outer conversation
+        # loop, which honors Retry-After. The SDK default (max_retries=2) uses
+        # its own 1-2s backoff that ignores Retry-After and double-retries
+        # inside our loop — burning request slots against a bucket that won't
+        # refill for minutes. (#26293)
+        "max_retries": 0,
    }
    if normalized_base_url:
        # Azure Anthropic endpoints require an ``api-version`` query parameter.
@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str):
    return _anthropic_sdk.AnthropicBedrock(
        aws_region=region,
        timeout=Timeout(timeout=900.0, connect=10.0),
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        max_retries=0,
        default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
    )

--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -3688,7 +3688,12 @@ def run_conversation(
                        _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                        if _ra_raw:
                            try:
-                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
+                                # Cap at 10 minutes. Anthropic Tier 1 input-token
+                                # buckets reset in ~171s, so a 120s cap caused us to
+                                # retry before the actual reset window and re-trip the
+                                # limit. 600s covers all realistic provider reset
+                                # windows while still rejecting pathological values. (#26293)
+                                _retry_after = min(float(_ra_raw), 600)
                            except (TypeError, ValueError):
                                pass
                wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
--- a/tests/agent/test_anthropic_adapter.py
+++ b/tests/agent/test_anthropic_adapter.py
@ -201,6 +201,28 @@ class TestBuildAnthropicClient:
            betas = kwargs["default_headers"]["anthropic-beta"]
            assert "context-1m-2025-08-07" in betas

+    def test_disables_sdk_retries_for_api_key(self):
+        """#26293: the SDK's default max_retries=2 ignores Retry-After and
+        double-retries inside hermes's outer loop. We delegate retry entirely
+        to the outer loop, so the client must be built with max_retries=0."""
+        with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
+            build_anthropic_client("sk-ant-api03-something")
+            kwargs = mock_sdk.Anthropic.call_args[1]
+            assert kwargs["max_retries"] == 0
+
+    def test_disables_sdk_retries_for_oauth_token(self):
+        with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
+            build_anthropic_client("sk-ant-oat01-" + "x" * 60)
+            kwargs = mock_sdk.Anthropic.call_args[1]
+            assert kwargs["max_retries"] == 0
+
+    def test_bedrock_disables_sdk_retries(self):
+        with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
+            mock_sdk.AnthropicBedrock = MagicMock()
+            build_anthropic_bedrock_client("us-east-1")
+            kwargs = mock_sdk.AnthropicBedrock.call_args[1]
+            assert kwargs["max_retries"] == 0
+

 class TestReadClaudeCodeCredentials:
    @pytest.fixture(autouse=True)
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -2293,6 +2293,56 @@ class TestExecuteToolCalls:
        assert "Rate limit reached" not in output


+class TestRetryAfterCap:
+    """#26293: the conversation loop owns rate-limit backoff and honors the
+    Retry-After header up to a 600s ceiling (was 120s, which retried before
+    Tier-1 reset windows of ~171s and re-tripped the limit)."""
+
+    def _drive_once(self, agent, retry_after_value):
+        """Raise one 429 carrying ``Retry-After`` and capture the wait the loop
+        chose. Interrupt during the backoff sleep so the test doesn't actually
+        wait, and return the status string that reports the wait time."""
+
+        class _RateLimitError(Exception):
+            status_code = 429
+            response = SimpleNamespace(headers={"retry-after": str(retry_after_value)})
+
+            def __str__(self):
+                return "Error code: 429 - Rate limit exceeded."
+
+        def _fake_api_call(api_kwargs):
+            raise _RateLimitError()
+
+        agent._interruptible_api_call = _fake_api_call
+        agent._persist_session = lambda *args, **kwargs: None
+        agent._save_trajectory = lambda *args, **kwargs: None
+
+        captured = []
+        original_buffer = agent._buffer_status
+
+        def _capture_status(msg, *args, **kwargs):
+            captured.append(msg)
+            # Break out of the incremental backoff sleep immediately rather
+            # than blocking for the full Retry-After window.
+            if "Waiting" in msg:
+                agent._interrupt_requested = True
+            return original_buffer(msg, *args, **kwargs)
+
+        agent._buffer_status = _capture_status
+        agent.run_conversation("hello")
+        return next((m for m in captured if "Waiting" in m), "")
+
+    def test_retry_after_under_cap_is_honored(self, agent):
+        # 300s > old 120s cap but < new 600s cap → used verbatim.
+        status = self._drive_once(agent, 300)
+        assert "Waiting 300.0s" in status
+
+    def test_retry_after_above_cap_is_clamped_to_600s(self, agent):
+        # 900s exceeds the ceiling → clamped to 600s, not the old 120s.
+        status = self._drive_once(agent, 900)
+        assert "Waiting 600.0s" in status
+
+
 class TestConcurrentToolExecution:
    """Tests for _execute_tool_calls_concurrent and dispatch logic."""