From 1ab35ba25d4a65748182678376bbc8f19d51bfc8 Mon Sep 17 00:00:00 2001
From: konsisumer <der@konsi.org>
Date: Sun, 31 May 2026 18:13:27 +0200
Subject: [PATCH] fix(anthropic): stop SDK auto-retry double-firing and raise
 Retry-After cap to 600s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Anthropic SDK clients were built without max_retries, so the SDK
default (max_retries=2) retried 429/5xx with its own backoff that ignores
Retry-After — double-retrying inside hermes's outer loop and burning
request slots against a bucket that won't refill for minutes. Set
max_retries=0 on all Anthropic/AnthropicBedrock client constructions so
the outer conversation loop (which already honors Retry-After) owns retry.

Also raise the Retry-After cap in the conversation loop from 120s to 600s.
Anthropic Tier 1 input-token buckets reset in ~171s, so the 120s cap made
hermes retry before the reset window and re-trip the limit.

Refs #26293
---
 agent/anthropic_adapter.py            | 12 +++++++
 agent/conversation_loop.py            |  7 +++-
 tests/agent/test_anthropic_adapter.py | 22 ++++++++++++
 tests/run_agent/test_run_agent.py     | 50 +++++++++++++++++++++++++++
 4 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/agent/anthropic_adapter.py b/agent/anthropic_adapter.py
index 0dbdbfa3bfb..e4d1d5ac125 100644
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook(
     kwargs = {
         "timeout": timeout_obj,
         "http_client": http_client,
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        "max_retries": 0,
         # The SDK requires *something* for api_key/auth_token. Our
         # event hook overrides Authorization per request so this value
         # is never sent. The sentinel string makes accidental leaks
@@ -757,6 +760,12 @@ def build_anthropic_client(
     _read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
     kwargs = {
         "timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
+        # Delegate all rate-limit / 5xx retry to hermes's outer conversation
+        # loop, which honors Retry-After. The SDK default (max_retries=2) uses
+        # its own 1-2s backoff that ignores Retry-After and double-retries
+        # inside our loop — burning request slots against a bucket that won't
+        # refill for minutes. (#26293)
+        "max_retries": 0,
     }
     if normalized_base_url:
         # Azure Anthropic endpoints require an ``api-version`` query parameter.
@@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str):
     return _anthropic_sdk.AnthropicBedrock(
         aws_region=region,
         timeout=Timeout(timeout=900.0, connect=10.0),
+        # Delegate retry to hermes's outer loop (honors Retry-After); the SDK
+        # default max_retries=2 ignores it and double-retries. (#26293)
+        max_retries=0,
         default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
     )
 
diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index 328aaee0474..983eec6df85 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -3688,7 +3688,12 @@ def run_conversation(
                         _ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
                         if _ra_raw:
                             try:
-                                _retry_after = min(float(_ra_raw), 120)  # Cap at 2 minutes
+                                # Cap at 10 minutes. Anthropic Tier 1 input-token
+                                # buckets reset in ~171s, so a 120s cap caused us to
+                                # retry before the actual reset window and re-trip the
+                                # limit. 600s covers all realistic provider reset
+                                # windows while still rejecting pathological values. (#26293)
+                                _retry_after = min(float(_ra_raw), 600)
                             except (TypeError, ValueError):
                                 pass
                 wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
diff --git a/tests/agent/test_anthropic_adapter.py b/tests/agent/test_anthropic_adapter.py
index 109793d2719..abf3e7e3ff6 100644
--- a/tests/agent/test_anthropic_adapter.py
+++ b/tests/agent/test_anthropic_adapter.py
@@ -201,6 +201,28 @@ class TestBuildAnthropicClient:
             betas = kwargs["default_headers"]["anthropic-beta"]
             assert "context-1m-2025-08-07" in betas
 
+    def test_disables_sdk_retries_for_api_key(self):
+        """#26293: the SDK's default max_retries=2 ignores Retry-After and
+        double-retries inside hermes's outer loop. We delegate retry entirely
+        to the outer loop, so the client must be built with max_retries=0."""
+        with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
+            build_anthropic_client("sk-ant-api03-something")
+            kwargs = mock_sdk.Anthropic.call_args[1]
+            assert kwargs["max_retries"] == 0
+
+    def test_disables_sdk_retries_for_oauth_token(self):
+        with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
+            build_anthropic_client("sk-ant-oat01-" + "x" * 60)
+            kwargs = mock_sdk.Anthropic.call_args[1]
+            assert kwargs["max_retries"] == 0
+
+    def test_bedrock_disables_sdk_retries(self):
+        with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
+            mock_sdk.AnthropicBedrock = MagicMock()
+            build_anthropic_bedrock_client("us-east-1")
+            kwargs = mock_sdk.AnthropicBedrock.call_args[1]
+            assert kwargs["max_retries"] == 0
+
 
 class TestReadClaudeCodeCredentials:
     @pytest.fixture(autouse=True)
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 93475392d35..d22b539af6e 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -2293,6 +2293,56 @@ class TestExecuteToolCalls:
         assert "Rate limit reached" not in output
 
 
+class TestRetryAfterCap:
+    """#26293: the conversation loop owns rate-limit backoff and honors the
+    Retry-After header up to a 600s ceiling (was 120s, which retried before
+    Tier-1 reset windows of ~171s and re-tripped the limit)."""
+
+    def _drive_once(self, agent, retry_after_value):
+        """Raise one 429 carrying ``Retry-After`` and capture the wait the loop
+        chose. Interrupt during the backoff sleep so the test doesn't actually
+        wait, and return the status string that reports the wait time."""
+
+        class _RateLimitError(Exception):
+            status_code = 429
+            response = SimpleNamespace(headers={"retry-after": str(retry_after_value)})
+
+            def __str__(self):
+                return "Error code: 429 - Rate limit exceeded."
+
+        def _fake_api_call(api_kwargs):
+            raise _RateLimitError()
+
+        agent._interruptible_api_call = _fake_api_call
+        agent._persist_session = lambda *args, **kwargs: None
+        agent._save_trajectory = lambda *args, **kwargs: None
+
+        captured = []
+        original_buffer = agent._buffer_status
+
+        def _capture_status(msg, *args, **kwargs):
+            captured.append(msg)
+            # Break out of the incremental backoff sleep immediately rather
+            # than blocking for the full Retry-After window.
+            if "Waiting" in msg:
+                agent._interrupt_requested = True
+            return original_buffer(msg, *args, **kwargs)
+
+        agent._buffer_status = _capture_status
+        agent.run_conversation("hello")
+        return next((m for m in captured if "Waiting" in m), "")
+
+    def test_retry_after_under_cap_is_honored(self, agent):
+        # 300s > old 120s cap but < new 600s cap → used verbatim.
+        status = self._drive_once(agent, 300)
+        assert "Waiting 300.0s" in status
+
+    def test_retry_after_above_cap_is_clamped_to_600s(self, agent):
+        # 900s exceeds the ceiling → clamped to 600s, not the old 120s.
+        status = self._drive_once(agent, 900)
+        assert "Waiting 600.0s" in status
+
+
 class TestConcurrentToolExecution:
     """Tests for _execute_tool_calls_concurrent and dispatch logic."""