fix(anthropic): stop SDK auto-retry double-firing and raise Retry-After cap to 600s

The Anthropic SDK clients were built without max_retries, so the SDK
default (max_retries=2) retried 429/5xx with its own backoff that ignores
Retry-After — double-retrying inside hermes's outer loop and burning
request slots against a bucket that won't refill for minutes. Set
max_retries=0 on all Anthropic/AnthropicBedrock client constructions so
the outer conversation loop (which already honors Retry-After) owns retry.

Also raise the Retry-After cap in the conversation loop from 120s to 600s.
Anthropic Tier 1 input-token buckets reset in ~171s, so the 120s cap made
hermes retry before the reset window and re-trip the limit.

Refs #26293
This commit is contained in:
konsisumer 2026-05-31 18:13:27 +02:00 committed by Teknium
parent 32732a8f83
commit 1ab35ba25d
4 changed files with 90 additions and 1 deletions

View file

@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook(
kwargs = {
"timeout": timeout_obj,
"http_client": http_client,
# Delegate retry to hermes's outer loop (honors Retry-After); the SDK
# default max_retries=2 ignores it and double-retries. (#26293)
"max_retries": 0,
# The SDK requires *something* for api_key/auth_token. Our
# event hook overrides Authorization per request so this value
# is never sent. The sentinel string makes accidental leaks
@ -757,6 +760,12 @@ def build_anthropic_client(
_read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
kwargs = {
"timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
# Delegate all rate-limit / 5xx retry to hermes's outer conversation
# loop, which honors Retry-After. The SDK default (max_retries=2) uses
# its own 1-2s backoff that ignores Retry-After and double-retries
# inside our loop — burning request slots against a bucket that won't
# refill for minutes. (#26293)
"max_retries": 0,
}
if normalized_base_url:
# Azure Anthropic endpoints require an ``api-version`` query parameter.
@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str):
return _anthropic_sdk.AnthropicBedrock(
aws_region=region,
timeout=Timeout(timeout=900.0, connect=10.0),
# Delegate retry to hermes's outer loop (honors Retry-After); the SDK
# default max_retries=2 ignores it and double-retries. (#26293)
max_retries=0,
default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
)

View file

@ -3688,7 +3688,12 @@ def run_conversation(
_ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
if _ra_raw:
try:
_retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes
# Cap at 10 minutes. Anthropic Tier 1 input-token
# buckets reset in ~171s, so a 120s cap caused us to
# retry before the actual reset window and re-trip the
# limit. 600s covers all realistic provider reset
# windows while still rejecting pathological values. (#26293)
_retry_after = min(float(_ra_raw), 600)
except (TypeError, ValueError):
pass
wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)

View file

@ -201,6 +201,28 @@ class TestBuildAnthropicClient:
betas = kwargs["default_headers"]["anthropic-beta"]
assert "context-1m-2025-08-07" in betas
def test_disables_sdk_retries_for_api_key(self):
"""#26293: the SDK's default max_retries=2 ignores Retry-After and
double-retries inside hermes's outer loop. We delegate retry entirely
to the outer loop, so the client must be built with max_retries=0."""
with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
build_anthropic_client("sk-ant-api03-something")
kwargs = mock_sdk.Anthropic.call_args[1]
assert kwargs["max_retries"] == 0
def test_disables_sdk_retries_for_oauth_token(self):
with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
build_anthropic_client("sk-ant-oat01-" + "x" * 60)
kwargs = mock_sdk.Anthropic.call_args[1]
assert kwargs["max_retries"] == 0
def test_bedrock_disables_sdk_retries(self):
with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
mock_sdk.AnthropicBedrock = MagicMock()
build_anthropic_bedrock_client("us-east-1")
kwargs = mock_sdk.AnthropicBedrock.call_args[1]
assert kwargs["max_retries"] == 0
class TestReadClaudeCodeCredentials:
@pytest.fixture(autouse=True)

View file

@ -2293,6 +2293,56 @@ class TestExecuteToolCalls:
assert "Rate limit reached" not in output
class TestRetryAfterCap:
"""#26293: the conversation loop owns rate-limit backoff and honors the
Retry-After header up to a 600s ceiling (was 120s, which retried before
Tier-1 reset windows of ~171s and re-tripped the limit)."""
def _drive_once(self, agent, retry_after_value):
"""Raise one 429 carrying ``Retry-After`` and capture the wait the loop
chose. Interrupt during the backoff sleep so the test doesn't actually
wait, and return the status string that reports the wait time."""
class _RateLimitError(Exception):
status_code = 429
response = SimpleNamespace(headers={"retry-after": str(retry_after_value)})
def __str__(self):
return "Error code: 429 - Rate limit exceeded."
def _fake_api_call(api_kwargs):
raise _RateLimitError()
agent._interruptible_api_call = _fake_api_call
agent._persist_session = lambda *args, **kwargs: None
agent._save_trajectory = lambda *args, **kwargs: None
captured = []
original_buffer = agent._buffer_status
def _capture_status(msg, *args, **kwargs):
captured.append(msg)
# Break out of the incremental backoff sleep immediately rather
# than blocking for the full Retry-After window.
if "Waiting" in msg:
agent._interrupt_requested = True
return original_buffer(msg, *args, **kwargs)
agent._buffer_status = _capture_status
agent.run_conversation("hello")
return next((m for m in captured if "Waiting" in m), "")
def test_retry_after_under_cap_is_honored(self, agent):
# 300s > old 120s cap but < new 600s cap → used verbatim.
status = self._drive_once(agent, 300)
assert "Waiting 300.0s" in status
def test_retry_after_above_cap_is_clamped_to_600s(self, agent):
# 900s exceeds the ceiling → clamped to 600s, not the old 120s.
status = self._drive_once(agent, 900)
assert "Waiting 600.0s" in status
class TestConcurrentToolExecution:
"""Tests for _execute_tool_calls_concurrent and dispatch logic."""