mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(anthropic): stop SDK auto-retry double-firing and raise Retry-After cap to 600s
The Anthropic SDK clients were built without max_retries, so the SDK default (max_retries=2) retried 429/5xx with its own backoff that ignores Retry-After — double-retrying inside hermes's outer loop and burning request slots against a bucket that won't refill for minutes. Set max_retries=0 on all Anthropic/AnthropicBedrock client constructions so the outer conversation loop (which already honors Retry-After) owns retry. Also raise the Retry-After cap in the conversation loop from 120s to 600s. Anthropic Tier 1 input-token buckets reset in ~171s, so the 120s cap made hermes retry before the reset window and re-trip the limit. Refs #26293
This commit is contained in:
parent
32732a8f83
commit
1ab35ba25d
4 changed files with 90 additions and 1 deletions
|
|
@ -673,6 +673,9 @@ def _build_anthropic_client_with_bearer_hook(
|
|||
kwargs = {
|
||||
"timeout": timeout_obj,
|
||||
"http_client": http_client,
|
||||
# Delegate retry to hermes's outer loop (honors Retry-After); the SDK
|
||||
# default max_retries=2 ignores it and double-retries. (#26293)
|
||||
"max_retries": 0,
|
||||
# The SDK requires *something* for api_key/auth_token. Our
|
||||
# event hook overrides Authorization per request so this value
|
||||
# is never sent. The sentinel string makes accidental leaks
|
||||
|
|
@ -757,6 +760,12 @@ def build_anthropic_client(
|
|||
_read_timeout = timeout if (isinstance(timeout, (int, float)) and timeout > 0) else 900.0
|
||||
kwargs = {
|
||||
"timeout": Timeout(timeout=float(_read_timeout), connect=10.0),
|
||||
# Delegate all rate-limit / 5xx retry to hermes's outer conversation
|
||||
# loop, which honors Retry-After. The SDK default (max_retries=2) uses
|
||||
# its own 1-2s backoff that ignores Retry-After and double-retries
|
||||
# inside our loop — burning request slots against a bucket that won't
|
||||
# refill for minutes. (#26293)
|
||||
"max_retries": 0,
|
||||
}
|
||||
if normalized_base_url:
|
||||
# Azure Anthropic endpoints require an ``api-version`` query parameter.
|
||||
|
|
@ -852,6 +861,9 @@ def build_anthropic_bedrock_client(region: str):
|
|||
return _anthropic_sdk.AnthropicBedrock(
|
||||
aws_region=region,
|
||||
timeout=Timeout(timeout=900.0, connect=10.0),
|
||||
# Delegate retry to hermes's outer loop (honors Retry-After); the SDK
|
||||
# default max_retries=2 ignores it and double-retries. (#26293)
|
||||
max_retries=0,
|
||||
default_headers={"anthropic-beta": ",".join([*_COMMON_BETAS, _CONTEXT_1M_BETA])},
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -3688,7 +3688,12 @@ def run_conversation(
|
|||
_ra_raw = _resp_headers.get("retry-after") or _resp_headers.get("Retry-After")
|
||||
if _ra_raw:
|
||||
try:
|
||||
_retry_after = min(float(_ra_raw), 120) # Cap at 2 minutes
|
||||
# Cap at 10 minutes. Anthropic Tier 1 input-token
|
||||
# buckets reset in ~171s, so a 120s cap caused us to
|
||||
# retry before the actual reset window and re-trip the
|
||||
# limit. 600s covers all realistic provider reset
|
||||
# windows while still rejecting pathological values. (#26293)
|
||||
_retry_after = min(float(_ra_raw), 600)
|
||||
except (TypeError, ValueError):
|
||||
pass
|
||||
wait_time = _retry_after if _retry_after else jittered_backoff(retry_count, base_delay=2.0, max_delay=60.0)
|
||||
|
|
|
|||
|
|
@ -201,6 +201,28 @@ class TestBuildAnthropicClient:
|
|||
betas = kwargs["default_headers"]["anthropic-beta"]
|
||||
assert "context-1m-2025-08-07" in betas
|
||||
|
||||
def test_disables_sdk_retries_for_api_key(self):
|
||||
"""#26293: the SDK's default max_retries=2 ignores Retry-After and
|
||||
double-retries inside hermes's outer loop. We delegate retry entirely
|
||||
to the outer loop, so the client must be built with max_retries=0."""
|
||||
with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
|
||||
build_anthropic_client("sk-ant-api03-something")
|
||||
kwargs = mock_sdk.Anthropic.call_args[1]
|
||||
assert kwargs["max_retries"] == 0
|
||||
|
||||
def test_disables_sdk_retries_for_oauth_token(self):
|
||||
with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
|
||||
build_anthropic_client("sk-ant-oat01-" + "x" * 60)
|
||||
kwargs = mock_sdk.Anthropic.call_args[1]
|
||||
assert kwargs["max_retries"] == 0
|
||||
|
||||
def test_bedrock_disables_sdk_retries(self):
|
||||
with patch("agent.anthropic_adapter._anthropic_sdk") as mock_sdk:
|
||||
mock_sdk.AnthropicBedrock = MagicMock()
|
||||
build_anthropic_bedrock_client("us-east-1")
|
||||
kwargs = mock_sdk.AnthropicBedrock.call_args[1]
|
||||
assert kwargs["max_retries"] == 0
|
||||
|
||||
|
||||
class TestReadClaudeCodeCredentials:
|
||||
@pytest.fixture(autouse=True)
|
||||
|
|
|
|||
|
|
@ -2293,6 +2293,56 @@ class TestExecuteToolCalls:
|
|||
assert "Rate limit reached" not in output
|
||||
|
||||
|
||||
class TestRetryAfterCap:
|
||||
"""#26293: the conversation loop owns rate-limit backoff and honors the
|
||||
Retry-After header up to a 600s ceiling (was 120s, which retried before
|
||||
Tier-1 reset windows of ~171s and re-tripped the limit)."""
|
||||
|
||||
def _drive_once(self, agent, retry_after_value):
|
||||
"""Raise one 429 carrying ``Retry-After`` and capture the wait the loop
|
||||
chose. Interrupt during the backoff sleep so the test doesn't actually
|
||||
wait, and return the status string that reports the wait time."""
|
||||
|
||||
class _RateLimitError(Exception):
|
||||
status_code = 429
|
||||
response = SimpleNamespace(headers={"retry-after": str(retry_after_value)})
|
||||
|
||||
def __str__(self):
|
||||
return "Error code: 429 - Rate limit exceeded."
|
||||
|
||||
def _fake_api_call(api_kwargs):
|
||||
raise _RateLimitError()
|
||||
|
||||
agent._interruptible_api_call = _fake_api_call
|
||||
agent._persist_session = lambda *args, **kwargs: None
|
||||
agent._save_trajectory = lambda *args, **kwargs: None
|
||||
|
||||
captured = []
|
||||
original_buffer = agent._buffer_status
|
||||
|
||||
def _capture_status(msg, *args, **kwargs):
|
||||
captured.append(msg)
|
||||
# Break out of the incremental backoff sleep immediately rather
|
||||
# than blocking for the full Retry-After window.
|
||||
if "Waiting" in msg:
|
||||
agent._interrupt_requested = True
|
||||
return original_buffer(msg, *args, **kwargs)
|
||||
|
||||
agent._buffer_status = _capture_status
|
||||
agent.run_conversation("hello")
|
||||
return next((m for m in captured if "Waiting" in m), "")
|
||||
|
||||
def test_retry_after_under_cap_is_honored(self, agent):
|
||||
# 300s > old 120s cap but < new 600s cap → used verbatim.
|
||||
status = self._drive_once(agent, 300)
|
||||
assert "Waiting 300.0s" in status
|
||||
|
||||
def test_retry_after_above_cap_is_clamped_to_600s(self, agent):
|
||||
# 900s exceeds the ceiling → clamped to 600s, not the old 120s.
|
||||
status = self._drive_once(agent, 900)
|
||||
assert "Waiting 600.0s" in status
|
||||
|
||||
|
||||
class TestConcurrentToolExecution:
|
||||
"""Tests for _execute_tool_calls_concurrent and dispatch logic."""
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue