mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(error_classifier): classify generic-typed timeout messages as transient (carve-out of #22664)
RuntimeError('claude CLI turn timed out') from a local OpenAI-compatible
shim was falling through to FailoverReason.unknown, surfacing as 'Empty
response from model' and burning 3 retry slots on the same failing
endpoint. _classify_by_message had no timeout-message branch — only
billing/rate_limit/auth/context_overflow/model_not_found patterns. The
type-based check at line 565 also requires isinstance(error, (TimeoutError,
ConnectionError, OSError)) — a plain RuntimeError doesn't match.
Add _TIMEOUT_MESSAGE_PATTERNS for 'timed out', 'deadline exceeded',
'request timed out', 'operation timed out', 'upstream timed out', 'turn
timed out'. _classify_by_message returns FailoverReason.timeout (retryable=True)
when any pattern matches.
Salvage of #22664's classifier portion. The original PR also bundled a
fallback self-selection guard which is now redundant (already on main
via #22780) plus DeepSeek thinking and session_search fixes that are
their own separate concerns.
Follow-up to #22780 — fixes the still-broken classification of
generic-typed provider-shim timeouts that #22780's dedup didn't cover.
This commit is contained in:
parent
6ddc48b058
commit
4f8d8ad912
2 changed files with 44 additions and 0 deletions
|
|
@ -254,6 +254,20 @@ _THINKING_SIG_PATTERNS = [
|
|||
"signature", # Combined with "thinking" check
|
||||
]
|
||||
|
||||
# Message-string patterns that indicate a provider-side timeout even when
|
||||
# the exception type is generic (e.g. RuntimeError from a local shim that
|
||||
# wraps a subprocess timeout). Checked before the type-based transport
|
||||
# heuristics so custom-provider "timed out" errors don't fall through to
|
||||
# the unknown bucket and get misreported as empty responses.
|
||||
_TIMEOUT_MESSAGE_PATTERNS = [
|
||||
"timed out",
|
||||
"turn timed out",
|
||||
"request timed out",
|
||||
"deadline exceeded",
|
||||
"operation timed out",
|
||||
"upstream timed out",
|
||||
]
|
||||
|
||||
# Transport error type names
|
||||
_TRANSPORT_ERROR_TYPES = frozenset({
|
||||
"ReadTimeout", "ConnectTimeout", "PoolTimeout",
|
||||
|
|
@ -963,6 +977,14 @@ def _classify_by_message(
|
|||
should_fallback=True,
|
||||
)
|
||||
|
||||
# Timeout message patterns — generic exception types (e.g. RuntimeError)
|
||||
# raised by local shims or custom providers that internally wrap a
|
||||
# subprocess/HTTP timeout. Classified as transport timeout so the retry
|
||||
# loop rebuilds the client instead of treating the turn as an empty
|
||||
# model response.
|
||||
if any(p in error_msg for p in _TIMEOUT_MESSAGE_PATTERNS):
|
||||
return result_fn(FailoverReason.timeout, retryable=True)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -587,6 +587,28 @@ class TestClassifyApiError:
|
|||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.timeout
|
||||
|
||||
def test_runtime_error_cli_turn_timed_out_classifies_as_timeout(self):
|
||||
# RuntimeError from a local claude-cli shim that wraps a subprocess
|
||||
# timeout must classify as FailoverReason.timeout, not unknown, so
|
||||
# the retry loop rebuilds the client instead of treating the turn as
|
||||
# an empty model response (#22548).
|
||||
e = RuntimeError("claude CLI turn timed out")
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.timeout
|
||||
assert result.retryable is True
|
||||
|
||||
def test_runtime_error_request_timed_out_classifies_as_timeout(self):
|
||||
e = RuntimeError("request timed out after 120s")
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.timeout
|
||||
assert result.retryable is True
|
||||
|
||||
def test_runtime_error_deadline_exceeded_classifies_as_timeout(self):
|
||||
e = RuntimeError("deadline exceeded")
|
||||
result = classify_api_error(e)
|
||||
assert result.reason == FailoverReason.timeout
|
||||
assert result.retryable is True
|
||||
|
||||
# ── Error code classification ──
|
||||
|
||||
def test_error_code_resource_exhausted(self):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue