mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-25 05:52:34 +00:00
fix(agent): retry malformed anthropic stream parser errors
This commit is contained in:
parent
53637fb17d
commit
9c304a7f56
2 changed files with 131 additions and 9 deletions
57
run_agent.py
57
run_agent.py
|
|
@ -3027,6 +3027,24 @@ class AIAgent:
|
||||||
parts.append(f"{type(e).__name__}({msg})" if msg else type(e).__name__)
|
parts.append(f"{type(e).__name__}({msg})" if msg else type(e).__name__)
|
||||||
return " <- ".join(parts) if parts else type(error).__name__
|
return " <- ".join(parts) if parts else type(error).__name__
|
||||||
|
|
||||||
|
def _is_provider_stream_parse_error(self, error: BaseException) -> bool:
|
||||||
|
"""Return True for malformed provider streaming data from SDK parsers.
|
||||||
|
|
||||||
|
Some Anthropic-compatible streaming providers can send a malformed
|
||||||
|
event-stream frame. The Anthropic SDK surfaces that as a plain
|
||||||
|
``ValueError`` such as ``expected ident at line 1 column 149``. That
|
||||||
|
is provider wire-format trouble, not local request validation, so it
|
||||||
|
should follow the same retry path as a truncated JSON body.
|
||||||
|
"""
|
||||||
|
if getattr(self, "api_mode", None) != "anthropic_messages":
|
||||||
|
return False
|
||||||
|
if not isinstance(error, ValueError):
|
||||||
|
return False
|
||||||
|
if isinstance(error, (UnicodeEncodeError, json.JSONDecodeError)):
|
||||||
|
return False
|
||||||
|
message = str(error).strip().lower()
|
||||||
|
return "expected ident at line" in message
|
||||||
|
|
||||||
def _log_stream_retry(
|
def _log_stream_retry(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
|
|
@ -5080,6 +5098,12 @@ class AIAgent:
|
||||||
"""
|
"""
|
||||||
raw = str(error)
|
raw = str(error)
|
||||||
|
|
||||||
|
if (
|
||||||
|
isinstance(error, ValueError)
|
||||||
|
and "expected ident at line" in raw.lower()
|
||||||
|
):
|
||||||
|
return f"Malformed provider streaming response: {raw[:300]}"
|
||||||
|
|
||||||
# Cloudflare / proxy HTML pages: grab the <title> for a clean summary
|
# Cloudflare / proxy HTML pages: grab the <title> for a clean summary
|
||||||
if "<!DOCTYPE" in raw or "<html" in raw:
|
if "<!DOCTYPE" in raw or "<html" in raw:
|
||||||
m = re.search(r"<title[^>]*>([^<]+)</title>", raw, re.IGNORECASE)
|
m = re.search(r"<title[^>]*>([^<]+)</title>", raw, re.IGNORECASE)
|
||||||
|
|
@ -8528,6 +8552,7 @@ class AIAgent:
|
||||||
_is_conn_err = isinstance(
|
_is_conn_err = isinstance(
|
||||||
e, (_httpx.ConnectError, _httpx.RemoteProtocolError, ConnectionError)
|
e, (_httpx.ConnectError, _httpx.RemoteProtocolError, ConnectionError)
|
||||||
)
|
)
|
||||||
|
_is_stream_parse_err = self._is_provider_stream_parse_error(e)
|
||||||
|
|
||||||
# If the stream died AFTER some tokens were delivered:
|
# If the stream died AFTER some tokens were delivered:
|
||||||
# normally we don't retry (the user already saw text,
|
# normally we don't retry (the user already saw text,
|
||||||
|
|
@ -8567,7 +8592,10 @@ class AIAgent:
|
||||||
for phrase in _SSE_PREVIEW_PHRASES
|
for phrase in _SSE_PREVIEW_PHRASES
|
||||||
)
|
)
|
||||||
_is_transient = (
|
_is_transient = (
|
||||||
_is_timeout or _is_conn_err or _is_sse_conn_err_preview
|
_is_timeout
|
||||||
|
or _is_conn_err
|
||||||
|
or _is_sse_conn_err_preview
|
||||||
|
or _is_stream_parse_err
|
||||||
)
|
)
|
||||||
_can_silent_retry = (
|
_can_silent_retry = (
|
||||||
_partial_tool_in_flight
|
_partial_tool_in_flight
|
||||||
|
|
@ -8665,7 +8693,7 @@ class AIAgent:
|
||||||
for phrase in _SSE_CONN_PHRASES
|
for phrase in _SSE_CONN_PHRASES
|
||||||
)
|
)
|
||||||
|
|
||||||
if _is_timeout or _is_conn_err or _is_sse_conn_err:
|
if _is_timeout or _is_conn_err or _is_sse_conn_err or _is_stream_parse_err:
|
||||||
# Transient network / timeout error. Retry the
|
# Transient network / timeout error. Retry the
|
||||||
# streaming request with a fresh connection first.
|
# streaming request with a fresh connection first.
|
||||||
if _stream_attempt < _max_stream_retries:
|
if _stream_attempt < _max_stream_retries:
|
||||||
|
|
@ -8706,12 +8734,20 @@ class AIAgent:
|
||||||
mid_tool_call=False,
|
mid_tool_call=False,
|
||||||
diag=request_client_holder.get("diag"),
|
diag=request_client_holder.get("diag"),
|
||||||
)
|
)
|
||||||
self._emit_status(
|
if _is_stream_parse_err:
|
||||||
"❌ Connection to provider failed after "
|
self._emit_status(
|
||||||
f"{_max_stream_retries + 1} attempts. "
|
"❌ Provider returned malformed streaming data after "
|
||||||
"The provider may be experiencing issues — "
|
f"{_max_stream_retries + 1} attempts. "
|
||||||
"try again in a moment."
|
"The provider may be experiencing issues — "
|
||||||
)
|
"try again in a moment."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self._emit_status(
|
||||||
|
"❌ Connection to provider failed after "
|
||||||
|
f"{_max_stream_retries + 1} attempts. "
|
||||||
|
"The provider may be experiencing issues — "
|
||||||
|
"try again in a moment."
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
_err_lower = str(e).lower()
|
_err_lower = str(e).lower()
|
||||||
_is_stream_unsupported = (
|
_is_stream_unsupported = (
|
||||||
|
|
@ -14509,11 +14545,16 @@ class AIAgent:
|
||||||
# provider/network failure (malformed response body,
|
# provider/network failure (malformed response body,
|
||||||
# truncated stream, routing layer corruption), not a
|
# truncated stream, routing layer corruption), not a
|
||||||
# local programming bug, and should be retried (#14782).
|
# local programming bug, and should be retried (#14782).
|
||||||
|
# Exclude Anthropic stream parser ValueErrors for the
|
||||||
|
# same reason: third-party Anthropic-compatible providers
|
||||||
|
# can emit malformed event-stream frames that SDK parsers
|
||||||
|
# raise as plain ValueError.
|
||||||
is_local_validation_error = (
|
is_local_validation_error = (
|
||||||
isinstance(api_error, (ValueError, TypeError))
|
isinstance(api_error, (ValueError, TypeError))
|
||||||
and not isinstance(
|
and not isinstance(
|
||||||
api_error, (UnicodeEncodeError, json.JSONDecodeError)
|
api_error, (UnicodeEncodeError, json.JSONDecodeError)
|
||||||
)
|
)
|
||||||
|
and not self._is_provider_stream_parse_error(api_error)
|
||||||
# ssl.SSLError (and its subclass SSLCertVerificationError)
|
# ssl.SSLError (and its subclass SSLCertVerificationError)
|
||||||
# inherits from OSError *and* ValueError via Python MRO,
|
# inherits from OSError *and* ValueError via Python MRO,
|
||||||
# so the isinstance(ValueError) check above would
|
# so the isinstance(ValueError) check above would
|
||||||
|
|
|
||||||
|
|
@ -999,6 +999,88 @@ class TestAnthropicStreamCallbacks:
|
||||||
|
|
||||||
assert touch_calls.count("receiving stream response") == len(events)
|
assert touch_calls.count("receiving stream response") == len(events)
|
||||||
|
|
||||||
|
@patch("run_agent.AIAgent._replace_primary_openai_client")
|
||||||
|
def test_anthropic_stream_parser_valueerror_retries_before_delivery(
|
||||||
|
self, mock_replace, monkeypatch,
|
||||||
|
):
|
||||||
|
"""Malformed Anthropic event-stream frames retry instead of surfacing HTTP None."""
|
||||||
|
from run_agent import AIAgent
|
||||||
|
|
||||||
|
agent = AIAgent(
|
||||||
|
api_key="test-key",
|
||||||
|
base_url="https://api.minimax.io/anthropic",
|
||||||
|
provider="minimax",
|
||||||
|
model="MiniMax-M2.7",
|
||||||
|
quiet_mode=True,
|
||||||
|
skip_context_files=True,
|
||||||
|
skip_memory=True,
|
||||||
|
)
|
||||||
|
agent.api_mode = "anthropic_messages"
|
||||||
|
agent._interrupt_requested = False
|
||||||
|
monkeypatch.setenv("HERMES_STREAM_RETRIES", "1")
|
||||||
|
|
||||||
|
class _BadStream:
|
||||||
|
response = None
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, *_args):
|
||||||
|
return False
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
raise ValueError("expected ident at line 1 column 149")
|
||||||
|
|
||||||
|
final_message = SimpleNamespace(content=[], stop_reason="end_turn")
|
||||||
|
good_stream = MagicMock()
|
||||||
|
good_stream.__enter__ = MagicMock(return_value=good_stream)
|
||||||
|
good_stream.__exit__ = MagicMock(return_value=False)
|
||||||
|
good_stream.__iter__ = MagicMock(return_value=iter([]))
|
||||||
|
good_stream.get_final_message.return_value = final_message
|
||||||
|
|
||||||
|
agent._anthropic_client = MagicMock()
|
||||||
|
agent._anthropic_client.messages.stream.side_effect = [
|
||||||
|
_BadStream(),
|
||||||
|
good_stream,
|
||||||
|
]
|
||||||
|
|
||||||
|
response = agent._interruptible_streaming_api_call({})
|
||||||
|
|
||||||
|
assert response is final_message
|
||||||
|
assert agent._anthropic_client.messages.stream.call_count == 2
|
||||||
|
assert mock_replace.call_count == 1
|
||||||
|
|
||||||
|
@patch("run_agent.AIAgent._replace_primary_openai_client")
|
||||||
|
def test_generic_anthropic_valueerror_still_propagates_without_stream_retry(
|
||||||
|
self, mock_replace, monkeypatch,
|
||||||
|
):
|
||||||
|
"""Only known provider stream parser ValueErrors are treated as transient."""
|
||||||
|
from run_agent import AIAgent
|
||||||
|
|
||||||
|
agent = AIAgent(
|
||||||
|
api_key="test-key",
|
||||||
|
base_url="https://api.minimax.io/anthropic",
|
||||||
|
provider="minimax",
|
||||||
|
model="MiniMax-M2.7",
|
||||||
|
quiet_mode=True,
|
||||||
|
skip_context_files=True,
|
||||||
|
skip_memory=True,
|
||||||
|
)
|
||||||
|
agent.api_mode = "anthropic_messages"
|
||||||
|
agent._interrupt_requested = False
|
||||||
|
monkeypatch.setenv("HERMES_STREAM_RETRIES", "1")
|
||||||
|
|
||||||
|
agent._anthropic_client = MagicMock()
|
||||||
|
agent._anthropic_client.messages.stream.side_effect = ValueError(
|
||||||
|
"invalid local request shape"
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="invalid local request shape"):
|
||||||
|
agent._interruptible_streaming_api_call({})
|
||||||
|
|
||||||
|
assert agent._anthropic_client.messages.stream.call_count == 1
|
||||||
|
assert mock_replace.call_count == 0
|
||||||
|
|
||||||
|
|
||||||
class TestPartialToolCallWarning:
|
class TestPartialToolCallWarning:
|
||||||
"""Regression: when a stream dies mid tool-call argument generation after
|
"""Regression: when a stream dies mid tool-call argument generation after
|
||||||
|
|
@ -1504,4 +1586,3 @@ class TestCopilotACPStreamingDecision:
|
||||||
_use_streaming = False
|
_use_streaming = False
|
||||||
|
|
||||||
assert _use_streaming is True
|
assert _use_streaming is True
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue