mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(agent): stop reporting broken streams as output-length truncation (#36705)
A stream that drops mid-response after tokens are delivered (peer-closed connection, stale-stream reconnect) is converted into a synthetic finish_reason="length" stub. The conversation loop treated that network stall as a max-output-tokens truncation: when the dropped content was a tool call it retried exactly once, then hard-failed with "Response truncated due to output length limit" — even on large-output models that never hit any cap (e.g. Opus). - Tool-call truncation now retries up to 3 times (was 1) with a progressive max_tokens boost, and is stub-aware: a PARTIAL_STREAM_STUB_ID stall prints "Stream interrupted mid tool-call — retrying (n/3)" instead of the false "model hit max output tokens", and the give-up message distinguishes a network drop from a real truncation. - Length-continuation retries preserve the original request's output cap as a floor, so a high provider/model default isn't silently downshifted to 8K/12K on retry. - Added _requested_output_cap_from_api_kwargs() helper. Tests: stub-stall mid-tool-call recovery within 3 retries; continuation preserves a large provider-default output cap. Fixes #26425. Salvages the substance of #26427 (cap floor) and #9525 (retry bump), adapted to the post-refactor conversation_loop.py which handles all three api_modes uniformly. Co-authored-by: LeonSGP43 <cine.dreamer.one@gmail.com> Co-authored-by: ygd58 <ygd58@users.noreply.github.com>
This commit is contained in:
parent
b571ec298d
commit
023149f665
3 changed files with 143 additions and 11 deletions
|
|
@ -1739,20 +1739,52 @@ def run_conversation(
|
|||
if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
|
||||
assistant_message = _trunc_msg
|
||||
if assistant_message is not None and _trunc_has_tool_calls:
|
||||
if truncated_tool_call_retries < 1:
|
||||
_is_stub_stall = (
|
||||
getattr(response, "id", "") == PARTIAL_STREAM_STUB_ID
|
||||
)
|
||||
if truncated_tool_call_retries < 3:
|
||||
truncated_tool_call_retries += 1
|
||||
agent._buffer_vprint(
|
||||
f"⚠️ Truncated tool call detected — retrying API call..."
|
||||
)
|
||||
if _is_stub_stall:
|
||||
# The stream broke mid tool-call (network /
|
||||
# peer-closed connection), not a real output
|
||||
# cap — say so instead of "max output tokens".
|
||||
agent._buffer_vprint(
|
||||
f"⚠️ Stream interrupted mid tool-call — "
|
||||
f"retrying ({truncated_tool_call_retries}/3)..."
|
||||
)
|
||||
else:
|
||||
agent._buffer_vprint(
|
||||
f"⚠️ Truncated tool call detected — "
|
||||
f"retrying API call "
|
||||
f"({truncated_tool_call_retries}/3)..."
|
||||
)
|
||||
# Boost max_tokens on each retry so the model has
|
||||
# more room to complete the tool-call JSON. A
|
||||
# network stall doesn't need a bigger budget, but
|
||||
# a genuine output-cap truncation does, and the
|
||||
# boost is harmless for the stall case.
|
||||
_tc_boost_base = agent.max_tokens if agent.max_tokens else 4096
|
||||
_tc_boost = _tc_boost_base * (truncated_tool_call_retries + 1)
|
||||
_tc_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
|
||||
if _tc_requested_cap is not None:
|
||||
_tc_boost = max(_tc_boost, _tc_requested_cap)
|
||||
_tc_boost_cap = max(32768, _tc_requested_cap or 0)
|
||||
agent._ephemeral_max_output_tokens = min(_tc_boost, _tc_boost_cap)
|
||||
# Don't append the broken response to messages;
|
||||
# just re-run the same API call from the current
|
||||
# message state, giving the model another chance.
|
||||
continue
|
||||
agent._flush_status_buffer()
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
|
||||
force=True,
|
||||
)
|
||||
if _is_stub_stall:
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix}⚠️ Stream kept dropping mid tool-call after 3 retries — the action was not executed.",
|
||||
force=True,
|
||||
)
|
||||
else:
|
||||
agent._vprint(
|
||||
f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
|
||||
force=True,
|
||||
)
|
||||
agent._cleanup_task_resources(effective_task_id)
|
||||
agent._persist_session(messages, conversation_history)
|
||||
return {
|
||||
|
|
@ -1761,7 +1793,12 @@ def run_conversation(
|
|||
"api_calls": api_call_count,
|
||||
"completed": False,
|
||||
"partial": True,
|
||||
"error": "Response truncated due to output length limit",
|
||||
"error": (
|
||||
"Stream repeatedly dropped mid tool-call (network); "
|
||||
"the tool was not executed"
|
||||
if _is_stub_stall
|
||||
else "Response truncated due to output length limit"
|
||||
),
|
||||
}
|
||||
|
||||
# If we have prior messages, roll back to last complete state
|
||||
|
|
@ -3412,9 +3449,16 @@ def run_conversation(
|
|||
# Progressively boost the output token budget on each retry.
|
||||
# Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
|
||||
# Applies to all providers via _ephemeral_max_output_tokens.
|
||||
# If the original request already used a larger provider/model
|
||||
# default budget, keep that floor so continuation retries do
|
||||
# not accidentally downshift to a much smaller cap.
|
||||
_boost_base = agent.max_tokens if agent.max_tokens else 4096
|
||||
_boost = _boost_base * (length_continue_retries + 1)
|
||||
agent._ephemeral_max_output_tokens = min(_boost, 32768)
|
||||
_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
|
||||
if _requested_cap is not None:
|
||||
_boost = max(_boost, _requested_cap)
|
||||
_boost_cap = max(32768, _requested_cap or 0)
|
||||
agent._ephemeral_max_output_tokens = min(_boost, _boost_cap)
|
||||
continue
|
||||
|
||||
# Guard: if all retries exhausted without a successful response
|
||||
|
|
|
|||
15
run_agent.py
15
run_agent.py
|
|
@ -1206,6 +1206,21 @@ class AIAgent:
|
|||
return {"max_completion_tokens": value}
|
||||
return {"max_tokens": value}
|
||||
|
||||
@staticmethod
|
||||
def _requested_output_cap_from_api_kwargs(api_kwargs: Any) -> Optional[int]:
|
||||
"""Extract the outgoing response token cap from a prepared request."""
|
||||
if not isinstance(api_kwargs, dict):
|
||||
return None
|
||||
for key in ("max_output_tokens", "max_completion_tokens", "max_tokens"):
|
||||
raw = api_kwargs.get(key)
|
||||
try:
|
||||
value = int(raw)
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
if value > 0:
|
||||
return value
|
||||
return None
|
||||
|
||||
def _has_content_after_think_block(self, content: str) -> bool:
|
||||
"""
|
||||
Check if content has actual text after any reasoning/thinking blocks.
|
||||
|
|
|
|||
|
|
@ -3699,6 +3699,36 @@ class TestRunConversation:
|
|||
assert second_call_messages[-1]["role"] == "user"
|
||||
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
|
||||
|
||||
def test_length_continuation_preserves_large_provider_default_output_cap(self, agent):
|
||||
"""Continuation retries must not shrink a higher provider default cap."""
|
||||
self._setup_agent(agent)
|
||||
agent.max_tokens = None
|
||||
requested_caps = []
|
||||
|
||||
def _fake_build_api_kwargs(api_messages):
|
||||
ephemeral = getattr(agent, "_ephemeral_max_output_tokens", None)
|
||||
if ephemeral is not None:
|
||||
agent._ephemeral_max_output_tokens = None
|
||||
cap = ephemeral if ephemeral is not None else 65536
|
||||
requested_caps.append(cap)
|
||||
return {"model": agent.model, "messages": api_messages, "max_tokens": cap}
|
||||
|
||||
first = _mock_response(content="Part 1 ", finish_reason="length")
|
||||
second = _mock_response(content="Part 2", finish_reason="stop")
|
||||
agent.client.chat.completions.create.side_effect = [first, second]
|
||||
|
||||
with (
|
||||
patch.object(agent, "_build_api_kwargs", side_effect=_fake_build_api_kwargs),
|
||||
patch.object(agent, "_persist_session"),
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
result = agent.run_conversation("hello")
|
||||
|
||||
assert result["completed"] is True
|
||||
assert result["final_response"] == "Part 1 Part 2"
|
||||
assert requested_caps == [65536, 65536]
|
||||
|
||||
def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
|
||||
"""Ollama-hosted GLM responses can misreport truncated output as stop."""
|
||||
self._setup_agent(agent)
|
||||
|
|
@ -3877,7 +3907,8 @@ class TestRunConversation:
|
|||
|
||||
def test_truncated_tool_call_retries_once_before_refusing(self, agent):
|
||||
"""When tool call args are truncated, the agent retries the API call
|
||||
once. If the retry succeeds (valid JSON args), tool execution proceeds."""
|
||||
(up to 3 times). If a retry succeeds (valid JSON args), tool execution
|
||||
proceeds."""
|
||||
self._setup_agent(agent)
|
||||
agent.valid_tool_names.add("write_file")
|
||||
bad_tc = _mock_tool_call(
|
||||
|
|
@ -3914,6 +3945,48 @@ class TestRunConversation:
|
|||
mock_hfc.assert_called_once()
|
||||
assert result["final_response"] == "Done!"
|
||||
|
||||
def test_stub_stall_mid_tool_call_recovers_within_3_retries(self, agent):
|
||||
"""A network stream stall mid tool-call (PARTIAL_STREAM_STUB_ID) must
|
||||
retry up to 3 times rather than hard-failing after one — and recover
|
||||
if a retry produces a complete tool call. Regression for the false
|
||||
'model hit max output tokens' on Opus when the stream simply dropped."""
|
||||
from hermes_constants import PARTIAL_STREAM_STUB_ID
|
||||
|
||||
self._setup_agent(agent)
|
||||
agent.valid_tool_names.add("write_file")
|
||||
bad_tc = _mock_tool_call(
|
||||
name="write_file",
|
||||
arguments='{"path":"report.md","content":"partial',
|
||||
call_id="c1",
|
||||
)
|
||||
# Two consecutive stub-stall responses, then a clean tool call.
|
||||
stall1 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
|
||||
stall1.id = PARTIAL_STREAM_STUB_ID
|
||||
stall2 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
|
||||
stall2.id = PARTIAL_STREAM_STUB_ID
|
||||
good_tc = _mock_tool_call(
|
||||
name="write_file",
|
||||
arguments='{"path":"report.md","content":"full content"}',
|
||||
call_id="c2",
|
||||
)
|
||||
good_resp = _mock_response(content="", finish_reason="stop", tool_calls=[good_tc])
|
||||
final_resp = _mock_response(content="Done!", finish_reason="stop")
|
||||
|
||||
with (
|
||||
patch("run_agent.handle_function_call", return_value='{"success":true}') as mock_hfc,
|
||||
patch.object(agent, "_persist_session"),
|
||||
patch.object(agent, "_save_trajectory"),
|
||||
patch.object(agent, "_cleanup_task_resources"),
|
||||
):
|
||||
agent.client.chat.completions.create.side_effect = [
|
||||
stall1, stall2, good_resp, final_resp,
|
||||
]
|
||||
result = agent.run_conversation("write the report")
|
||||
|
||||
# Recovered on the 3rd attempt instead of refusing after the 1st.
|
||||
mock_hfc.assert_called_once()
|
||||
assert result["final_response"] == "Done!"
|
||||
|
||||
def test_truncated_tool_args_detected_when_finish_reason_not_length(self, agent):
|
||||
"""When a router rewrites finish_reason from 'length' to 'tool_calls',
|
||||
truncated JSON arguments should still be detected and refused rather
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue