From 023149f665bc2dc87dc200a6d3ce6fb091a2076b Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 1 Jun 2026 03:01:20 -0700 Subject: [PATCH] fix(agent): stop reporting broken streams as output-length truncation (#36705) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A stream that drops mid-response after tokens are delivered (peer-closed connection, stale-stream reconnect) is converted into a synthetic finish_reason="length" stub. The conversation loop treated that network stall as a max-output-tokens truncation: when the dropped content was a tool call it retried exactly once, then hard-failed with "Response truncated due to output length limit" — even on large-output models that never hit any cap (e.g. Opus). - Tool-call truncation now retries up to 3 times (was 1) with a progressive max_tokens boost, and is stub-aware: a PARTIAL_STREAM_STUB_ID stall prints "Stream interrupted mid tool-call — retrying (n/3)" instead of the false "model hit max output tokens", and the give-up message distinguishes a network drop from a real truncation. - Length-continuation retries preserve the original request's output cap as a floor, so a high provider/model default isn't silently downshifted to 8K/12K on retry. - Added _requested_output_cap_from_api_kwargs() helper. Tests: stub-stall mid-tool-call recovery within 3 retries; continuation preserves a large provider-default output cap. Fixes #26425. Salvages the substance of #26427 (cap floor) and #9525 (retry bump), adapted to the post-refactor conversation_loop.py which handles all three api_modes uniformly. Co-authored-by: LeonSGP43 Co-authored-by: ygd58 --- agent/conversation_loop.py | 64 +++++++++++++++++++++----- run_agent.py | 15 +++++++ tests/run_agent/test_run_agent.py | 75 ++++++++++++++++++++++++++++++- 3 files changed, 143 insertions(+), 11 deletions(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index bb6c6229cdb..743988b03b0 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -1739,20 +1739,52 @@ def run_conversation( if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}: assistant_message = _trunc_msg if assistant_message is not None and _trunc_has_tool_calls: - if truncated_tool_call_retries < 1: + _is_stub_stall = ( + getattr(response, "id", "") == PARTIAL_STREAM_STUB_ID + ) + if truncated_tool_call_retries < 3: truncated_tool_call_retries += 1 - agent._buffer_vprint( - f"⚠️ Truncated tool call detected — retrying API call..." - ) + if _is_stub_stall: + # The stream broke mid tool-call (network / + # peer-closed connection), not a real output + # cap — say so instead of "max output tokens". + agent._buffer_vprint( + f"⚠️ Stream interrupted mid tool-call — " + f"retrying ({truncated_tool_call_retries}/3)..." + ) + else: + agent._buffer_vprint( + f"⚠️ Truncated tool call detected — " + f"retrying API call " + f"({truncated_tool_call_retries}/3)..." + ) + # Boost max_tokens on each retry so the model has + # more room to complete the tool-call JSON. A + # network stall doesn't need a bigger budget, but + # a genuine output-cap truncation does, and the + # boost is harmless for the stall case. + _tc_boost_base = agent.max_tokens if agent.max_tokens else 4096 + _tc_boost = _tc_boost_base * (truncated_tool_call_retries + 1) + _tc_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs) + if _tc_requested_cap is not None: + _tc_boost = max(_tc_boost, _tc_requested_cap) + _tc_boost_cap = max(32768, _tc_requested_cap or 0) + agent._ephemeral_max_output_tokens = min(_tc_boost, _tc_boost_cap) # Don't append the broken response to messages; # just re-run the same API call from the current # message state, giving the model another chance. continue agent._flush_status_buffer() - agent._vprint( - f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.", - force=True, - ) + if _is_stub_stall: + agent._vprint( + f"{agent.log_prefix}⚠️ Stream kept dropping mid tool-call after 3 retries — the action was not executed.", + force=True, + ) + else: + agent._vprint( + f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.", + force=True, + ) agent._cleanup_task_resources(effective_task_id) agent._persist_session(messages, conversation_history) return { @@ -1761,7 +1793,12 @@ def run_conversation( "api_calls": api_call_count, "completed": False, "partial": True, - "error": "Response truncated due to output length limit", + "error": ( + "Stream repeatedly dropped mid tool-call (network); " + "the tool was not executed" + if _is_stub_stall + else "Response truncated due to output length limit" + ), } # If we have prior messages, roll back to last complete state @@ -3412,9 +3449,16 @@ def run_conversation( # Progressively boost the output token budget on each retry. # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768. # Applies to all providers via _ephemeral_max_output_tokens. + # If the original request already used a larger provider/model + # default budget, keep that floor so continuation retries do + # not accidentally downshift to a much smaller cap. _boost_base = agent.max_tokens if agent.max_tokens else 4096 _boost = _boost_base * (length_continue_retries + 1) - agent._ephemeral_max_output_tokens = min(_boost, 32768) + _requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs) + if _requested_cap is not None: + _boost = max(_boost, _requested_cap) + _boost_cap = max(32768, _requested_cap or 0) + agent._ephemeral_max_output_tokens = min(_boost, _boost_cap) continue # Guard: if all retries exhausted without a successful response diff --git a/run_agent.py b/run_agent.py index 34d112cb34e..bb93ff682a9 100644 --- a/run_agent.py +++ b/run_agent.py @@ -1206,6 +1206,21 @@ class AIAgent: return {"max_completion_tokens": value} return {"max_tokens": value} + @staticmethod + def _requested_output_cap_from_api_kwargs(api_kwargs: Any) -> Optional[int]: + """Extract the outgoing response token cap from a prepared request.""" + if not isinstance(api_kwargs, dict): + return None + for key in ("max_output_tokens", "max_completion_tokens", "max_tokens"): + raw = api_kwargs.get(key) + try: + value = int(raw) + except (TypeError, ValueError): + continue + if value > 0: + return value + return None + def _has_content_after_think_block(self, content: str) -> bool: """ Check if content has actual text after any reasoning/thinking blocks. diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index a495b718320..63657a73001 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -3699,6 +3699,36 @@ class TestRunConversation: assert second_call_messages[-1]["role"] == "user" assert "truncated by the output length limit" in second_call_messages[-1]["content"] + def test_length_continuation_preserves_large_provider_default_output_cap(self, agent): + """Continuation retries must not shrink a higher provider default cap.""" + self._setup_agent(agent) + agent.max_tokens = None + requested_caps = [] + + def _fake_build_api_kwargs(api_messages): + ephemeral = getattr(agent, "_ephemeral_max_output_tokens", None) + if ephemeral is not None: + agent._ephemeral_max_output_tokens = None + cap = ephemeral if ephemeral is not None else 65536 + requested_caps.append(cap) + return {"model": agent.model, "messages": api_messages, "max_tokens": cap} + + first = _mock_response(content="Part 1 ", finish_reason="length") + second = _mock_response(content="Part 2", finish_reason="stop") + agent.client.chat.completions.create.side_effect = [first, second] + + with ( + patch.object(agent, "_build_api_kwargs", side_effect=_fake_build_api_kwargs), + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + result = agent.run_conversation("hello") + + assert result["completed"] is True + assert result["final_response"] == "Part 1 Part 2" + assert requested_caps == [65536, 65536] + def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent): """Ollama-hosted GLM responses can misreport truncated output as stop.""" self._setup_agent(agent) @@ -3877,7 +3907,8 @@ class TestRunConversation: def test_truncated_tool_call_retries_once_before_refusing(self, agent): """When tool call args are truncated, the agent retries the API call - once. If the retry succeeds (valid JSON args), tool execution proceeds.""" + (up to 3 times). If a retry succeeds (valid JSON args), tool execution + proceeds.""" self._setup_agent(agent) agent.valid_tool_names.add("write_file") bad_tc = _mock_tool_call( @@ -3914,6 +3945,48 @@ class TestRunConversation: mock_hfc.assert_called_once() assert result["final_response"] == "Done!" + def test_stub_stall_mid_tool_call_recovers_within_3_retries(self, agent): + """A network stream stall mid tool-call (PARTIAL_STREAM_STUB_ID) must + retry up to 3 times rather than hard-failing after one — and recover + if a retry produces a complete tool call. Regression for the false + 'model hit max output tokens' on Opus when the stream simply dropped.""" + from hermes_constants import PARTIAL_STREAM_STUB_ID + + self._setup_agent(agent) + agent.valid_tool_names.add("write_file") + bad_tc = _mock_tool_call( + name="write_file", + arguments='{"path":"report.md","content":"partial', + call_id="c1", + ) + # Two consecutive stub-stall responses, then a clean tool call. + stall1 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc]) + stall1.id = PARTIAL_STREAM_STUB_ID + stall2 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc]) + stall2.id = PARTIAL_STREAM_STUB_ID + good_tc = _mock_tool_call( + name="write_file", + arguments='{"path":"report.md","content":"full content"}', + call_id="c2", + ) + good_resp = _mock_response(content="", finish_reason="stop", tool_calls=[good_tc]) + final_resp = _mock_response(content="Done!", finish_reason="stop") + + with ( + patch("run_agent.handle_function_call", return_value='{"success":true}') as mock_hfc, + patch.object(agent, "_persist_session"), + patch.object(agent, "_save_trajectory"), + patch.object(agent, "_cleanup_task_resources"), + ): + agent.client.chat.completions.create.side_effect = [ + stall1, stall2, good_resp, final_resp, + ] + result = agent.run_conversation("write the report") + + # Recovered on the 3rd attempt instead of refusing after the 1st. + mock_hfc.assert_called_once() + assert result["final_response"] == "Done!" + def test_truncated_tool_args_detected_when_finish_reason_not_length(self, agent): """When a router rewrites finish_reason from 'length' to 'tool_calls', truncated JSON arguments should still be detected and refused rather