fix(agent): stop reporting broken streams as output-length truncation (#36705)

A stream that drops mid-response after tokens are delivered (peer-closed connection, stale-stream reconnect) is converted into a synthetic finish_reason="length" stub. The conversation loop treated that network stall as a max-output-tokens truncation: when the dropped content was a tool call it retried exactly once, then hard-failed with "Response truncated due to output length limit" — even on large-output models that never hit any cap (e.g. Opus). - Tool-call truncation now retries up to 3 times (was 1) with a progressive max_tokens boost, and is stub-aware: a PARTIAL_STREAM_STUB_ID stall prints "Stream interrupted mid tool-call — retrying (n/3)" instead of the false "model hit max output tokens", and the give-up message distinguishes a network drop from a real truncation. - Length-continuation retries preserve the original request's output cap as a floor, so a high provider/model default isn't silently downshifted to 8K/12K on retry. - Added _requested_output_cap_from_api_kwargs() helper. Tests: stub-stall mid-tool-call recovery within 3 retries; continuation preserves a large provider-default output cap. Fixes #26425. Salvages the substance of #26427 (cap floor) and #9525 (retry bump), adapted to the post-refactor conversation_loop.py which handles all three api_modes uniformly. Co-authored-by: LeonSGP43 <cine.dreamer.one@gmail.com> Co-authored-by: ygd58 <ygd58@users.noreply.github.com>
2026-07-24 16:54:43 +00:00 · 2026-06-01 03:01:20 -07:00 · 2026-06-01 03:01:20 -07:00 · 023149f665
commit 023149f665
parent b571ec298d
3 changed files with 143 additions and 11 deletions
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@ -1739,20 +1739,52 @@ def run_conversation(
                    if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                        assistant_message = _trunc_msg
                        if assistant_message is not None and _trunc_has_tool_calls:
-                            if truncated_tool_call_retries < 1:
+                            _is_stub_stall = (
+                                getattr(response, "id", "") == PARTIAL_STREAM_STUB_ID
+                            )
+                            if truncated_tool_call_retries < 3:
                                truncated_tool_call_retries += 1
-                                agent._buffer_vprint(
-                                    f"⚠️  Truncated tool call detected — retrying API call..."
-                                )
+                                if _is_stub_stall:
+                                    # The stream broke mid tool-call (network /
+                                    # peer-closed connection), not a real output
+                                    # cap — say so instead of "max output tokens".
+                                    agent._buffer_vprint(
+                                        f"⚠️  Stream interrupted mid tool-call — "
+                                        f"retrying ({truncated_tool_call_retries}/3)..."
+                                    )
+                                else:
+                                    agent._buffer_vprint(
+                                        f"⚠️  Truncated tool call detected — "
+                                        f"retrying API call "
+                                        f"({truncated_tool_call_retries}/3)..."
+                                    )
+                                # Boost max_tokens on each retry so the model has
+                                # more room to complete the tool-call JSON. A
+                                # network stall doesn't need a bigger budget, but
+                                # a genuine output-cap truncation does, and the
+                                # boost is harmless for the stall case.
+                                _tc_boost_base = agent.max_tokens if agent.max_tokens else 4096
+                                _tc_boost = _tc_boost_base * (truncated_tool_call_retries + 1)
+                                _tc_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
+                                if _tc_requested_cap is not None:
+                                    _tc_boost = max(_tc_boost, _tc_requested_cap)
+                                _tc_boost_cap = max(32768, _tc_requested_cap or 0)
+                                agent._ephemeral_max_output_tokens = min(_tc_boost, _tc_boost_cap)
                                # Don't append the broken response to messages;
                                # just re-run the same API call from the current
                                # message state, giving the model another chance.
                                continue
                            agent._flush_status_buffer()
-                            agent._vprint(
-                                f"{agent.log_prefix}⚠️  Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
-                                force=True,
-                            )
+                            if _is_stub_stall:
+                                agent._vprint(
+                                    f"{agent.log_prefix}⚠️  Stream kept dropping mid tool-call after 3 retries — the action was not executed.",
+                                    force=True,
+                                )
+                            else:
+                                agent._vprint(
+                                    f"{agent.log_prefix}⚠️  Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
+                                    force=True,
+                                )
                            agent._cleanup_task_resources(effective_task_id)
                            agent._persist_session(messages, conversation_history)
                            return {
@ -1761,7 +1793,12 @@ def run_conversation(
                                "api_calls": api_call_count,
                                "completed": False,
                                "partial": True,
-                                "error": "Response truncated due to output length limit",
+                                "error": (
+                                    "Stream repeatedly dropped mid tool-call (network); "
+                                    "the tool was not executed"
+                                    if _is_stub_stall
+                                    else "Response truncated due to output length limit"
+                                ),
                            }

                    # If we have prior messages, roll back to last complete state
@ -3412,9 +3449,16 @@ def run_conversation(
            # Progressively boost the output token budget on each retry.
            # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
            # Applies to all providers via _ephemeral_max_output_tokens.
+            # If the original request already used a larger provider/model
+            # default budget, keep that floor so continuation retries do
+            # not accidentally downshift to a much smaller cap.
            _boost_base = agent.max_tokens if agent.max_tokens else 4096
            _boost = _boost_base * (length_continue_retries + 1)
-            agent._ephemeral_max_output_tokens = min(_boost, 32768)
+            _requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
+            if _requested_cap is not None:
+                _boost = max(_boost, _requested_cap)
+            _boost_cap = max(32768, _requested_cap or 0)
+            agent._ephemeral_max_output_tokens = min(_boost, _boost_cap)
            continue

        # Guard: if all retries exhausted without a successful response
--- a/run_agent.py
+++ b/run_agent.py
@ -1206,6 +1206,21 @@ class AIAgent:
            return {"max_completion_tokens": value}
        return {"max_tokens": value}

+    @staticmethod
+    def _requested_output_cap_from_api_kwargs(api_kwargs: Any) -> Optional[int]:
+        """Extract the outgoing response token cap from a prepared request."""
+        if not isinstance(api_kwargs, dict):
+            return None
+        for key in ("max_output_tokens", "max_completion_tokens", "max_tokens"):
+            raw = api_kwargs.get(key)
+            try:
+                value = int(raw)
+            except (TypeError, ValueError):
+                continue
+            if value > 0:
+                return value
+        return None
+
    def _has_content_after_think_block(self, content: str) -> bool:
        """
        Check if content has actual text after any reasoning/thinking blocks.
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -3699,6 +3699,36 @@ class TestRunConversation:
        assert second_call_messages[-1]["role"] == "user"
        assert "truncated by the output length limit" in second_call_messages[-1]["content"]

+    def test_length_continuation_preserves_large_provider_default_output_cap(self, agent):
+        """Continuation retries must not shrink a higher provider default cap."""
+        self._setup_agent(agent)
+        agent.max_tokens = None
+        requested_caps = []
+
+        def _fake_build_api_kwargs(api_messages):
+            ephemeral = getattr(agent, "_ephemeral_max_output_tokens", None)
+            if ephemeral is not None:
+                agent._ephemeral_max_output_tokens = None
+            cap = ephemeral if ephemeral is not None else 65536
+            requested_caps.append(cap)
+            return {"model": agent.model, "messages": api_messages, "max_tokens": cap}
+
+        first = _mock_response(content="Part 1 ", finish_reason="length")
+        second = _mock_response(content="Part 2", finish_reason="stop")
+        agent.client.chat.completions.create.side_effect = [first, second]
+
+        with (
+            patch.object(agent, "_build_api_kwargs", side_effect=_fake_build_api_kwargs),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is True
+        assert result["final_response"] == "Part 1 Part 2"
+        assert requested_caps == [65536, 65536]
+
    def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
        """Ollama-hosted GLM responses can misreport truncated output as stop."""
        self._setup_agent(agent)
@ -3877,7 +3907,8 @@ class TestRunConversation:

    def test_truncated_tool_call_retries_once_before_refusing(self, agent):
        """When tool call args are truncated, the agent retries the API call
-        once. If the retry succeeds (valid JSON args), tool execution proceeds."""
+        (up to 3 times). If a retry succeeds (valid JSON args), tool execution
+        proceeds."""
        self._setup_agent(agent)
        agent.valid_tool_names.add("write_file")
        bad_tc = _mock_tool_call(
@ -3914,6 +3945,48 @@ class TestRunConversation:
        mock_hfc.assert_called_once()
        assert result["final_response"] == "Done!"

+    def test_stub_stall_mid_tool_call_recovers_within_3_retries(self, agent):
+        """A network stream stall mid tool-call (PARTIAL_STREAM_STUB_ID) must
+        retry up to 3 times rather than hard-failing after one — and recover
+        if a retry produces a complete tool call. Regression for the false
+        'model hit max output tokens' on Opus when the stream simply dropped."""
+        from hermes_constants import PARTIAL_STREAM_STUB_ID
+
+        self._setup_agent(agent)
+        agent.valid_tool_names.add("write_file")
+        bad_tc = _mock_tool_call(
+            name="write_file",
+            arguments='{"path":"report.md","content":"partial',
+            call_id="c1",
+        )
+        # Two consecutive stub-stall responses, then a clean tool call.
+        stall1 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
+        stall1.id = PARTIAL_STREAM_STUB_ID
+        stall2 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
+        stall2.id = PARTIAL_STREAM_STUB_ID
+        good_tc = _mock_tool_call(
+            name="write_file",
+            arguments='{"path":"report.md","content":"full content"}',
+            call_id="c2",
+        )
+        good_resp = _mock_response(content="", finish_reason="stop", tool_calls=[good_tc])
+        final_resp = _mock_response(content="Done!", finish_reason="stop")
+
+        with (
+            patch("run_agent.handle_function_call", return_value='{"success":true}') as mock_hfc,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            agent.client.chat.completions.create.side_effect = [
+                stall1, stall2, good_resp, final_resp,
+            ]
+            result = agent.run_conversation("write the report")
+
+        # Recovered on the 3rd attempt instead of refusing after the 1st.
+        mock_hfc.assert_called_once()
+        assert result["final_response"] == "Done!"
+
    def test_truncated_tool_args_detected_when_finish_reason_not_length(self, agent):
        """When a router rewrites finish_reason from 'length' to 'tool_calls',
        truncated JSON arguments should still be detected and refused rather