From 023149f665bc2dc87dc200a6d3ce6fb091a2076b Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 1 Jun 2026 03:01:20 -0700
Subject: [PATCH] fix(agent): stop reporting broken streams as output-length
 truncation (#36705)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A stream that drops mid-response after tokens are delivered (peer-closed
connection, stale-stream reconnect) is converted into a synthetic
finish_reason="length" stub. The conversation loop treated that network
stall as a max-output-tokens truncation: when the dropped content was a
tool call it retried exactly once, then hard-failed with "Response
truncated due to output length limit" — even on large-output models that
never hit any cap (e.g. Opus).

- Tool-call truncation now retries up to 3 times (was 1) with a
  progressive max_tokens boost, and is stub-aware: a PARTIAL_STREAM_STUB_ID
  stall prints "Stream interrupted mid tool-call — retrying (n/3)" instead
  of the false "model hit max output tokens", and the give-up message
  distinguishes a network drop from a real truncation.
- Length-continuation retries preserve the original request's output cap
  as a floor, so a high provider/model default isn't silently downshifted
  to 8K/12K on retry.
- Added _requested_output_cap_from_api_kwargs() helper.

Tests: stub-stall mid-tool-call recovery within 3 retries; continuation
preserves a large provider-default output cap.

Fixes #26425. Salvages the substance of #26427 (cap floor) and #9525
(retry bump), adapted to the post-refactor conversation_loop.py which
handles all three api_modes uniformly.

Co-authored-by: LeonSGP43 <cine.dreamer.one@gmail.com>
Co-authored-by: ygd58 <ygd58@users.noreply.github.com>
---
 agent/conversation_loop.py        | 64 +++++++++++++++++++++-----
 run_agent.py                      | 15 +++++++
 tests/run_agent/test_run_agent.py | 75 ++++++++++++++++++++++++++++++-
 3 files changed, 143 insertions(+), 11 deletions(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index bb6c6229cdb..743988b03b0 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -1739,20 +1739,52 @@ def run_conversation(
                     if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
                         assistant_message = _trunc_msg
                         if assistant_message is not None and _trunc_has_tool_calls:
-                            if truncated_tool_call_retries < 1:
+                            _is_stub_stall = (
+                                getattr(response, "id", "") == PARTIAL_STREAM_STUB_ID
+                            )
+                            if truncated_tool_call_retries < 3:
                                 truncated_tool_call_retries += 1
-                                agent._buffer_vprint(
-                                    f"⚠️  Truncated tool call detected — retrying API call..."
-                                )
+                                if _is_stub_stall:
+                                    # The stream broke mid tool-call (network /
+                                    # peer-closed connection), not a real output
+                                    # cap — say so instead of "max output tokens".
+                                    agent._buffer_vprint(
+                                        f"⚠️  Stream interrupted mid tool-call — "
+                                        f"retrying ({truncated_tool_call_retries}/3)..."
+                                    )
+                                else:
+                                    agent._buffer_vprint(
+                                        f"⚠️  Truncated tool call detected — "
+                                        f"retrying API call "
+                                        f"({truncated_tool_call_retries}/3)..."
+                                    )
+                                # Boost max_tokens on each retry so the model has
+                                # more room to complete the tool-call JSON. A
+                                # network stall doesn't need a bigger budget, but
+                                # a genuine output-cap truncation does, and the
+                                # boost is harmless for the stall case.
+                                _tc_boost_base = agent.max_tokens if agent.max_tokens else 4096
+                                _tc_boost = _tc_boost_base * (truncated_tool_call_retries + 1)
+                                _tc_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
+                                if _tc_requested_cap is not None:
+                                    _tc_boost = max(_tc_boost, _tc_requested_cap)
+                                _tc_boost_cap = max(32768, _tc_requested_cap or 0)
+                                agent._ephemeral_max_output_tokens = min(_tc_boost, _tc_boost_cap)
                                 # Don't append the broken response to messages;
                                 # just re-run the same API call from the current
                                 # message state, giving the model another chance.
                                 continue
                             agent._flush_status_buffer()
-                            agent._vprint(
-                                f"{agent.log_prefix}⚠️  Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
-                                force=True,
-                            )
+                            if _is_stub_stall:
+                                agent._vprint(
+                                    f"{agent.log_prefix}⚠️  Stream kept dropping mid tool-call after 3 retries — the action was not executed.",
+                                    force=True,
+                                )
+                            else:
+                                agent._vprint(
+                                    f"{agent.log_prefix}⚠️  Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
+                                    force=True,
+                                )
                             agent._cleanup_task_resources(effective_task_id)
                             agent._persist_session(messages, conversation_history)
                             return {
@@ -1761,7 +1793,12 @@ def run_conversation(
                                 "api_calls": api_call_count,
                                 "completed": False,
                                 "partial": True,
-                                "error": "Response truncated due to output length limit",
+                                "error": (
+                                    "Stream repeatedly dropped mid tool-call (network); "
+                                    "the tool was not executed"
+                                    if _is_stub_stall
+                                    else "Response truncated due to output length limit"
+                                ),
                             }
 
                     # If we have prior messages, roll back to last complete state
@@ -3412,9 +3449,16 @@ def run_conversation(
             # Progressively boost the output token budget on each retry.
             # Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
             # Applies to all providers via _ephemeral_max_output_tokens.
+            # If the original request already used a larger provider/model
+            # default budget, keep that floor so continuation retries do
+            # not accidentally downshift to a much smaller cap.
             _boost_base = agent.max_tokens if agent.max_tokens else 4096
             _boost = _boost_base * (length_continue_retries + 1)
-            agent._ephemeral_max_output_tokens = min(_boost, 32768)
+            _requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
+            if _requested_cap is not None:
+                _boost = max(_boost, _requested_cap)
+            _boost_cap = max(32768, _requested_cap or 0)
+            agent._ephemeral_max_output_tokens = min(_boost, _boost_cap)
             continue
 
         # Guard: if all retries exhausted without a successful response
diff --git a/run_agent.py b/run_agent.py
index 34d112cb34e..bb93ff682a9 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -1206,6 +1206,21 @@ class AIAgent:
             return {"max_completion_tokens": value}
         return {"max_tokens": value}
 
+    @staticmethod
+    def _requested_output_cap_from_api_kwargs(api_kwargs: Any) -> Optional[int]:
+        """Extract the outgoing response token cap from a prepared request."""
+        if not isinstance(api_kwargs, dict):
+            return None
+        for key in ("max_output_tokens", "max_completion_tokens", "max_tokens"):
+            raw = api_kwargs.get(key)
+            try:
+                value = int(raw)
+            except (TypeError, ValueError):
+                continue
+            if value > 0:
+                return value
+        return None
+
     def _has_content_after_think_block(self, content: str) -> bool:
         """
         Check if content has actual text after any reasoning/thinking blocks.
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index a495b718320..63657a73001 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -3699,6 +3699,36 @@ class TestRunConversation:
         assert second_call_messages[-1]["role"] == "user"
         assert "truncated by the output length limit" in second_call_messages[-1]["content"]
 
+    def test_length_continuation_preserves_large_provider_default_output_cap(self, agent):
+        """Continuation retries must not shrink a higher provider default cap."""
+        self._setup_agent(agent)
+        agent.max_tokens = None
+        requested_caps = []
+
+        def _fake_build_api_kwargs(api_messages):
+            ephemeral = getattr(agent, "_ephemeral_max_output_tokens", None)
+            if ephemeral is not None:
+                agent._ephemeral_max_output_tokens = None
+            cap = ephemeral if ephemeral is not None else 65536
+            requested_caps.append(cap)
+            return {"model": agent.model, "messages": api_messages, "max_tokens": cap}
+
+        first = _mock_response(content="Part 1 ", finish_reason="length")
+        second = _mock_response(content="Part 2", finish_reason="stop")
+        agent.client.chat.completions.create.side_effect = [first, second]
+
+        with (
+            patch.object(agent, "_build_api_kwargs", side_effect=_fake_build_api_kwargs),
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            result = agent.run_conversation("hello")
+
+        assert result["completed"] is True
+        assert result["final_response"] == "Part 1 Part 2"
+        assert requested_caps == [65536, 65536]
+
     def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
         """Ollama-hosted GLM responses can misreport truncated output as stop."""
         self._setup_agent(agent)
@@ -3877,7 +3907,8 @@ class TestRunConversation:
 
     def test_truncated_tool_call_retries_once_before_refusing(self, agent):
         """When tool call args are truncated, the agent retries the API call
-        once. If the retry succeeds (valid JSON args), tool execution proceeds."""
+        (up to 3 times). If a retry succeeds (valid JSON args), tool execution
+        proceeds."""
         self._setup_agent(agent)
         agent.valid_tool_names.add("write_file")
         bad_tc = _mock_tool_call(
@@ -3914,6 +3945,48 @@ class TestRunConversation:
         mock_hfc.assert_called_once()
         assert result["final_response"] == "Done!"
 
+    def test_stub_stall_mid_tool_call_recovers_within_3_retries(self, agent):
+        """A network stream stall mid tool-call (PARTIAL_STREAM_STUB_ID) must
+        retry up to 3 times rather than hard-failing after one — and recover
+        if a retry produces a complete tool call. Regression for the false
+        'model hit max output tokens' on Opus when the stream simply dropped."""
+        from hermes_constants import PARTIAL_STREAM_STUB_ID
+
+        self._setup_agent(agent)
+        agent.valid_tool_names.add("write_file")
+        bad_tc = _mock_tool_call(
+            name="write_file",
+            arguments='{"path":"report.md","content":"partial',
+            call_id="c1",
+        )
+        # Two consecutive stub-stall responses, then a clean tool call.
+        stall1 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
+        stall1.id = PARTIAL_STREAM_STUB_ID
+        stall2 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
+        stall2.id = PARTIAL_STREAM_STUB_ID
+        good_tc = _mock_tool_call(
+            name="write_file",
+            arguments='{"path":"report.md","content":"full content"}',
+            call_id="c2",
+        )
+        good_resp = _mock_response(content="", finish_reason="stop", tool_calls=[good_tc])
+        final_resp = _mock_response(content="Done!", finish_reason="stop")
+
+        with (
+            patch("run_agent.handle_function_call", return_value='{"success":true}') as mock_hfc,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+        ):
+            agent.client.chat.completions.create.side_effect = [
+                stall1, stall2, good_resp, final_resp,
+            ]
+            result = agent.run_conversation("write the report")
+
+        # Recovered on the 3rd attempt instead of refusing after the 1st.
+        mock_hfc.assert_called_once()
+        assert result["final_response"] == "Done!"
+
     def test_truncated_tool_args_detected_when_finish_reason_not_length(self, agent):
         """When a router rewrites finish_reason from 'length' to 'tool_calls',
         truncated JSON arguments should still be detected and refused rather