From c9094f5e5fcf579afe6870817c02d79821ec15fd Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:56:10 -0700
Subject: [PATCH] fix(stream): don't report dropped mid-tool-call streams as
 output truncation (#42314)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix(stream): don't report dropped mid-tool-call streams as output truncation

A streaming tool call whose SSE ends with no finish_reason (the upstream
delivers the tool name + opening '{' then closes the connection cleanly,
no terminator, no [DONE]) was stamped finish_reason='length' by the mock
builder. That routed it through the output-cap truncation path: 3 useless
max_tokens-boosted retries, then the misleading 'Response truncated due to
output length limit' error — even though the model never reported hitting
any cap.

Reproduced live on nvidia/nemotron-3-ultra:free via the Nous dedicated
endpoint, which stalls/drops during large tool-arg generation (50s-4m41s).

Now: when tool args are incomplete AND the provider sent no finish_reason,
tag the response as a partial-stream stub so the loop reports an honest
mid-tool-call drop and asks the model to chunk its output (existing
continuation machinery), instead of escalating output budget and lying.
A provider-reported finish_reason='length' still takes the real-truncation
path unchanged.

* test(stream): update truncated-tool-args test for drop-vs-cap split

test_truncated_tool_call_args_upgrade_finish_reason_to_length pinned the
old behaviour where ANY incomplete tool args → finish_reason='length' with
tool_calls preserved. That single-chunk-no-finish_reason scenario is exactly
the mid-tool-call stream drop now reclassified as a partial-stream stub.

Split into two tests matching the new contract:
- no finish_reason + incomplete args → PARTIAL_STREAM_STUB_ID, tool_calls=None,
  _dropped_tool_names set (the drop path)
- explicit finish_reason='length' + incomplete args → tool_calls preserved,
  'length' upgrade unchanged (the genuine output-cap path)
---
 agent/chat_completion_helpers.py              | 52 ++++++++++
 .../test_partial_stream_finish_reason.py      | 95 +++++++++++++++++++
 tests/run_agent/test_run_agent.py             | 28 +++++-
 3 files changed, 174 insertions(+), 1 deletion(-)

diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py
index 3f483789ede..ce066d55640 100644
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -1986,6 +1986,58 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                 "(possible upstream error or malformed SSE response)."
             )
 
+        # A stream that delivered a tool call but only partial/unparseable
+        # JSON args splits into two very different cases:
+        #
+        #   1. Provider sent finish_reason="length" → a genuine output-cap
+        #      truncation.  Boosting max_tokens on retry is the right move.
+        #
+        #   2. Provider sent NO finish_reason (the SSE simply stopped after
+        #      the opening "{" with no terminator and no [DONE]) → the
+        #      upstream dropped/stalled the connection mid tool-call.  This
+        #      is NOT an output cap — the model never reported hitting one.
+        #      Some dedicated endpoints (e.g. NVIDIA Nemotron Ultra on the
+        #      Nous dedicated endpoint) stall for minutes during large
+        #      tool-arg generation, then close the stream cleanly without a
+        #      finish_reason.  Stamping "length" here sends it down the
+        #      max_tokens-boost truncation path, which retries 3× to no
+        #      effect and finally reports the misleading "Response truncated
+        #      due to output length limit" — the red herring this guards
+        #      against.  Route it through the partial-stream-stub path
+        #      instead so the loop reports an honest mid-tool-call stream
+        #      drop and fails fast rather than escalating output budget.
+        _tool_args_dropped_no_finish = has_truncated_tool_args and finish_reason is None
+        if _tool_args_dropped_no_finish:
+            _dropped_names = [
+                (tool_calls_acc[idx]["function"]["name"] or "?")
+                for idx in sorted(tool_calls_acc)
+            ]
+            logger.warning(
+                "Stream ended with no finish_reason while a tool call's "
+                "arguments were still incomplete (tools=%s); treating as a "
+                "mid-tool-call stream drop, not an output-length truncation.",
+                _dropped_names,
+            )
+            full_reasoning = "".join(reasoning_parts) or None
+            mock_message = SimpleNamespace(
+                role=role,
+                content=full_content,
+                tool_calls=None,
+                reasoning_content=full_reasoning,
+            )
+            mock_choice = SimpleNamespace(
+                index=0,
+                message=mock_message,
+                finish_reason=FINISH_REASON_LENGTH,
+            )
+            return SimpleNamespace(
+                id=PARTIAL_STREAM_STUB_ID,
+                model=model_name,
+                choices=[mock_choice],
+                usage=usage_obj,
+                _dropped_tool_names=_dropped_names or None,
+            )
+
         effective_finish_reason = finish_reason or "stop"
         if has_truncated_tool_args:
             effective_finish_reason = "length"
diff --git a/tests/run_agent/test_partial_stream_finish_reason.py b/tests/run_agent/test_partial_stream_finish_reason.py
index 77aea3353e2..80474a97310 100644
--- a/tests/run_agent/test_partial_stream_finish_reason.py
+++ b/tests/run_agent/test_partial_stream_finish_reason.py
@@ -136,6 +136,101 @@ class TestPartialStreamStubFinishReason:
         assert "write_file" in content
 
 
+# ── Clean stream-end mid-tool-call (no exception, no finish_reason) ─────────
+
+class TestCleanStreamEndMidToolCall:
+    """The upstream closes the SSE stream cleanly after delivering a tool
+    name + the opening '{' of its arguments — NO exception, NO finish_reason,
+    NO [DONE].  Observed live on NVIDIA Nemotron Ultra via the Nous dedicated
+    endpoint: it stalls/drops during large tool-arg generation.
+
+    The mock-builder must NOT stamp this as finish_reason='length' (which
+    routes it through the max_tokens-boost truncation path and finally
+    reports the misleading 'Response truncated due to output length limit').
+    It must route through the partial-stream-stub path so the loop reports
+    an honest mid-tool-call drop and asks the model to chunk its output.
+    """
+
+    @patch("run_agent.AIAgent._create_request_openai_client")
+    @patch("run_agent.AIAgent._close_request_openai_client")
+    def test_no_finish_reason_partial_tool_args_routes_to_stub(
+        self, _mock_close, mock_create, monkeypatch,
+    ):
+        def _clean_ending_stream():
+            # Reasoning + tool name + the lone opening brace, then the
+            # generator simply RETURNS (StopIteration) — no raise, no
+            # finish_reason chunk, no [DONE].
+            yield _make_stream_chunk(content="\n")
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_x", name="execute_code"),
+            ])
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, arguments="{"),
+            ])
+            # falls off the end — clean close, no terminator
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create.side_effect = (
+            lambda *a, **kw: _clean_ending_stream()
+        )
+        mock_create.return_value = mock_client
+
+        agent = _make_agent()
+        agent._fire_stream_delta = lambda text: None
+
+        response = agent._interruptible_streaming_api_call({})
+
+        assert response.id == PARTIAL_STREAM_STUB_ID, (
+            "A clean stream-end mid tool-call (no finish_reason) must be "
+            "tagged as a partial-stream stub, not a 'stream-<uuid>' "
+            "truncation — otherwise the loop reports the false 'output "
+            "length limit' error."
+        )
+        assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
+        assert response.choices[0].message.tool_calls is None, (
+            "Incomplete tool args must never auto-execute."
+        )
+        assert getattr(response, "_dropped_tool_names", None) == ["execute_code"]
+
+    @patch("run_agent.AIAgent._create_request_openai_client")
+    @patch("run_agent.AIAgent._close_request_openai_client")
+    def test_real_length_truncation_still_uses_uuid_id(
+        self, _mock_close, mock_create, monkeypatch,
+    ):
+        """Control: when the provider DOES send finish_reason='length' with
+        partial tool args, it is a genuine output cap — keep the existing
+        non-stub behaviour (boost max_tokens and retry)."""
+
+        def _capped_stream():
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_y", name="execute_code"),
+            ])
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, arguments="{"),
+            ])
+            # Provider explicitly reports the output cap.
+            yield _make_stream_chunk(finish_reason="length")
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create.side_effect = (
+            lambda *a, **kw: _capped_stream()
+        )
+        mock_create.return_value = mock_client
+
+        agent = _make_agent()
+        agent._fire_stream_delta = lambda text: None
+
+        response = agent._interruptible_streaming_api_call({})
+
+        assert response.id != PARTIAL_STREAM_STUB_ID, (
+            "A provider-reported finish_reason='length' is a real output cap "
+            "and must keep the existing truncation path, not the stream-drop "
+            "stub path."
+        )
+        assert response.id.startswith("stream-")
+        assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
+
+
 # ── Length-continuation prompt branching ──────────────────────────────────
 
 class TestLengthContinuationPromptBranching:
diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 884f9995ac1..72363176d61 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -5788,7 +5788,15 @@ class TestStreamingApiCall:
         assert tc[0].function.name == "search"
         assert tc[1].function.name == "read"
 
-    def test_truncated_tool_call_args_upgrade_finish_reason_to_length(self, agent):
+    def test_truncated_tool_call_args_no_finish_reason_routes_to_stub(self, agent):
+        # Stream delivers a tool call with incomplete JSON args and then ENDS
+        # with no finish_reason (the SSE just stops — no terminator, no
+        # [DONE]).  This is an upstream mid-tool-call drop, NOT an output cap.
+        # The builder must route it through the partial-stream-stub path
+        # (id=PARTIAL_STREAM_STUB_ID, tool_calls=None so it can't execute,
+        # finish_reason=length so the loop's continuation machinery fires with
+        # chunking guidance) rather than stamping a normal 'length' truncation.
+        from hermes_constants import PARTIAL_STREAM_STUB_ID
         chunks = [
             _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
         ]
@@ -5796,6 +5804,24 @@ class TestStreamingApiCall:
 
         resp = agent._interruptible_streaming_api_call({"messages": []})
 
+        assert resp.id == PARTIAL_STREAM_STUB_ID
+        assert resp.choices[0].finish_reason == "length"
+        assert resp.choices[0].message.tool_calls is None
+        assert getattr(resp, "_dropped_tool_names", None) == ["write_file"]
+
+    def test_truncated_tool_call_args_with_length_finish_reason_upgrades(self, agent):
+        # Control: when the provider explicitly reports finish_reason='length'
+        # alongside incomplete tool args, it IS a genuine output cap.  Keep the
+        # existing behaviour — tool_calls preserved, finish_reason 'length' —
+        # so the max_tokens-boost truncation retry path still applies.
+        chunks = [
+            _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
+            _make_chunk(finish_reason="length"),
+        ]
+        agent.client.chat.completions.create.return_value = iter(chunks)
+
+        resp = agent._interruptible_streaming_api_call({"messages": []})
+
         tc = resp.choices[0].message.tool_calls
         assert len(tc) == 1
         assert tc[0].function.name == "write_file"