fix(stream): don't report dropped mid-tool-call streams as output truncation (#42314)

* fix(stream): don't report dropped mid-tool-call streams as output truncation A streaming tool call whose SSE ends with no finish_reason (the upstream delivers the tool name + opening '{' then closes the connection cleanly, no terminator, no [DONE]) was stamped finish_reason='length' by the mock builder. That routed it through the output-cap truncation path: 3 useless max_tokens-boosted retries, then the misleading 'Response truncated due to output length limit' error — even though the model never reported hitting any cap. Reproduced live on nvidia/nemotron-3-ultra:free via the Nous dedicated endpoint, which stalls/drops during large tool-arg generation (50s-4m41s). Now: when tool args are incomplete AND the provider sent no finish_reason, tag the response as a partial-stream stub so the loop reports an honest mid-tool-call drop and asks the model to chunk its output (existing continuation machinery), instead of escalating output budget and lying. A provider-reported finish_reason='length' still takes the real-truncation path unchanged. * test(stream): update truncated-tool-args test for drop-vs-cap split test_truncated_tool_call_args_upgrade_finish_reason_to_length pinned the old behaviour where ANY incomplete tool args → finish_reason='length' with tool_calls preserved. That single-chunk-no-finish_reason scenario is exactly the mid-tool-call stream drop now reclassified as a partial-stream stub. Split into two tests matching the new contract: - no finish_reason + incomplete args → PARTIAL_STREAM_STUB_ID, tool_calls=None, _dropped_tool_names set (the drop path) - explicit finish_reason='length' + incomplete args → tool_calls preserved, 'length' upgrade unchanged (the genuine output-cap path)
2026-06-09 08:21:50 +00:00 · 2026-06-08 11:56:10 -07:00 · 2026-06-08 11:56:10 -07:00 · c9094f5e5f
commit c9094f5e5f
parent 89d380261d
3 changed files with 174 additions and 1 deletions
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -1986,6 +1986,58 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                "(possible upstream error or malformed SSE response)."
            )

+        # A stream that delivered a tool call but only partial/unparseable
+        # JSON args splits into two very different cases:
+        #
+        #   1. Provider sent finish_reason="length" → a genuine output-cap
+        #      truncation.  Boosting max_tokens on retry is the right move.
+        #
+        #   2. Provider sent NO finish_reason (the SSE simply stopped after
+        #      the opening "{" with no terminator and no [DONE]) → the
+        #      upstream dropped/stalled the connection mid tool-call.  This
+        #      is NOT an output cap — the model never reported hitting one.
+        #      Some dedicated endpoints (e.g. NVIDIA Nemotron Ultra on the
+        #      Nous dedicated endpoint) stall for minutes during large
+        #      tool-arg generation, then close the stream cleanly without a
+        #      finish_reason.  Stamping "length" here sends it down the
+        #      max_tokens-boost truncation path, which retries 3× to no
+        #      effect and finally reports the misleading "Response truncated
+        #      due to output length limit" — the red herring this guards
+        #      against.  Route it through the partial-stream-stub path
+        #      instead so the loop reports an honest mid-tool-call stream
+        #      drop and fails fast rather than escalating output budget.
+        _tool_args_dropped_no_finish = has_truncated_tool_args and finish_reason is None
+        if _tool_args_dropped_no_finish:
+            _dropped_names = [
+                (tool_calls_acc[idx]["function"]["name"] or "?")
+                for idx in sorted(tool_calls_acc)
+            ]
+            logger.warning(
+                "Stream ended with no finish_reason while a tool call's "
+                "arguments were still incomplete (tools=%s); treating as a "
+                "mid-tool-call stream drop, not an output-length truncation.",
+                _dropped_names,
+            )
+            full_reasoning = "".join(reasoning_parts) or None
+            mock_message = SimpleNamespace(
+                role=role,
+                content=full_content,
+                tool_calls=None,
+                reasoning_content=full_reasoning,
+            )
+            mock_choice = SimpleNamespace(
+                index=0,
+                message=mock_message,
+                finish_reason=FINISH_REASON_LENGTH,
+            )
+            return SimpleNamespace(
+                id=PARTIAL_STREAM_STUB_ID,
+                model=model_name,
+                choices=[mock_choice],
+                usage=usage_obj,
+                _dropped_tool_names=_dropped_names or None,
+            )
+
        effective_finish_reason = finish_reason or "stop"
        if has_truncated_tool_args:
            effective_finish_reason = "length"
--- a/tests/run_agent/test_partial_stream_finish_reason.py
+++ b/tests/run_agent/test_partial_stream_finish_reason.py
@ -136,6 +136,101 @@ class TestPartialStreamStubFinishReason:
        assert "write_file" in content


+# ── Clean stream-end mid-tool-call (no exception, no finish_reason) ─────────
+
+class TestCleanStreamEndMidToolCall:
+    """The upstream closes the SSE stream cleanly after delivering a tool
+    name + the opening '{' of its arguments — NO exception, NO finish_reason,
+    NO [DONE].  Observed live on NVIDIA Nemotron Ultra via the Nous dedicated
+    endpoint: it stalls/drops during large tool-arg generation.
+
+    The mock-builder must NOT stamp this as finish_reason='length' (which
+    routes it through the max_tokens-boost truncation path and finally
+    reports the misleading 'Response truncated due to output length limit').
+    It must route through the partial-stream-stub path so the loop reports
+    an honest mid-tool-call drop and asks the model to chunk its output.
+    """
+
+    @patch("run_agent.AIAgent._create_request_openai_client")
+    @patch("run_agent.AIAgent._close_request_openai_client")
+    def test_no_finish_reason_partial_tool_args_routes_to_stub(
+        self, _mock_close, mock_create, monkeypatch,
+    ):
+        def _clean_ending_stream():
+            # Reasoning + tool name + the lone opening brace, then the
+            # generator simply RETURNS (StopIteration) — no raise, no
+            # finish_reason chunk, no [DONE].
+            yield _make_stream_chunk(content="\n")
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_x", name="execute_code"),
+            ])
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, arguments="{"),
+            ])
+            # falls off the end — clean close, no terminator
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create.side_effect = (
+            lambda *a, **kw: _clean_ending_stream()
+        )
+        mock_create.return_value = mock_client
+
+        agent = _make_agent()
+        agent._fire_stream_delta = lambda text: None
+
+        response = agent._interruptible_streaming_api_call({})
+
+        assert response.id == PARTIAL_STREAM_STUB_ID, (
+            "A clean stream-end mid tool-call (no finish_reason) must be "
+            "tagged as a partial-stream stub, not a 'stream-<uuid>' "
+            "truncation — otherwise the loop reports the false 'output "
+            "length limit' error."
+        )
+        assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
+        assert response.choices[0].message.tool_calls is None, (
+            "Incomplete tool args must never auto-execute."
+        )
+        assert getattr(response, "_dropped_tool_names", None) == ["execute_code"]
+
+    @patch("run_agent.AIAgent._create_request_openai_client")
+    @patch("run_agent.AIAgent._close_request_openai_client")
+    def test_real_length_truncation_still_uses_uuid_id(
+        self, _mock_close, mock_create, monkeypatch,
+    ):
+        """Control: when the provider DOES send finish_reason='length' with
+        partial tool args, it is a genuine output cap — keep the existing
+        non-stub behaviour (boost max_tokens and retry)."""
+
+        def _capped_stream():
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, tc_id="call_y", name="execute_code"),
+            ])
+            yield _make_stream_chunk(tool_calls=[
+                _make_tool_call_delta(index=0, arguments="{"),
+            ])
+            # Provider explicitly reports the output cap.
+            yield _make_stream_chunk(finish_reason="length")
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create.side_effect = (
+            lambda *a, **kw: _capped_stream()
+        )
+        mock_create.return_value = mock_client
+
+        agent = _make_agent()
+        agent._fire_stream_delta = lambda text: None
+
+        response = agent._interruptible_streaming_api_call({})
+
+        assert response.id != PARTIAL_STREAM_STUB_ID, (
+            "A provider-reported finish_reason='length' is a real output cap "
+            "and must keep the existing truncation path, not the stream-drop "
+            "stub path."
+        )
+        assert response.id.startswith("stream-")
+        assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
+
+
 # ── Length-continuation prompt branching ──────────────────────────────────

 class TestLengthContinuationPromptBranching:
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -5788,7 +5788,15 @@ class TestStreamingApiCall:
        assert tc[0].function.name == "search"
        assert tc[1].function.name == "read"

-    def test_truncated_tool_call_args_upgrade_finish_reason_to_length(self, agent):
+    def test_truncated_tool_call_args_no_finish_reason_routes_to_stub(self, agent):
+        # Stream delivers a tool call with incomplete JSON args and then ENDS
+        # with no finish_reason (the SSE just stops — no terminator, no
+        # [DONE]).  This is an upstream mid-tool-call drop, NOT an output cap.
+        # The builder must route it through the partial-stream-stub path
+        # (id=PARTIAL_STREAM_STUB_ID, tool_calls=None so it can't execute,
+        # finish_reason=length so the loop's continuation machinery fires with
+        # chunking guidance) rather than stamping a normal 'length' truncation.
+        from hermes_constants import PARTIAL_STREAM_STUB_ID
        chunks = [
            _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
        ]
@ -5796,6 +5804,24 @@ class TestStreamingApiCall:

        resp = agent._interruptible_streaming_api_call({"messages": []})

+        assert resp.id == PARTIAL_STREAM_STUB_ID
+        assert resp.choices[0].finish_reason == "length"
+        assert resp.choices[0].message.tool_calls is None
+        assert getattr(resp, "_dropped_tool_names", None) == ["write_file"]
+
+    def test_truncated_tool_call_args_with_length_finish_reason_upgrades(self, agent):
+        # Control: when the provider explicitly reports finish_reason='length'
+        # alongside incomplete tool args, it IS a genuine output cap.  Keep the
+        # existing behaviour — tool_calls preserved, finish_reason 'length' —
+        # so the max_tokens-boost truncation retry path still applies.
+        chunks = [
+            _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
+            _make_chunk(finish_reason="length"),
+        ]
+        agent.client.chat.completions.create.return_value = iter(chunks)
+
+        resp = agent._interruptible_streaming_api_call({"messages": []})
+
        tc = resp.choices[0].message.tool_calls
        assert len(tc) == 1
        assert tc[0].function.name == "write_file"