From c9094f5e5fcf579afe6870817c02d79821ec15fd Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:56:10 -0700 Subject: [PATCH] fix(stream): don't report dropped mid-tool-call streams as output truncation (#42314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(stream): don't report dropped mid-tool-call streams as output truncation A streaming tool call whose SSE ends with no finish_reason (the upstream delivers the tool name + opening '{' then closes the connection cleanly, no terminator, no [DONE]) was stamped finish_reason='length' by the mock builder. That routed it through the output-cap truncation path: 3 useless max_tokens-boosted retries, then the misleading 'Response truncated due to output length limit' error — even though the model never reported hitting any cap. Reproduced live on nvidia/nemotron-3-ultra:free via the Nous dedicated endpoint, which stalls/drops during large tool-arg generation (50s-4m41s). Now: when tool args are incomplete AND the provider sent no finish_reason, tag the response as a partial-stream stub so the loop reports an honest mid-tool-call drop and asks the model to chunk its output (existing continuation machinery), instead of escalating output budget and lying. A provider-reported finish_reason='length' still takes the real-truncation path unchanged. * test(stream): update truncated-tool-args test for drop-vs-cap split test_truncated_tool_call_args_upgrade_finish_reason_to_length pinned the old behaviour where ANY incomplete tool args → finish_reason='length' with tool_calls preserved. That single-chunk-no-finish_reason scenario is exactly the mid-tool-call stream drop now reclassified as a partial-stream stub. Split into two tests matching the new contract: - no finish_reason + incomplete args → PARTIAL_STREAM_STUB_ID, tool_calls=None, _dropped_tool_names set (the drop path) - explicit finish_reason='length' + incomplete args → tool_calls preserved, 'length' upgrade unchanged (the genuine output-cap path) --- agent/chat_completion_helpers.py | 52 ++++++++++ .../test_partial_stream_finish_reason.py | 95 +++++++++++++++++++ tests/run_agent/test_run_agent.py | 28 +++++- 3 files changed, 174 insertions(+), 1 deletion(-) diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py index 3f483789ede..ce066d55640 100644 --- a/agent/chat_completion_helpers.py +++ b/agent/chat_completion_helpers.py @@ -1986,6 +1986,58 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta= "(possible upstream error or malformed SSE response)." ) + # A stream that delivered a tool call but only partial/unparseable + # JSON args splits into two very different cases: + # + # 1. Provider sent finish_reason="length" → a genuine output-cap + # truncation. Boosting max_tokens on retry is the right move. + # + # 2. Provider sent NO finish_reason (the SSE simply stopped after + # the opening "{" with no terminator and no [DONE]) → the + # upstream dropped/stalled the connection mid tool-call. This + # is NOT an output cap — the model never reported hitting one. + # Some dedicated endpoints (e.g. NVIDIA Nemotron Ultra on the + # Nous dedicated endpoint) stall for minutes during large + # tool-arg generation, then close the stream cleanly without a + # finish_reason. Stamping "length" here sends it down the + # max_tokens-boost truncation path, which retries 3× to no + # effect and finally reports the misleading "Response truncated + # due to output length limit" — the red herring this guards + # against. Route it through the partial-stream-stub path + # instead so the loop reports an honest mid-tool-call stream + # drop and fails fast rather than escalating output budget. + _tool_args_dropped_no_finish = has_truncated_tool_args and finish_reason is None + if _tool_args_dropped_no_finish: + _dropped_names = [ + (tool_calls_acc[idx]["function"]["name"] or "?") + for idx in sorted(tool_calls_acc) + ] + logger.warning( + "Stream ended with no finish_reason while a tool call's " + "arguments were still incomplete (tools=%s); treating as a " + "mid-tool-call stream drop, not an output-length truncation.", + _dropped_names, + ) + full_reasoning = "".join(reasoning_parts) or None + mock_message = SimpleNamespace( + role=role, + content=full_content, + tool_calls=None, + reasoning_content=full_reasoning, + ) + mock_choice = SimpleNamespace( + index=0, + message=mock_message, + finish_reason=FINISH_REASON_LENGTH, + ) + return SimpleNamespace( + id=PARTIAL_STREAM_STUB_ID, + model=model_name, + choices=[mock_choice], + usage=usage_obj, + _dropped_tool_names=_dropped_names or None, + ) + effective_finish_reason = finish_reason or "stop" if has_truncated_tool_args: effective_finish_reason = "length" diff --git a/tests/run_agent/test_partial_stream_finish_reason.py b/tests/run_agent/test_partial_stream_finish_reason.py index 77aea3353e2..80474a97310 100644 --- a/tests/run_agent/test_partial_stream_finish_reason.py +++ b/tests/run_agent/test_partial_stream_finish_reason.py @@ -136,6 +136,101 @@ class TestPartialStreamStubFinishReason: assert "write_file" in content +# ── Clean stream-end mid-tool-call (no exception, no finish_reason) ───────── + +class TestCleanStreamEndMidToolCall: + """The upstream closes the SSE stream cleanly after delivering a tool + name + the opening '{' of its arguments — NO exception, NO finish_reason, + NO [DONE]. Observed live on NVIDIA Nemotron Ultra via the Nous dedicated + endpoint: it stalls/drops during large tool-arg generation. + + The mock-builder must NOT stamp this as finish_reason='length' (which + routes it through the max_tokens-boost truncation path and finally + reports the misleading 'Response truncated due to output length limit'). + It must route through the partial-stream-stub path so the loop reports + an honest mid-tool-call drop and asks the model to chunk its output. + """ + + @patch("run_agent.AIAgent._create_request_openai_client") + @patch("run_agent.AIAgent._close_request_openai_client") + def test_no_finish_reason_partial_tool_args_routes_to_stub( + self, _mock_close, mock_create, monkeypatch, + ): + def _clean_ending_stream(): + # Reasoning + tool name + the lone opening brace, then the + # generator simply RETURNS (StopIteration) — no raise, no + # finish_reason chunk, no [DONE]. + yield _make_stream_chunk(content="\n") + yield _make_stream_chunk(tool_calls=[ + _make_tool_call_delta(index=0, tc_id="call_x", name="execute_code"), + ]) + yield _make_stream_chunk(tool_calls=[ + _make_tool_call_delta(index=0, arguments="{"), + ]) + # falls off the end — clean close, no terminator + + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = ( + lambda *a, **kw: _clean_ending_stream() + ) + mock_create.return_value = mock_client + + agent = _make_agent() + agent._fire_stream_delta = lambda text: None + + response = agent._interruptible_streaming_api_call({}) + + assert response.id == PARTIAL_STREAM_STUB_ID, ( + "A clean stream-end mid tool-call (no finish_reason) must be " + "tagged as a partial-stream stub, not a 'stream-' " + "truncation — otherwise the loop reports the false 'output " + "length limit' error." + ) + assert response.choices[0].finish_reason == FINISH_REASON_LENGTH + assert response.choices[0].message.tool_calls is None, ( + "Incomplete tool args must never auto-execute." + ) + assert getattr(response, "_dropped_tool_names", None) == ["execute_code"] + + @patch("run_agent.AIAgent._create_request_openai_client") + @patch("run_agent.AIAgent._close_request_openai_client") + def test_real_length_truncation_still_uses_uuid_id( + self, _mock_close, mock_create, monkeypatch, + ): + """Control: when the provider DOES send finish_reason='length' with + partial tool args, it is a genuine output cap — keep the existing + non-stub behaviour (boost max_tokens and retry).""" + + def _capped_stream(): + yield _make_stream_chunk(tool_calls=[ + _make_tool_call_delta(index=0, tc_id="call_y", name="execute_code"), + ]) + yield _make_stream_chunk(tool_calls=[ + _make_tool_call_delta(index=0, arguments="{"), + ]) + # Provider explicitly reports the output cap. + yield _make_stream_chunk(finish_reason="length") + + mock_client = MagicMock() + mock_client.chat.completions.create.side_effect = ( + lambda *a, **kw: _capped_stream() + ) + mock_create.return_value = mock_client + + agent = _make_agent() + agent._fire_stream_delta = lambda text: None + + response = agent._interruptible_streaming_api_call({}) + + assert response.id != PARTIAL_STREAM_STUB_ID, ( + "A provider-reported finish_reason='length' is a real output cap " + "and must keep the existing truncation path, not the stream-drop " + "stub path." + ) + assert response.id.startswith("stream-") + assert response.choices[0].finish_reason == FINISH_REASON_LENGTH + + # ── Length-continuation prompt branching ────────────────────────────────── class TestLengthContinuationPromptBranching: diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 884f9995ac1..72363176d61 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -5788,7 +5788,15 @@ class TestStreamingApiCall: assert tc[0].function.name == "search" assert tc[1].function.name == "read" - def test_truncated_tool_call_args_upgrade_finish_reason_to_length(self, agent): + def test_truncated_tool_call_args_no_finish_reason_routes_to_stub(self, agent): + # Stream delivers a tool call with incomplete JSON args and then ENDS + # with no finish_reason (the SSE just stops — no terminator, no + # [DONE]). This is an upstream mid-tool-call drop, NOT an output cap. + # The builder must route it through the partial-stream-stub path + # (id=PARTIAL_STREAM_STUB_ID, tool_calls=None so it can't execute, + # finish_reason=length so the loop's continuation machinery fires with + # chunking guidance) rather than stamping a normal 'length' truncation. + from hermes_constants import PARTIAL_STREAM_STUB_ID chunks = [ _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]), ] @@ -5796,6 +5804,24 @@ class TestStreamingApiCall: resp = agent._interruptible_streaming_api_call({"messages": []}) + assert resp.id == PARTIAL_STREAM_STUB_ID + assert resp.choices[0].finish_reason == "length" + assert resp.choices[0].message.tool_calls is None + assert getattr(resp, "_dropped_tool_names", None) == ["write_file"] + + def test_truncated_tool_call_args_with_length_finish_reason_upgrades(self, agent): + # Control: when the provider explicitly reports finish_reason='length' + # alongside incomplete tool args, it IS a genuine output cap. Keep the + # existing behaviour — tool_calls preserved, finish_reason 'length' — + # so the max_tokens-boost truncation retry path still applies. + chunks = [ + _make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]), + _make_chunk(finish_reason="length"), + ] + agent.client.chat.completions.create.return_value = iter(chunks) + + resp = agent._interruptible_streaming_api_call({"messages": []}) + tc = resp.choices[0].message.tool_calls assert len(tc) == 1 assert tc[0].function.name == "write_file"