fix(stream): don't report dropped mid-tool-call streams as output truncation (#42314)

* fix(stream): don't report dropped mid-tool-call streams as output truncation

A streaming tool call whose SSE ends with no finish_reason (the upstream
delivers the tool name + opening '{' then closes the connection cleanly,
no terminator, no [DONE]) was stamped finish_reason='length' by the mock
builder. That routed it through the output-cap truncation path: 3 useless
max_tokens-boosted retries, then the misleading 'Response truncated due to
output length limit' error — even though the model never reported hitting
any cap.

Reproduced live on nvidia/nemotron-3-ultra:free via the Nous dedicated
endpoint, which stalls/drops during large tool-arg generation (50s-4m41s).

Now: when tool args are incomplete AND the provider sent no finish_reason,
tag the response as a partial-stream stub so the loop reports an honest
mid-tool-call drop and asks the model to chunk its output (existing
continuation machinery), instead of escalating output budget and lying.
A provider-reported finish_reason='length' still takes the real-truncation
path unchanged.

* test(stream): update truncated-tool-args test for drop-vs-cap split

test_truncated_tool_call_args_upgrade_finish_reason_to_length pinned the
old behaviour where ANY incomplete tool args → finish_reason='length' with
tool_calls preserved. That single-chunk-no-finish_reason scenario is exactly
the mid-tool-call stream drop now reclassified as a partial-stream stub.

Split into two tests matching the new contract:
- no finish_reason + incomplete args → PARTIAL_STREAM_STUB_ID, tool_calls=None,
  _dropped_tool_names set (the drop path)
- explicit finish_reason='length' + incomplete args → tool_calls preserved,
  'length' upgrade unchanged (the genuine output-cap path)
This commit is contained in:
Teknium 2026-06-08 11:56:10 -07:00 committed by GitHub
parent 89d380261d
commit c9094f5e5f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 174 additions and 1 deletions

View file

@ -1986,6 +1986,58 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
"(possible upstream error or malformed SSE response)."
)
# A stream that delivered a tool call but only partial/unparseable
# JSON args splits into two very different cases:
#
# 1. Provider sent finish_reason="length" → a genuine output-cap
# truncation. Boosting max_tokens on retry is the right move.
#
# 2. Provider sent NO finish_reason (the SSE simply stopped after
# the opening "{" with no terminator and no [DONE]) → the
# upstream dropped/stalled the connection mid tool-call. This
# is NOT an output cap — the model never reported hitting one.
# Some dedicated endpoints (e.g. NVIDIA Nemotron Ultra on the
# Nous dedicated endpoint) stall for minutes during large
# tool-arg generation, then close the stream cleanly without a
# finish_reason. Stamping "length" here sends it down the
# max_tokens-boost truncation path, which retries 3× to no
# effect and finally reports the misleading "Response truncated
# due to output length limit" — the red herring this guards
# against. Route it through the partial-stream-stub path
# instead so the loop reports an honest mid-tool-call stream
# drop and fails fast rather than escalating output budget.
_tool_args_dropped_no_finish = has_truncated_tool_args and finish_reason is None
if _tool_args_dropped_no_finish:
_dropped_names = [
(tool_calls_acc[idx]["function"]["name"] or "?")
for idx in sorted(tool_calls_acc)
]
logger.warning(
"Stream ended with no finish_reason while a tool call's "
"arguments were still incomplete (tools=%s); treating as a "
"mid-tool-call stream drop, not an output-length truncation.",
_dropped_names,
)
full_reasoning = "".join(reasoning_parts) or None
mock_message = SimpleNamespace(
role=role,
content=full_content,
tool_calls=None,
reasoning_content=full_reasoning,
)
mock_choice = SimpleNamespace(
index=0,
message=mock_message,
finish_reason=FINISH_REASON_LENGTH,
)
return SimpleNamespace(
id=PARTIAL_STREAM_STUB_ID,
model=model_name,
choices=[mock_choice],
usage=usage_obj,
_dropped_tool_names=_dropped_names or None,
)
effective_finish_reason = finish_reason or "stop"
if has_truncated_tool_args:
effective_finish_reason = "length"

View file

@ -136,6 +136,101 @@ class TestPartialStreamStubFinishReason:
assert "write_file" in content
# ── Clean stream-end mid-tool-call (no exception, no finish_reason) ─────────
class TestCleanStreamEndMidToolCall:
"""The upstream closes the SSE stream cleanly after delivering a tool
name + the opening '{' of its arguments NO exception, NO finish_reason,
NO [DONE]. Observed live on NVIDIA Nemotron Ultra via the Nous dedicated
endpoint: it stalls/drops during large tool-arg generation.
The mock-builder must NOT stamp this as finish_reason='length' (which
routes it through the max_tokens-boost truncation path and finally
reports the misleading 'Response truncated due to output length limit').
It must route through the partial-stream-stub path so the loop reports
an honest mid-tool-call drop and asks the model to chunk its output.
"""
@patch("run_agent.AIAgent._create_request_openai_client")
@patch("run_agent.AIAgent._close_request_openai_client")
def test_no_finish_reason_partial_tool_args_routes_to_stub(
self, _mock_close, mock_create, monkeypatch,
):
def _clean_ending_stream():
# Reasoning + tool name + the lone opening brace, then the
# generator simply RETURNS (StopIteration) — no raise, no
# finish_reason chunk, no [DONE].
yield _make_stream_chunk(content="\n")
yield _make_stream_chunk(tool_calls=[
_make_tool_call_delta(index=0, tc_id="call_x", name="execute_code"),
])
yield _make_stream_chunk(tool_calls=[
_make_tool_call_delta(index=0, arguments="{"),
])
# falls off the end — clean close, no terminator
mock_client = MagicMock()
mock_client.chat.completions.create.side_effect = (
lambda *a, **kw: _clean_ending_stream()
)
mock_create.return_value = mock_client
agent = _make_agent()
agent._fire_stream_delta = lambda text: None
response = agent._interruptible_streaming_api_call({})
assert response.id == PARTIAL_STREAM_STUB_ID, (
"A clean stream-end mid tool-call (no finish_reason) must be "
"tagged as a partial-stream stub, not a 'stream-<uuid>' "
"truncation — otherwise the loop reports the false 'output "
"length limit' error."
)
assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
assert response.choices[0].message.tool_calls is None, (
"Incomplete tool args must never auto-execute."
)
assert getattr(response, "_dropped_tool_names", None) == ["execute_code"]
@patch("run_agent.AIAgent._create_request_openai_client")
@patch("run_agent.AIAgent._close_request_openai_client")
def test_real_length_truncation_still_uses_uuid_id(
self, _mock_close, mock_create, monkeypatch,
):
"""Control: when the provider DOES send finish_reason='length' with
partial tool args, it is a genuine output cap keep the existing
non-stub behaviour (boost max_tokens and retry)."""
def _capped_stream():
yield _make_stream_chunk(tool_calls=[
_make_tool_call_delta(index=0, tc_id="call_y", name="execute_code"),
])
yield _make_stream_chunk(tool_calls=[
_make_tool_call_delta(index=0, arguments="{"),
])
# Provider explicitly reports the output cap.
yield _make_stream_chunk(finish_reason="length")
mock_client = MagicMock()
mock_client.chat.completions.create.side_effect = (
lambda *a, **kw: _capped_stream()
)
mock_create.return_value = mock_client
agent = _make_agent()
agent._fire_stream_delta = lambda text: None
response = agent._interruptible_streaming_api_call({})
assert response.id != PARTIAL_STREAM_STUB_ID, (
"A provider-reported finish_reason='length' is a real output cap "
"and must keep the existing truncation path, not the stream-drop "
"stub path."
)
assert response.id.startswith("stream-")
assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
# ── Length-continuation prompt branching ──────────────────────────────────
class TestLengthContinuationPromptBranching:

View file

@ -5788,7 +5788,15 @@ class TestStreamingApiCall:
assert tc[0].function.name == "search"
assert tc[1].function.name == "read"
def test_truncated_tool_call_args_upgrade_finish_reason_to_length(self, agent):
def test_truncated_tool_call_args_no_finish_reason_routes_to_stub(self, agent):
# Stream delivers a tool call with incomplete JSON args and then ENDS
# with no finish_reason (the SSE just stops — no terminator, no
# [DONE]). This is an upstream mid-tool-call drop, NOT an output cap.
# The builder must route it through the partial-stream-stub path
# (id=PARTIAL_STREAM_STUB_ID, tool_calls=None so it can't execute,
# finish_reason=length so the loop's continuation machinery fires with
# chunking guidance) rather than stamping a normal 'length' truncation.
from hermes_constants import PARTIAL_STREAM_STUB_ID
chunks = [
_make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
]
@ -5796,6 +5804,24 @@ class TestStreamingApiCall:
resp = agent._interruptible_streaming_api_call({"messages": []})
assert resp.id == PARTIAL_STREAM_STUB_ID
assert resp.choices[0].finish_reason == "length"
assert resp.choices[0].message.tool_calls is None
assert getattr(resp, "_dropped_tool_names", None) == ["write_file"]
def test_truncated_tool_call_args_with_length_finish_reason_upgrades(self, agent):
# Control: when the provider explicitly reports finish_reason='length'
# alongside incomplete tool args, it IS a genuine output cap. Keep the
# existing behaviour — tool_calls preserved, finish_reason 'length' —
# so the max_tokens-boost truncation retry path still applies.
chunks = [
_make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
_make_chunk(finish_reason="length"),
]
agent.client.chat.completions.create.return_value = iter(chunks)
resp = agent._interruptible_streaming_api_call({"messages": []})
tc = resp.choices[0].message.tool_calls
assert len(tc) == 1
assert tc[0].function.name == "write_file"