mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
fix(stream): don't report dropped mid-tool-call streams as output truncation (#42314)
* fix(stream): don't report dropped mid-tool-call streams as output truncation
A streaming tool call whose SSE ends with no finish_reason (the upstream
delivers the tool name + opening '{' then closes the connection cleanly,
no terminator, no [DONE]) was stamped finish_reason='length' by the mock
builder. That routed it through the output-cap truncation path: 3 useless
max_tokens-boosted retries, then the misleading 'Response truncated due to
output length limit' error — even though the model never reported hitting
any cap.
Reproduced live on nvidia/nemotron-3-ultra:free via the Nous dedicated
endpoint, which stalls/drops during large tool-arg generation (50s-4m41s).
Now: when tool args are incomplete AND the provider sent no finish_reason,
tag the response as a partial-stream stub so the loop reports an honest
mid-tool-call drop and asks the model to chunk its output (existing
continuation machinery), instead of escalating output budget and lying.
A provider-reported finish_reason='length' still takes the real-truncation
path unchanged.
* test(stream): update truncated-tool-args test for drop-vs-cap split
test_truncated_tool_call_args_upgrade_finish_reason_to_length pinned the
old behaviour where ANY incomplete tool args → finish_reason='length' with
tool_calls preserved. That single-chunk-no-finish_reason scenario is exactly
the mid-tool-call stream drop now reclassified as a partial-stream stub.
Split into two tests matching the new contract:
- no finish_reason + incomplete args → PARTIAL_STREAM_STUB_ID, tool_calls=None,
_dropped_tool_names set (the drop path)
- explicit finish_reason='length' + incomplete args → tool_calls preserved,
'length' upgrade unchanged (the genuine output-cap path)
This commit is contained in:
parent
89d380261d
commit
c9094f5e5f
3 changed files with 174 additions and 1 deletions
|
|
@ -1986,6 +1986,58 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
|
|||
"(possible upstream error or malformed SSE response)."
|
||||
)
|
||||
|
||||
# A stream that delivered a tool call but only partial/unparseable
|
||||
# JSON args splits into two very different cases:
|
||||
#
|
||||
# 1. Provider sent finish_reason="length" → a genuine output-cap
|
||||
# truncation. Boosting max_tokens on retry is the right move.
|
||||
#
|
||||
# 2. Provider sent NO finish_reason (the SSE simply stopped after
|
||||
# the opening "{" with no terminator and no [DONE]) → the
|
||||
# upstream dropped/stalled the connection mid tool-call. This
|
||||
# is NOT an output cap — the model never reported hitting one.
|
||||
# Some dedicated endpoints (e.g. NVIDIA Nemotron Ultra on the
|
||||
# Nous dedicated endpoint) stall for minutes during large
|
||||
# tool-arg generation, then close the stream cleanly without a
|
||||
# finish_reason. Stamping "length" here sends it down the
|
||||
# max_tokens-boost truncation path, which retries 3× to no
|
||||
# effect and finally reports the misleading "Response truncated
|
||||
# due to output length limit" — the red herring this guards
|
||||
# against. Route it through the partial-stream-stub path
|
||||
# instead so the loop reports an honest mid-tool-call stream
|
||||
# drop and fails fast rather than escalating output budget.
|
||||
_tool_args_dropped_no_finish = has_truncated_tool_args and finish_reason is None
|
||||
if _tool_args_dropped_no_finish:
|
||||
_dropped_names = [
|
||||
(tool_calls_acc[idx]["function"]["name"] or "?")
|
||||
for idx in sorted(tool_calls_acc)
|
||||
]
|
||||
logger.warning(
|
||||
"Stream ended with no finish_reason while a tool call's "
|
||||
"arguments were still incomplete (tools=%s); treating as a "
|
||||
"mid-tool-call stream drop, not an output-length truncation.",
|
||||
_dropped_names,
|
||||
)
|
||||
full_reasoning = "".join(reasoning_parts) or None
|
||||
mock_message = SimpleNamespace(
|
||||
role=role,
|
||||
content=full_content,
|
||||
tool_calls=None,
|
||||
reasoning_content=full_reasoning,
|
||||
)
|
||||
mock_choice = SimpleNamespace(
|
||||
index=0,
|
||||
message=mock_message,
|
||||
finish_reason=FINISH_REASON_LENGTH,
|
||||
)
|
||||
return SimpleNamespace(
|
||||
id=PARTIAL_STREAM_STUB_ID,
|
||||
model=model_name,
|
||||
choices=[mock_choice],
|
||||
usage=usage_obj,
|
||||
_dropped_tool_names=_dropped_names or None,
|
||||
)
|
||||
|
||||
effective_finish_reason = finish_reason or "stop"
|
||||
if has_truncated_tool_args:
|
||||
effective_finish_reason = "length"
|
||||
|
|
|
|||
|
|
@ -136,6 +136,101 @@ class TestPartialStreamStubFinishReason:
|
|||
assert "write_file" in content
|
||||
|
||||
|
||||
# ── Clean stream-end mid-tool-call (no exception, no finish_reason) ─────────
|
||||
|
||||
class TestCleanStreamEndMidToolCall:
|
||||
"""The upstream closes the SSE stream cleanly after delivering a tool
|
||||
name + the opening '{' of its arguments — NO exception, NO finish_reason,
|
||||
NO [DONE]. Observed live on NVIDIA Nemotron Ultra via the Nous dedicated
|
||||
endpoint: it stalls/drops during large tool-arg generation.
|
||||
|
||||
The mock-builder must NOT stamp this as finish_reason='length' (which
|
||||
routes it through the max_tokens-boost truncation path and finally
|
||||
reports the misleading 'Response truncated due to output length limit').
|
||||
It must route through the partial-stream-stub path so the loop reports
|
||||
an honest mid-tool-call drop and asks the model to chunk its output.
|
||||
"""
|
||||
|
||||
@patch("run_agent.AIAgent._create_request_openai_client")
|
||||
@patch("run_agent.AIAgent._close_request_openai_client")
|
||||
def test_no_finish_reason_partial_tool_args_routes_to_stub(
|
||||
self, _mock_close, mock_create, monkeypatch,
|
||||
):
|
||||
def _clean_ending_stream():
|
||||
# Reasoning + tool name + the lone opening brace, then the
|
||||
# generator simply RETURNS (StopIteration) — no raise, no
|
||||
# finish_reason chunk, no [DONE].
|
||||
yield _make_stream_chunk(content="\n")
|
||||
yield _make_stream_chunk(tool_calls=[
|
||||
_make_tool_call_delta(index=0, tc_id="call_x", name="execute_code"),
|
||||
])
|
||||
yield _make_stream_chunk(tool_calls=[
|
||||
_make_tool_call_delta(index=0, arguments="{"),
|
||||
])
|
||||
# falls off the end — clean close, no terminator
|
||||
|
||||
mock_client = MagicMock()
|
||||
mock_client.chat.completions.create.side_effect = (
|
||||
lambda *a, **kw: _clean_ending_stream()
|
||||
)
|
||||
mock_create.return_value = mock_client
|
||||
|
||||
agent = _make_agent()
|
||||
agent._fire_stream_delta = lambda text: None
|
||||
|
||||
response = agent._interruptible_streaming_api_call({})
|
||||
|
||||
assert response.id == PARTIAL_STREAM_STUB_ID, (
|
||||
"A clean stream-end mid tool-call (no finish_reason) must be "
|
||||
"tagged as a partial-stream stub, not a 'stream-<uuid>' "
|
||||
"truncation — otherwise the loop reports the false 'output "
|
||||
"length limit' error."
|
||||
)
|
||||
assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
|
||||
assert response.choices[0].message.tool_calls is None, (
|
||||
"Incomplete tool args must never auto-execute."
|
||||
)
|
||||
assert getattr(response, "_dropped_tool_names", None) == ["execute_code"]
|
||||
|
||||
@patch("run_agent.AIAgent._create_request_openai_client")
|
||||
@patch("run_agent.AIAgent._close_request_openai_client")
|
||||
def test_real_length_truncation_still_uses_uuid_id(
|
||||
self, _mock_close, mock_create, monkeypatch,
|
||||
):
|
||||
"""Control: when the provider DOES send finish_reason='length' with
|
||||
partial tool args, it is a genuine output cap — keep the existing
|
||||
non-stub behaviour (boost max_tokens and retry)."""
|
||||
|
||||
def _capped_stream():
|
||||
yield _make_stream_chunk(tool_calls=[
|
||||
_make_tool_call_delta(index=0, tc_id="call_y", name="execute_code"),
|
||||
])
|
||||
yield _make_stream_chunk(tool_calls=[
|
||||
_make_tool_call_delta(index=0, arguments="{"),
|
||||
])
|
||||
# Provider explicitly reports the output cap.
|
||||
yield _make_stream_chunk(finish_reason="length")
|
||||
|
||||
mock_client = MagicMock()
|
||||
mock_client.chat.completions.create.side_effect = (
|
||||
lambda *a, **kw: _capped_stream()
|
||||
)
|
||||
mock_create.return_value = mock_client
|
||||
|
||||
agent = _make_agent()
|
||||
agent._fire_stream_delta = lambda text: None
|
||||
|
||||
response = agent._interruptible_streaming_api_call({})
|
||||
|
||||
assert response.id != PARTIAL_STREAM_STUB_ID, (
|
||||
"A provider-reported finish_reason='length' is a real output cap "
|
||||
"and must keep the existing truncation path, not the stream-drop "
|
||||
"stub path."
|
||||
)
|
||||
assert response.id.startswith("stream-")
|
||||
assert response.choices[0].finish_reason == FINISH_REASON_LENGTH
|
||||
|
||||
|
||||
# ── Length-continuation prompt branching ──────────────────────────────────
|
||||
|
||||
class TestLengthContinuationPromptBranching:
|
||||
|
|
|
|||
|
|
@ -5788,7 +5788,15 @@ class TestStreamingApiCall:
|
|||
assert tc[0].function.name == "search"
|
||||
assert tc[1].function.name == "read"
|
||||
|
||||
def test_truncated_tool_call_args_upgrade_finish_reason_to_length(self, agent):
|
||||
def test_truncated_tool_call_args_no_finish_reason_routes_to_stub(self, agent):
|
||||
# Stream delivers a tool call with incomplete JSON args and then ENDS
|
||||
# with no finish_reason (the SSE just stops — no terminator, no
|
||||
# [DONE]). This is an upstream mid-tool-call drop, NOT an output cap.
|
||||
# The builder must route it through the partial-stream-stub path
|
||||
# (id=PARTIAL_STREAM_STUB_ID, tool_calls=None so it can't execute,
|
||||
# finish_reason=length so the loop's continuation machinery fires with
|
||||
# chunking guidance) rather than stamping a normal 'length' truncation.
|
||||
from hermes_constants import PARTIAL_STREAM_STUB_ID
|
||||
chunks = [
|
||||
_make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
|
||||
]
|
||||
|
|
@ -5796,6 +5804,24 @@ class TestStreamingApiCall:
|
|||
|
||||
resp = agent._interruptible_streaming_api_call({"messages": []})
|
||||
|
||||
assert resp.id == PARTIAL_STREAM_STUB_ID
|
||||
assert resp.choices[0].finish_reason == "length"
|
||||
assert resp.choices[0].message.tool_calls is None
|
||||
assert getattr(resp, "_dropped_tool_names", None) == ["write_file"]
|
||||
|
||||
def test_truncated_tool_call_args_with_length_finish_reason_upgrades(self, agent):
|
||||
# Control: when the provider explicitly reports finish_reason='length'
|
||||
# alongside incomplete tool args, it IS a genuine output cap. Keep the
|
||||
# existing behaviour — tool_calls preserved, finish_reason 'length' —
|
||||
# so the max_tokens-boost truncation retry path still applies.
|
||||
chunks = [
|
||||
_make_chunk(tool_calls=[_make_tc_delta(0, "call_1", "write_file", '{"path":"x.txt","content":"hel')]),
|
||||
_make_chunk(finish_reason="length"),
|
||||
]
|
||||
agent.client.chat.completions.create.return_value = iter(chunks)
|
||||
|
||||
resp = agent._interruptible_streaming_api_call({"messages": []})
|
||||
|
||||
tc = resp.choices[0].message.tool_calls
|
||||
assert len(tc) == 1
|
||||
assert tc[0].function.name == "write_file"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue