fix(agent): stop reporting broken streams as output-length truncation (#36705)

A stream that drops mid-response after tokens are delivered (peer-closed
connection, stale-stream reconnect) is converted into a synthetic
finish_reason="length" stub. The conversation loop treated that network
stall as a max-output-tokens truncation: when the dropped content was a
tool call it retried exactly once, then hard-failed with "Response
truncated due to output length limit" — even on large-output models that
never hit any cap (e.g. Opus).

- Tool-call truncation now retries up to 3 times (was 1) with a
  progressive max_tokens boost, and is stub-aware: a PARTIAL_STREAM_STUB_ID
  stall prints "Stream interrupted mid tool-call — retrying (n/3)" instead
  of the false "model hit max output tokens", and the give-up message
  distinguishes a network drop from a real truncation.
- Length-continuation retries preserve the original request's output cap
  as a floor, so a high provider/model default isn't silently downshifted
  to 8K/12K on retry.
- Added _requested_output_cap_from_api_kwargs() helper.

Tests: stub-stall mid-tool-call recovery within 3 retries; continuation
preserves a large provider-default output cap.

Fixes #26425. Salvages the substance of #26427 (cap floor) and #9525
(retry bump), adapted to the post-refactor conversation_loop.py which
handles all three api_modes uniformly.

Co-authored-by: LeonSGP43 <cine.dreamer.one@gmail.com>
Co-authored-by: ygd58 <ygd58@users.noreply.github.com>
This commit is contained in:
Teknium 2026-06-01 03:01:20 -07:00 committed by GitHub
parent b571ec298d
commit 023149f665
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 143 additions and 11 deletions

View file

@ -1739,20 +1739,52 @@ def run_conversation(
if agent.api_mode in {"chat_completions", "bedrock_converse", "anthropic_messages"}:
assistant_message = _trunc_msg
if assistant_message is not None and _trunc_has_tool_calls:
if truncated_tool_call_retries < 1:
_is_stub_stall = (
getattr(response, "id", "") == PARTIAL_STREAM_STUB_ID
)
if truncated_tool_call_retries < 3:
truncated_tool_call_retries += 1
agent._buffer_vprint(
f"⚠️ Truncated tool call detected — retrying API call..."
)
if _is_stub_stall:
# The stream broke mid tool-call (network /
# peer-closed connection), not a real output
# cap — say so instead of "max output tokens".
agent._buffer_vprint(
f"⚠️ Stream interrupted mid tool-call — "
f"retrying ({truncated_tool_call_retries}/3)..."
)
else:
agent._buffer_vprint(
f"⚠️ Truncated tool call detected — "
f"retrying API call "
f"({truncated_tool_call_retries}/3)..."
)
# Boost max_tokens on each retry so the model has
# more room to complete the tool-call JSON. A
# network stall doesn't need a bigger budget, but
# a genuine output-cap truncation does, and the
# boost is harmless for the stall case.
_tc_boost_base = agent.max_tokens if agent.max_tokens else 4096
_tc_boost = _tc_boost_base * (truncated_tool_call_retries + 1)
_tc_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
if _tc_requested_cap is not None:
_tc_boost = max(_tc_boost, _tc_requested_cap)
_tc_boost_cap = max(32768, _tc_requested_cap or 0)
agent._ephemeral_max_output_tokens = min(_tc_boost, _tc_boost_cap)
# Don't append the broken response to messages;
# just re-run the same API call from the current
# message state, giving the model another chance.
continue
agent._flush_status_buffer()
agent._vprint(
f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
force=True,
)
if _is_stub_stall:
agent._vprint(
f"{agent.log_prefix}⚠️ Stream kept dropping mid tool-call after 3 retries — the action was not executed.",
force=True,
)
else:
agent._vprint(
f"{agent.log_prefix}⚠️ Truncated tool call response detected again — refusing to execute incomplete tool arguments.",
force=True,
)
agent._cleanup_task_resources(effective_task_id)
agent._persist_session(messages, conversation_history)
return {
@ -1761,7 +1793,12 @@ def run_conversation(
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": "Response truncated due to output length limit",
"error": (
"Stream repeatedly dropped mid tool-call (network); "
"the tool was not executed"
if _is_stub_stall
else "Response truncated due to output length limit"
),
}
# If we have prior messages, roll back to last complete state
@ -3412,9 +3449,16 @@ def run_conversation(
# Progressively boost the output token budget on each retry.
# Retry 1 → 2× base, retry 2 → 3× base, capped at 32 768.
# Applies to all providers via _ephemeral_max_output_tokens.
# If the original request already used a larger provider/model
# default budget, keep that floor so continuation retries do
# not accidentally downshift to a much smaller cap.
_boost_base = agent.max_tokens if agent.max_tokens else 4096
_boost = _boost_base * (length_continue_retries + 1)
agent._ephemeral_max_output_tokens = min(_boost, 32768)
_requested_cap = agent._requested_output_cap_from_api_kwargs(api_kwargs)
if _requested_cap is not None:
_boost = max(_boost, _requested_cap)
_boost_cap = max(32768, _requested_cap or 0)
agent._ephemeral_max_output_tokens = min(_boost, _boost_cap)
continue
# Guard: if all retries exhausted without a successful response

View file

@ -1206,6 +1206,21 @@ class AIAgent:
return {"max_completion_tokens": value}
return {"max_tokens": value}
@staticmethod
def _requested_output_cap_from_api_kwargs(api_kwargs: Any) -> Optional[int]:
"""Extract the outgoing response token cap from a prepared request."""
if not isinstance(api_kwargs, dict):
return None
for key in ("max_output_tokens", "max_completion_tokens", "max_tokens"):
raw = api_kwargs.get(key)
try:
value = int(raw)
except (TypeError, ValueError):
continue
if value > 0:
return value
return None
def _has_content_after_think_block(self, content: str) -> bool:
"""
Check if content has actual text after any reasoning/thinking blocks.

View file

@ -3699,6 +3699,36 @@ class TestRunConversation:
assert second_call_messages[-1]["role"] == "user"
assert "truncated by the output length limit" in second_call_messages[-1]["content"]
def test_length_continuation_preserves_large_provider_default_output_cap(self, agent):
"""Continuation retries must not shrink a higher provider default cap."""
self._setup_agent(agent)
agent.max_tokens = None
requested_caps = []
def _fake_build_api_kwargs(api_messages):
ephemeral = getattr(agent, "_ephemeral_max_output_tokens", None)
if ephemeral is not None:
agent._ephemeral_max_output_tokens = None
cap = ephemeral if ephemeral is not None else 65536
requested_caps.append(cap)
return {"model": agent.model, "messages": api_messages, "max_tokens": cap}
first = _mock_response(content="Part 1 ", finish_reason="length")
second = _mock_response(content="Part 2", finish_reason="stop")
agent.client.chat.completions.create.side_effect = [first, second]
with (
patch.object(agent, "_build_api_kwargs", side_effect=_fake_build_api_kwargs),
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("hello")
assert result["completed"] is True
assert result["final_response"] == "Part 1 Part 2"
assert requested_caps == [65536, 65536]
def test_ollama_glm_stop_after_tools_without_terminal_boundary_requests_continuation(self, agent):
"""Ollama-hosted GLM responses can misreport truncated output as stop."""
self._setup_agent(agent)
@ -3877,7 +3907,8 @@ class TestRunConversation:
def test_truncated_tool_call_retries_once_before_refusing(self, agent):
"""When tool call args are truncated, the agent retries the API call
once. If the retry succeeds (valid JSON args), tool execution proceeds."""
(up to 3 times). If a retry succeeds (valid JSON args), tool execution
proceeds."""
self._setup_agent(agent)
agent.valid_tool_names.add("write_file")
bad_tc = _mock_tool_call(
@ -3914,6 +3945,48 @@ class TestRunConversation:
mock_hfc.assert_called_once()
assert result["final_response"] == "Done!"
def test_stub_stall_mid_tool_call_recovers_within_3_retries(self, agent):
"""A network stream stall mid tool-call (PARTIAL_STREAM_STUB_ID) must
retry up to 3 times rather than hard-failing after one and recover
if a retry produces a complete tool call. Regression for the false
'model hit max output tokens' on Opus when the stream simply dropped."""
from hermes_constants import PARTIAL_STREAM_STUB_ID
self._setup_agent(agent)
agent.valid_tool_names.add("write_file")
bad_tc = _mock_tool_call(
name="write_file",
arguments='{"path":"report.md","content":"partial',
call_id="c1",
)
# Two consecutive stub-stall responses, then a clean tool call.
stall1 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
stall1.id = PARTIAL_STREAM_STUB_ID
stall2 = _mock_response(content="", finish_reason="length", tool_calls=[bad_tc])
stall2.id = PARTIAL_STREAM_STUB_ID
good_tc = _mock_tool_call(
name="write_file",
arguments='{"path":"report.md","content":"full content"}',
call_id="c2",
)
good_resp = _mock_response(content="", finish_reason="stop", tool_calls=[good_tc])
final_resp = _mock_response(content="Done!", finish_reason="stop")
with (
patch("run_agent.handle_function_call", return_value='{"success":true}') as mock_hfc,
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
agent.client.chat.completions.create.side_effect = [
stall1, stall2, good_resp, final_resp,
]
result = agent.run_conversation("write the report")
# Recovered on the 3rd attempt instead of refusing after the 1st.
mock_hfc.assert_called_once()
assert result["final_response"] == "Done!"
def test_truncated_tool_args_detected_when_finish_reason_not_length(self, agent):
"""When a router rewrites finish_reason from 'length' to 'tool_calls',
truncated JSON arguments should still be detected and refused rather