From 20b3703a42f238c6a3a717f6f9ec99301feca958 Mon Sep 17 00:00:00 2001 From: xxxigm Date: Sat, 23 May 2026 22:05:55 +0700 Subject: [PATCH] fix(conversation-loop): tailor length-continuation prompt for partial stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The length-continue path's user-facing vprint and its continuation prompt both described every finish_reason='length' response as truncated by the output length limit (the prompt told the model "your previous response was truncated by the output length limit"). That is false when the response is a partial-stream-stub that came from a mid-stream network error (issue #30963) — and it is a falsehood the model can detect, leading to "I wasn't truncated, I'm done" no-op responses that defeat the continuation entirely. Detect the partial-stream-stub via response.id and swap in: - vprint: "Stream interrupted by network error (finish_reason='length' on partial-stream-stub)" - prompt: "[System: The previous response was cut off by a network error mid-stream. Continue exactly where you left off. Do not restart or repeat prior text. Finish the answer directly.]" Real length truncations still see the original "truncated by output length limit" prompt — the model needs to know which class of failure it's recovering from. Both branches share the same budget of 3 continuation retries (length_continue_retries), the same truncated_response_parts merging, and the same final-response stitching infrastructure. 
Refs: NousResearch/hermes-agent#30963 --- agent/conversation_loop.py | 50 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py index f5fc0d12a31..212f7cc09ac 100644 --- a/agent/conversation_loop.py +++ b/agent/conversation_loop.py @@ -1414,7 +1414,18 @@ def run_conversation( finish_reason = "length" if finish_reason == "length": - agent._vprint(f"{agent.log_prefix}⚠️ Response truncated (finish_reason='length') - model hit max output tokens", force=True) + if getattr(response, "id", "") == "partial-stream-stub": + agent._vprint( + f"{agent.log_prefix}⚠️ Stream interrupted by network error " + f"(finish_reason='length' on partial-stream-stub)", + force=True, + ) + else: + agent._vprint( + f"{agent.log_prefix}⚠️ Response truncated " + f"(finish_reason='length') - model hit max output tokens", + force=True, + ) # Normalize the truncated response to a single OpenAI-style # message shape so text-continuation and tool-call retry @@ -1507,17 +1518,40 @@ def run_conversation( truncated_response_parts.append(assistant_message.content) if length_continue_retries < 3: - agent._vprint( - f"{agent.log_prefix}↻ Requesting continuation " - f"({length_continue_retries}/3)..." + # Distinguish a real output-token truncation + # from a partial-stream-stub network error + # (#30963). Same continuation machinery, + # but the prompt has to tell the truth or + # the model goes off rails ("I wasn't + # truncated, I'm done"). + _is_partial_stream_stub = ( + getattr(response, "id", "") == "partial-stream-stub" ) - continue_msg = { - "role": "user", - "content": ( + if _is_partial_stream_stub: + agent._vprint( + f"{agent.log_prefix}↻ Stream interrupted — " + f"requesting continuation " + f"({length_continue_retries}/3)..." + ) + _continue_content = ( + "[System: The previous response was cut off by a " + "network error mid-stream. Continue exactly where " + "you left off. 
Do not restart or repeat prior text. " + "Finish the answer directly.]" + ) + else: + agent._vprint( + f"{agent.log_prefix}↻ Requesting continuation " + f"({length_continue_retries}/3)..." + ) + _continue_content = ( "[System: Your previous response was truncated by the output " "length limit. Continue exactly where you left off. Do not " "restart or repeat prior text. Finish the answer directly.]" - ), + ) + continue_msg = { + "role": "user", + "content": _continue_content, } messages.append(continue_msg) agent._session_messages = messages