From 20b3703a42f238c6a3a717f6f9ec99301feca958 Mon Sep 17 00:00:00 2001
From: xxxigm <tuancanhnguyen706@gmail.com>
Date: Sat, 23 May 2026 22:05:55 +0700
Subject: [PATCH] fix(conversation-loop): tailor length-continuation prompt for
 partial stream
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The length-continue path's user-facing vprint and continuation prompt
both told the model "your response was truncated by the output length
limit." That's a lie when the stub came from a partial-stream network
error (issue #30963) — and a lie the model can detect, leading to "I
wasn't truncated, I'm done" no-op responses that defeat the
continuation entirely.

Detect the partial-stream-stub via response.id and swap in:

- vprint:   "Stream interrupted by network error
             (finish_reason='length' on partial-stream-stub)"
- prompt:   "[System: The previous response was cut off by a network
             error mid-stream. Continue exactly where you left off.
             Do not restart or repeat prior text. Finish the answer
             directly.]"

Real length truncations still see the original "truncated by output
length limit" prompt — the model needs to know which class of failure
it's recovering from. Same length_continue_retries=3 budget,
truncated_response_parts merging, and final-response stitching
infrastructure on both branches.

Refs: NousResearch/hermes-agent#30963
---
 agent/conversation_loop.py | 50 ++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/agent/conversation_loop.py b/agent/conversation_loop.py
index f5fc0d12a31..212f7cc09ac 100644
--- a/agent/conversation_loop.py
+++ b/agent/conversation_loop.py
@@ -1414,7 +1414,18 @@ def run_conversation(
                         finish_reason = "length"
 
                 if finish_reason == "length":
-                    agent._vprint(f"{agent.log_prefix}⚠️  Response truncated (finish_reason='length') - model hit max output tokens", force=True)
+                    if getattr(response, "id", "") == "partial-stream-stub":
+                        agent._vprint(
+                            f"{agent.log_prefix}⚠️  Stream interrupted by network error "
+                            f"(finish_reason='length' on partial-stream-stub)",
+                            force=True,
+                        )
+                    else:
+                        agent._vprint(
+                            f"{agent.log_prefix}⚠️  Response truncated "
+                            f"(finish_reason='length') - model hit max output tokens",
+                            force=True,
+                        )
 
                     # Normalize the truncated response to a single OpenAI-style
                     # message shape so text-continuation and tool-call retry
@@ -1507,17 +1518,40 @@ def run_conversation(
                                 truncated_response_parts.append(assistant_message.content)
 
                             if length_continue_retries < 3:
-                                agent._vprint(
-                                    f"{agent.log_prefix}↻ Requesting continuation "
-                                    f"({length_continue_retries}/3)..."
+                                # Distinguish a real output-token truncation
+                                # from a partial-stream-stub network error
+                                # (#30963).  Same continuation machinery,
+                                # but the prompt has to tell the truth or
+                                # the model goes off rails ("I wasn't
+                                # truncated, I'm done").
+                                _is_partial_stream_stub = (
+                                    getattr(response, "id", "") == "partial-stream-stub"
                                 )
-                                continue_msg = {
-                                    "role": "user",
-                                    "content": (
+                                if _is_partial_stream_stub:
+                                    agent._vprint(
+                                        f"{agent.log_prefix}↻ Stream interrupted — "
+                                        f"requesting continuation "
+                                        f"({length_continue_retries}/3)..."
+                                    )
+                                    _continue_content = (
+                                        "[System: The previous response was cut off by a "
+                                        "network error mid-stream. Continue exactly where "
+                                        "you left off. Do not restart or repeat prior text. "
+                                        "Finish the answer directly.]"
+                                    )
+                                else:
+                                    agent._vprint(
+                                        f"{agent.log_prefix}↻ Requesting continuation "
+                                        f"({length_continue_retries}/3)..."
+                                    )
+                                    _continue_content = (
                                         "[System: Your previous response was truncated by the output "
                                         "length limit. Continue exactly where you left off. Do not "
                                         "restart or repeat prior text. Finish the answer directly.]"
-                                    ),
+                                    )
+                                continue_msg = {
+                                    "role": "user",
+                                    "content": _continue_content,
                                 }
                                 messages.append(continue_msg)
                                 agent._session_messages = messages