From 9140be7c228dcff02dc12d52a499eb06d6721496 Mon Sep 17 00:00:00 2001
From: xxxigm <tuancanhnguyen706@gmail.com>
Date: Sat, 23 May 2026 22:05:14 +0700
Subject: [PATCH] fix(streaming): emit finish_reason=length on text-only
 partial-stream stub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the API connection dropped mid-stream after text deltas had already
been delivered, interruptible_streaming_api_call() in
chat_completion_helpers returned a stub response with
finish_reason=stop. The conversation loop then classified the stub as a
clean text completion (text_response(finish_reason=stop)) and exited
with iteration budget remaining — even when the goal-judge verdict
came back as "continue" milliseconds later (issue #30963).

Switch the text-only partial-stream stub to finish_reason=length. The
existing length-continuation path (length_continue_retries up to 3,
"continue exactly where you left off" prompt, partial parts merged
into final_response) then fires automatically: the partial assistant
content is persisted, the model is asked to continue from the cut
point, and the loop keeps making progress against the goal.

The mid-tool-call branch keeps finish_reason=stop on purpose — its
user-facing warning ("Ask me to retry if you want to continue") asks
the user to drive the retry rather than auto-replaying a tool call
with possible side effects.

The "no duplicate message" contract from #5544 still holds: the text
already streamed to the user is kept as the partial assistant content
and the failed request is not replayed from scratch, so the user never
sees two copies of the same delta.

Refs: NousResearch/hermes-agent#30963
---
 agent/chat_completion_helpers.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/agent/chat_completion_helpers.py b/agent/chat_completion_helpers.py
index 602b923a894..b3261b60d0b 100644
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@@ -2077,8 +2077,21 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
             # Streaming failed AFTER some tokens were already delivered to
             # the platform.  Re-raising would let the outer retry loop make
             # a new API call, creating a duplicate message.  Return a
-            # partial "stop" response instead so the outer loop treats this
-            # turn as complete (no retry, no fallback).
+            # partial response stub instead and let the outer loop decide:
+            #
+            #   - text-only partials → finish_reason="length" so the
+            #     conversation loop persists the partial assistant content
+            #     and asks the model to continue from where the stream
+            #     died (issue #30963: partial stop misclassified as a
+            #     clean completion was exiting the loop with budget
+            #     remaining and an unfinished goal).
+            #
+            #   - partial mid-tool-call → finish_reason="stop" stays.
+            #     The user-visible warning we append says "Ask me to
+            #     retry if you want to continue", so the agent should
+            #     hand control back rather than auto-retry a tool call
+            #     that may have side-effects.
+            #
             # Recover whatever content was already streamed to the user.
             # _current_streamed_assistant_text accumulates text fired
             # through _fire_stream_delta, so it has exactly what the
@@ -2116,14 +2129,17 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                     "of text; surfaced warning to user: %s",
                     _partial_names, len(_partial_text or ""), result["error"],
                 )
+                _stub_finish_reason = "stop"
             else:
                 logger.warning(
-                    "Partial stream delivered before error; returning stub "
-                    "response with %s chars of recovered content to prevent "
-                    "duplicate messages: %s",
+                    "Partial stream delivered before error; returning "
+                    "length-truncated stub with %s chars of recovered "
+                    "content so the loop can continue from where the "
+                    "stream died: %s",
                     len(_partial_text or ""),
                     result["error"],
                 )
+                _stub_finish_reason = "length"
             _stub_msg = SimpleNamespace(
                 role="assistant", content=_partial_text, tool_calls=None,
                 reasoning_content=None,
@@ -2132,7 +2148,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                 id="partial-stream-stub",
                 model=getattr(agent, "model", "unknown"),
                 choices=[SimpleNamespace(
-                    index=0, message=_stub_msg, finish_reason="stop",
+                    index=0, message=_stub_msg, finish_reason=_stub_finish_reason,
                 )],
                 usage=None,
             )