fix(streaming): emit finish_reason=length on text-only partial-stream stub

When the API connection drops mid-stream after text deltas have already been delivered, chat_completion_helpers returned a stub response with finish_reason=stop. The conversation loop then classified the stub as a clean text completion (text_response(finish_reason=stop)) and exited with iteration budget remaining — even when the goal-judge verdict came back as "continue" milliseconds later (issue #30963). Switch the text-only partial-stream stub to finish_reason=length. The existing length-continuation path (length_continue_retries up to 3, "continue exactly where you left off" prompt, partial parts merged into final_response) then fires automatically: the partial assistant content is persisted, the model is asked to continue from the cut point, and the loop keeps making progress against the goal. The mid-tool-call branch keeps finish_reason=stop on purpose — its user-facing warning ("Ask me to retry if you want to continue") asks the user to drive the retry rather than auto-replaying a tool call with possible side effects. #5544's "no duplicate message" contract is preserved verbatim: the partial content is reused, never re-emitted as a fresh API call, so the user never sees two copies of the same delta. Refs: NousResearch/hermes-agent#30963
2026-07-13 14:02:16 +00:00 · 2026-05-23 22:05:14 +07:00 · 2026-05-23 22:05:14 +07:00 · 9140be7c22
commit 9140be7c22
parent 60d20a37c9
1 changed files with 22 additions and 6 deletions
--- a/agent/chat_completion_helpers.py
+++ b/agent/chat_completion_helpers.py
@ -2077,8 +2077,21 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
            # Streaming failed AFTER some tokens were already delivered to
            # the platform.  Re-raising would let the outer retry loop make
            # a new API call, creating a duplicate message.  Return a
-            # partial "stop" response instead so the outer loop treats this
-            # turn as complete (no retry, no fallback).
+            # partial response stub instead and let the outer loop decide:
+            #
+            #   - text-only partials → finish_reason="length" so the
+            #     conversation loop persists the partial assistant content
+            #     and asks the model to continue from where the stream
+            #     died (issue #30963: partial stop misclassified as a
+            #     clean completion was exiting the loop with budget
+            #     remaining and an unfinished goal).
+            #
+            #   - partial mid-tool-call → finish_reason="stop" stays.
+            #     The user-visible warning we append says "Ask me to
+            #     retry if you want to continue", so the agent should
+            #     hand control back rather than auto-retry a tool call
+            #     that may have side-effects.
+            #
            # Recover whatever content was already streamed to the user.
            # _current_streamed_assistant_text accumulates text fired
            # through _fire_stream_delta, so it has exactly what the
@ -2116,14 +2129,17 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                    "of text; surfaced warning to user: %s",
                    _partial_names, len(_partial_text or ""), result["error"],
                )
+                _stub_finish_reason = "stop"
            else:
                logger.warning(
-                    "Partial stream delivered before error; returning stub "
-                    "response with %s chars of recovered content to prevent "
-                    "duplicate messages: %s",
+                    "Partial stream delivered before error; returning "
+                    "length-truncated stub with %s chars of recovered "
+                    "content so the loop can continue from where the "
+                    "stream died: %s",
                    len(_partial_text or ""),
                    result["error"],
                )
+                _stub_finish_reason = "length"
            _stub_msg = SimpleNamespace(
                role="assistant", content=_partial_text, tool_calls=None,
                reasoning_content=None,
@ -2132,7 +2148,7 @@ def interruptible_streaming_api_call(agent, api_kwargs: dict, *, on_first_delta=
                id="partial-stream-stub",
                model=getattr(agent, "model", "unknown"),
                choices=[SimpleNamespace(
-                    index=0, message=_stub_msg, finish_reason="stop",
+                    index=0, message=_stub_msg, finish_reason=_stub_finish_reason,
                )],
                usage=None,
            )