hermes-agent/tests/run_agent/test_partial_stream_finish_reason.py
daimon-nous[bot] ac5359a3f3
fix(streaming): route mid-tool-call partial-stream-stub through length continuation (#31998) (#32012)
* fix(streaming): route mid-tool-call partial-stream-stub through length continuation (#31998)

When a stream stalls mid-tool-call (e.g. a large write_file), the
partial-stream-stub recovery used finish_reason='stop' which caused the
conversation loop to treat the turn as complete, returning only the
warning text. When users said 'continue', the model retried the same
large tool call, hit the same stale timeout, and looped indefinitely.

Changes:
- chat_completion_helpers.py: change _stub_finish_reason from 'stop' to
  'length' for mid-tool-call partials. The stub still has tool_calls=None
  so no tool auto-executes — the model gets a fresh API call through the
  existing length-continuation machinery (bounded to 3 retries).
  Also attach _dropped_tool_names to the stub for downstream use.
- conversation_loop.py: add a third continuation prompt branch for
  partial-stream-stubs with dropped tool calls. Instead of the generic
  'continue where you left off' (which would retry the same large call),
  tell the model to break the output into smaller tool calls (~8K
  tokens each) to avoid stream timeouts.
- test_partial_stream_finish_reason.py: update existing test from
  finish_reason='stop' to 'length', add _dropped_tool_names assertion,
  add new test_dropped_tool_call_uses_chunking_prompt for the 3-way
  prompt branching.

Safety: tool_calls=None is preserved on the stub, so the conversation
loop enters the text-continuation branch (line 1513), NOT the tool-call
execution branch (line 3246). No tool auto-executes. The model simply
gets another API call with targeted guidance.

* refactor: extract constants and continuation prompt helper

- Move magic strings to hermes_constants.py (PARTIAL_STREAM_STUB_ID,
  FINISH_REASON_LENGTH)
- Extract _get_continuation_prompt() in conversation_loop.py — DRYs the
  3-way prompt branching and lets tests import the real function
- Trim verbose inline comments in chat_completion_helpers.py
- Tests import constants + helper instead of duplicating logic

---------

Co-authored-by: alt-glitch <balyan.sid@gmail.com>
2026-05-25 17:43:10 +05:30

269 lines
12 KiB
Python

"""Regression tests for issue #30963 — partial-stream stub finish_reason.
Pins the contract:
- text-only partial stream → stub.finish_reason == "length" so the
conversation loop's existing length-continuation path can keep the
agent moving against an unfinished goal.
- partial mid-tool-call → stub.finish_reason == "length" so the loop
triggers continuation machinery with targeted chunking guidance
instead of ending the turn immediately.
- conversation_loop's length-continuation prompt distinguishes a real
output-length truncation from a partial-stream-stub network error
via response.id.
"""
from __future__ import annotations
from types import SimpleNamespace
from unittest.mock import MagicMock, patch
import pytest
from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
from agent.conversation_loop import _get_continuation_prompt
# ── Helpers (mirrors test_streaming.py) ────────────────────────────────────
def _make_stream_chunk(content=None, tool_calls=None, finish_reason=None):
    """Build a minimal stand-in for one streamed chat-completion chunk.

    Args:
        content: Text stored on the delta (``None`` for no text).
        tool_calls: Tool-call deltas stored on the delta, passed through
            unchanged (``None`` for none).
        finish_reason: Value stored on the single choice.

    Returns:
        A ``SimpleNamespace`` with exactly one entry in ``choices``
        (``index`` 0). The delta's ``reasoning_content`` and ``reasoning``
        and the chunk's ``model`` and ``usage`` are always ``None``.
    """
    delta = SimpleNamespace(
        content=content,
        tool_calls=tool_calls,
        reasoning_content=None,
        reasoning=None,
    )
    choice = SimpleNamespace(index=0, delta=delta, finish_reason=finish_reason)
    return SimpleNamespace(choices=[choice], model=None, usage=None)
def _make_tool_call_delta(index=0, tc_id=None, name=None, arguments=None):
    """Build a minimal stand-in for one streamed tool-call delta.

    Args:
        index: Position of the tool call, stored as ``index``.
        tc_id: Tool-call identifier, stored as ``id``.
        name: Function name, stored as ``function.name``.
        arguments: Argument text fragment, stored as ``function.arguments``.

    Returns:
        A ``SimpleNamespace`` with ``index``, ``id`` and a nested
        ``function`` namespace holding ``name`` and ``arguments``.
    """
    function = SimpleNamespace(name=name, arguments=arguments)
    return SimpleNamespace(index=index, id=tc_id, function=function)
def _make_agent():
    """Create an ``AIAgent`` configured for the streaming-stub tests.

    The agent is built with placeholder credentials, quiet mode on, and
    context files and memory skipped. It is then forced into
    ``chat_completions`` API mode with no interrupt pending.

    ``run_agent`` is imported inside the function rather than at module
    import time.
    """
    from run_agent import AIAgent

    agent = AIAgent(
        api_key="test-key",
        base_url="https://example.com/v1",
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )
    agent.api_mode = "chat_completions"
    agent._interrupt_requested = False
    return agent
# ── Stub finish_reason ────────────────────────────────────────────────────
class TestPartialStreamStubFinishReason:
    """Contract for the stub returned by ``_interruptible_streaming_api_call``
    when the upstream connection dies mid-flight."""

    @staticmethod
    def _client_streaming(stream_factory):
        """Return a mock client whose ``chat.completions.create`` opens a
        fresh stream from *stream_factory* on every call."""
        client = MagicMock()
        client.chat.completions.create.side_effect = (
            lambda *args, **kwargs: stream_factory()
        )
        return client

    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
    def test_text_only_partial_returns_length(self, _mock_close, mock_create, monkeypatch):
        """#30963: text-only partials must classify as length so the loop
        keeps continuing instead of exiting with budget remaining."""
        partial_text = "Here's my answer so far"

        def _stalling_stream():
            # One text chunk arrives, then the upstream connection "dies".
            yield _make_stream_chunk(content=partial_text)
            raise RuntimeError("simulated upstream stall")

        mock_create.return_value = self._client_streaming(_stalling_stream)

        agent = _make_agent()
        agent._current_streamed_assistant_text = partial_text
        # NOTE(review): presumably disables stream retries so the first
        # stall yields the stub — confirm against the streaming helper.
        monkeypatch.setenv("HERMES_STREAM_RETRIES", "0")

        response = agent._interruptible_streaming_api_call({})
        choice = response.choices[0]

        assert response.id == PARTIAL_STREAM_STUB_ID
        assert choice.finish_reason == FINISH_REASON_LENGTH, (
            "Text-only partial streams must use finish_reason=length so the "
            "conversation loop continues from where the network died "
            "(issue #30963)."
        )
        assert choice.message.content == partial_text
        assert choice.message.tool_calls is None

    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
    def test_partial_tool_call_uses_length(self, _mock_close, mock_create, monkeypatch):
        """Mid-tool-call partials now use finish_reason=length so the
        conversation loop's continuation machinery fires — bounded 3-retry
        with guidance to break output into smaller chunks (#31998).
        tool_calls=None is preserved, so no tool auto-executes."""
        preamble = "Let me write the audit: "

        def _stalling_stream():
            # Text first, then a tool call whose arguments never finish
            # streaming before the connection "dies".
            yield _make_stream_chunk(content=preamble)
            yield _make_stream_chunk(tool_calls=[
                _make_tool_call_delta(index=0, tc_id="call_1", name="write_file"),
            ])
            yield _make_stream_chunk(tool_calls=[
                _make_tool_call_delta(index=0, arguments='{"path": "/tmp/x", '),
            ])
            raise RuntimeError("simulated upstream stall")

        mock_create.return_value = self._client_streaming(_stalling_stream)

        agent = _make_agent()
        # Replace the delta callback with a no-op for this test.
        agent._fire_stream_delta = lambda text: None
        agent._current_streamed_assistant_text = preamble
        monkeypatch.setenv("HERMES_STREAM_RETRIES", "0")

        response = agent._interruptible_streaming_api_call({})
        choice = response.choices[0]

        assert response.id == PARTIAL_STREAM_STUB_ID
        assert choice.finish_reason == FINISH_REASON_LENGTH, (
            "Partial mid-tool-call must use finish_reason=length so the "
            "continuation machinery fires instead of ending the turn "
            "immediately (#31998)."
        )
        assert choice.message.tool_calls is None, (
            "tool_calls must remain None (no auto-execution of side-effectful "
            "tool calls)."
        )
        # The stub should carry dropped tool names for continuation prompt
        assert getattr(response, "_dropped_tool_names", None) == ["write_file"]

        content = choice.message.content or ""
        assert "Stream stalled mid tool-call" in content
        assert "write_file" in content
# ── Length-continuation prompt branching ──────────────────────────────────
class TestLengthContinuationPromptBranching:
    """When finish_reason=length, the continuation prompt that reaches the
    model has to tell the truth: real truncation vs. network interruption
    vs. dropped tool call (#31998). Three distinct prompts now exist."""

    def _simulate_branch(self, response_id: str, dropped_tools=None) -> str:
        """Return the continuation prompt text the loop would inject for
        a `finish_reason=length` response with the given id.

        A response counts as a partial-stream stub only when its id equals
        ``PARTIAL_STREAM_STUB_ID``; ``dropped_tools`` is forwarded unchanged
        to ``_get_continuation_prompt``.
        """
        is_partial = response_id == PARTIAL_STREAM_STUB_ID
        return _get_continuation_prompt(is_partial, dropped_tools)

    def test_partial_stream_stub_uses_network_prompt(self):
        """A stub id with no dropped tools yields the network-error prompt."""
        prompt = self._simulate_branch(PARTIAL_STREAM_STUB_ID)
        assert "network error mid-stream" in prompt
        assert "output length limit" not in prompt

    def test_real_truncation_uses_length_prompt(self):
        """An ordinary response id yields the output-length prompt."""
        prompt = self._simulate_branch("chatcmpl-abc123")
        assert "output length limit" in prompt
        assert "network error" not in prompt

    def test_no_id_falls_through_to_length_prompt(self):
        """An empty id is not the stub id, so it gets the same prompt as a
        real truncation."""
        prompt = self._simulate_branch("")
        assert "output length limit" in prompt
        # Same helper arguments as the real-truncation case, so the same
        # negative check applies.
        assert "network error" not in prompt

    def test_dropped_tool_call_uses_chunking_prompt(self):
        """When the stub dropped a tool call, the continuation prompt
        must guide the model to break its output into smaller chunks
        instead of retrying the same large tool call (#31998)."""
        prompt = self._simulate_branch(
            PARTIAL_STREAM_STUB_ID, dropped_tools=["write_file"],
        )
        assert "too large" in prompt
        assert "break" in prompt.lower()
        assert "write_file" in prompt
        assert "network error" not in prompt
        assert "output length limit" not in prompt
# ── Integration: live conversation loop ───────────────────────────────────
@pytest.fixture()
def loop_agent():
    """AIAgent with a mocked OpenAI client (mirrors test_run_agent's fixture)
    so we can stage a stub + continuation pair on .chat.completions.create.

    While the agent is built, ``run_agent.get_tool_definitions`` returns an
    empty list, ``run_agent.check_toolset_requirements`` returns an empty
    dict, and ``run_agent.OpenAI`` is replaced by a mock.
    """
    from run_agent import AIAgent

    with (
        patch("run_agent.get_tool_definitions", return_value=[]),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
    ):
        a = AIAgent(
            api_key="test-key-1234567890",
            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
        )
        # Swap in a controllable client; tests stage responses on
        # a.client.chat.completions.create.
        a.client = MagicMock()
        a._cached_system_prompt = "You are helpful."
        # Turn off optional behaviours so the run stays minimal.
        a._use_prompt_caching = False
        a.tool_delay = 0
        a.compression_enabled = False
        a.save_trajectories = False
        return a
class TestConversationLoopPartialStreamContinuation:
    """End-to-end: a partial-stream stub feeds the loop and the loop
    asks for continuation instead of exiting with finish_reason=stop."""

    def test_partial_stream_stub_does_not_exit_loop_immediately(self, loop_agent):
        """The stub from chat_completion_helpers used to exit the loop with
        text_response(finish_reason=stop). Now finish_reason=length routes
        through length_continue_retries — the loop persists the partial
        content and asks the model to continue."""
        from tests.run_agent.test_run_agent import _mock_response, _mock_assistant_msg

        create_mock = loop_agent.client.chat.completions.create

        # First API call: the partial-stream stub (length on partial-stream-stub id).
        partial_stub = SimpleNamespace(
            id=PARTIAL_STREAM_STUB_ID,
            model="test/model",
            choices=[SimpleNamespace(
                index=0,
                message=_mock_assistant_msg(content="The first half of "),
                finish_reason=FINISH_REASON_LENGTH,
            )],
            usage=None,
        )
        # Second API call: model continues with the rest, clean stop.
        continuation = _mock_response(
            content="the answer is forty-two.", finish_reason="stop",
        )
        create_mock.side_effect = [partial_stub, continuation]

        with (
            patch.object(loop_agent, "_persist_session"),
            patch.object(loop_agent, "_save_trajectory"),
            patch.object(loop_agent, "_cleanup_task_resources"),
        ):
            result = loop_agent.run_conversation("ask me something")

        # The loop made TWO API calls (stub + continuation), not one.
        assert create_mock.call_count == 2, (
            "Partial-stream-stub must trigger a continuation API call, not "
            "exit the loop after one call."
        )

        # The continuation prompt the loop appended must be the network-error
        # variant, not the "output length limit" lie — otherwise the model
        # no-ops with "I wasn't truncated, I'm done."
        # We assert it indirectly by inspecting the second call's arguments.
        second_call = create_mock.call_args_list[1]
        msgs = second_call.kwargs.get("messages")
        if not msgs and second_call.args:
            # Fall back to a positional request dict when ``messages`` was
            # not passed as a keyword.
            msgs = second_call.args[0].get("messages")
        # Fail with a readable message rather than an IndexError when the
        # call carried no messages in either form.
        assert msgs, "Second API call carried no 'messages' to inspect."

        last_user = next(
            (m for m in reversed(msgs) if m.get("role") == "user"), None,
        )
        assert last_user is not None
        assert "network error mid-stream" in (last_user.get("content") or ""), (
            "Continuation prompt for partial-stream-stub must mention the "
            "network error, not the 'output length limit'."
        )

        # And the final response stitches both halves together.
        assert "first half of" in result["final_response"]
        assert "forty-two" in result["final_response"]