mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-30 06:41:51 +00:00
* fix(streaming): route mid-tool-call partial-stream-stub through length continuation (#31998) When a stream stalls mid-tool-call (e.g. a large write_file), the partial-stream-stub recovery used finish_reason='stop' which caused the conversation loop to treat the turn as complete, returning only the warning text. When users said 'continue', the model retried the same large tool call, hit the same stale timeout, and looped indefinitely. Changes: - chat_completion_helpers.py: change _stub_finish_reason from 'stop' to 'length' for mid-tool-call partials. The stub still has tool_calls=None so no tool auto-executes — the model gets a fresh API call through the existing length-continuation machinery (bounded to 3 retries). Also attach _dropped_tool_names to the stub for downstream use. - conversation_loop.py: add a third continuation prompt branch for partial-stream-stubs with dropped tool calls. Instead of the generic 'continue where you left off' (which would retry the same large call), tell the model to break the output into smaller tool calls (~8K tokens each) to avoid stream timeouts. - test_partial_stream_finish_reason.py: update existing test from finish_reason='stop' to 'length', add _dropped_tool_names assertion, add new test_dropped_tool_call_uses_chunking_prompt for the 3-way prompt branching. Safety: tool_calls=None is preserved on the stub, so the conversation loop enters the text-continuation branch (line 1513), NOT the tool-call execution branch (line 3246). No tool auto-executes. The model simply gets another API call with targeted guidance. 
* refactor: extract constants and continuation prompt helper - Move magic strings to hermes_constants.py (PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH) - Extract _get_continuation_prompt() in conversation_loop.py — DRYs the 3-way prompt branching and lets tests import the real function - Trim verbose inline comments in chat_completion_helpers.py - Tests import constants + helper instead of duplicating logic --------- Co-authored-by: alt-glitch <balyan.sid@gmail.com>
269 lines
12 KiB
Python
269 lines
12 KiB
Python
"""Regression tests for issue #30963 — partial-stream stub finish_reason.

Pins the contract:
  - text-only partial stream → stub.finish_reason == "length" so the
    conversation loop's existing length-continuation path can keep the
    agent moving against an unfinished goal.
  - partial mid-tool-call → stub.finish_reason == "length" so the loop
    triggers continuation machinery with targeted chunking guidance
    instead of ending the turn immediately.
  - conversation_loop's length-continuation prompt distinguishes a real
    output-length truncation from a partial-stream-stub network error
    via response.id.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from types import SimpleNamespace
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from hermes_constants import PARTIAL_STREAM_STUB_ID, FINISH_REASON_LENGTH
|
|
from agent.conversation_loop import _get_continuation_prompt
|
|
|
|
|
|
# ── Helpers (mirrors test_streaming.py) ────────────────────────────────────
|
|
|
|
def _make_stream_chunk(content=None, tool_calls=None, finish_reason=None):
    """Build a fake streaming chunk carrying a single choice.

    Args:
        content: Text placed on ``choices[0].delta.content``.
        tool_calls: Value placed on ``choices[0].delta.tool_calls`` (the
            tests in this module pass a list of tool-call deltas).
        finish_reason: Value placed on ``choices[0].finish_reason``.

    Returns:
        A ``SimpleNamespace`` with ``choices`` (one choice at index 0),
        ``model=None`` and ``usage=None``. The delta's
        ``reasoning_content`` and ``reasoning`` attributes are always None.
    """
    delta = SimpleNamespace(
        content=content,
        tool_calls=tool_calls,
        reasoning_content=None,
        reasoning=None,
    )
    choice = SimpleNamespace(index=0, delta=delta, finish_reason=finish_reason)
    return SimpleNamespace(choices=[choice], model=None, usage=None)
|
|
|
|
|
|
def _make_tool_call_delta(index=0, tc_id=None, name=None, arguments=None):
    """Build a fake tool-call delta for use inside a stream chunk.

    Args:
        index: Stored as ``.index`` on the returned object.
        tc_id: Stored as ``.id`` on the returned object.
        name: Stored as ``.function.name``.
        arguments: Stored as ``.function.arguments`` (the tests in this
            module pass a fragment of a JSON argument string).

    Returns:
        A ``SimpleNamespace`` with ``index``, ``id`` and a nested
        ``function`` namespace holding ``name`` and ``arguments``.
    """
    func = SimpleNamespace(name=name, arguments=arguments)
    return SimpleNamespace(index=index, id=tc_id, function=func)
|
|
|
|
|
|
def _make_agent():
    """Construct an ``AIAgent`` configured for the streaming stub tests.

    The agent is built with a dummy API key, an example base URL and a
    placeholder model name, with ``quiet_mode``, ``skip_context_files`` and
    ``skip_memory`` all enabled. After construction ``api_mode`` is forced
    to ``"chat_completions"`` and ``_interrupt_requested`` is set to False.

    Returns:
        The configured ``AIAgent`` instance.
    """
    # Imported inside the function, as the original does, rather than at
    # module import time.
    from run_agent import AIAgent

    agent = AIAgent(
        api_key="test-key",
        base_url="https://example.com/v1",
        model="test/model",
        quiet_mode=True,
        skip_context_files=True,
        skip_memory=True,
    )
    agent.api_mode = "chat_completions"
    agent._interrupt_requested = False
    return agent
|
|
|
|
|
|
# ── Stub finish_reason ────────────────────────────────────────────────────
|
|
|
|
class TestPartialStreamStubFinishReason:
    """The stub returned by interruptible_streaming_api_call when the
    upstream connection dies mid-flight."""

    # NOTE: stacked ``@patch`` decorators are applied bottom-up, so the mock
    # for ``_close_request_openai_client`` is injected first and the mock for
    # ``_create_request_openai_client`` second.

    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
    def test_text_only_partial_returns_length(self, _mock_close, mock_create, monkeypatch):
        """#30963: text-only partials must classify as length so the loop
        keeps continuing instead of exiting with budget remaining."""

        def _stalling_stream():
            # One text chunk, then the simulated upstream failure.
            yield _make_stream_chunk(content="Here's my answer so far")
            raise RuntimeError("simulated upstream stall")

        mock_client = MagicMock()
        # A fresh generator per create() call, so every call stalls the same way.
        mock_client.chat.completions.create.side_effect = lambda *a, **kw: _stalling_stream()
        mock_create.return_value = mock_client

        agent = _make_agent()
        agent._current_streamed_assistant_text = "Here's my answer so far"

        # Presumably disables stream retries so the stall surfaces the stub
        # straight away — confirm against the retry logic in the agent.
        monkeypatch.setenv("HERMES_STREAM_RETRIES", "0")
        response = agent._interruptible_streaming_api_call({})

        assert response.id == PARTIAL_STREAM_STUB_ID
        assert response.choices[0].finish_reason == FINISH_REASON_LENGTH, (
            "Text-only partial streams must use finish_reason=length so the "
            "conversation loop continues from where the network died "
            "(issue #30963)."
        )
        assert response.choices[0].message.content == "Here's my answer so far"
        assert response.choices[0].message.tool_calls is None

    @patch("run_agent.AIAgent._create_request_openai_client")
    @patch("run_agent.AIAgent._close_request_openai_client")
    def test_partial_tool_call_uses_length(self, _mock_close, mock_create, monkeypatch):
        """Mid-tool-call partials now use finish_reason=length so the
        conversation loop's continuation machinery fires — bounded 3-retry
        with guidance to break output into smaller chunks (#31998).
        tool_calls=None is preserved, so no tool auto-executes."""

        def _stalling_stream():
            # Text, then a tool call whose JSON arguments are cut off
            # mid-object before the simulated failure.
            yield _make_stream_chunk(content="Let me write the audit: ")
            yield _make_stream_chunk(tool_calls=[
                _make_tool_call_delta(index=0, tc_id="call_1", name="write_file"),
            ])
            yield _make_stream_chunk(tool_calls=[
                _make_tool_call_delta(index=0, arguments='{"path": "/tmp/x", '),
            ])
            raise RuntimeError("simulated upstream stall")

        mock_client = MagicMock()
        # A fresh generator per create() call, so every call stalls the same way.
        mock_client.chat.completions.create.side_effect = lambda *a, **kw: _stalling_stream()
        mock_create.return_value = mock_client

        agent = _make_agent()
        # Replace the stream-delta callback with a no-op for this test.
        agent._fire_stream_delta = lambda text: None
        agent._current_streamed_assistant_text = "Let me write the audit: "

        # Presumably disables stream retries so the stall surfaces the stub
        # straight away — confirm against the retry logic in the agent.
        monkeypatch.setenv("HERMES_STREAM_RETRIES", "0")
        response = agent._interruptible_streaming_api_call({})

        assert response.id == PARTIAL_STREAM_STUB_ID
        assert response.choices[0].finish_reason == FINISH_REASON_LENGTH, (
            "Partial mid-tool-call must use finish_reason=length so the "
            "continuation machinery fires instead of ending the turn "
            "immediately (#31998)."
        )
        assert response.choices[0].message.tool_calls is None, (
            "tool_calls must remain None (no auto-execution of side-effectful "
            "tool calls)."
        )
        # The stub should carry dropped tool names for continuation prompt
        assert getattr(response, "_dropped_tool_names", None) == ["write_file"]
        content = response.choices[0].message.content or ""
        assert "Stream stalled mid tool-call" in content
        assert "write_file" in content
|
|
|
|
|
|
# ── Length-continuation prompt branching ──────────────────────────────────
|
|
|
|
class TestLengthContinuationPromptBranching:
    """When finish_reason=length, the continuation prompt that reaches the
    model has to tell the truth: real truncation vs. network interruption
    vs. dropped tool call (#31998). Three distinct prompts now exist."""

    def _simulate_branch(self, response_id: str, dropped_tools=None) -> str:
        """Return the continuation prompt text the loop would inject for
        a `finish_reason=length` response with the given id.

        The id is compared against ``PARTIAL_STREAM_STUB_ID`` and the
        resulting flag, together with ``dropped_tools``, is passed to
        ``_get_continuation_prompt``.
        """
        is_partial = response_id == PARTIAL_STREAM_STUB_ID
        return _get_continuation_prompt(is_partial, dropped_tools)

    def test_partial_stream_stub_uses_network_prompt(self):
        """A stub id with no dropped tools yields the network-error prompt."""
        prompt = self._simulate_branch(PARTIAL_STREAM_STUB_ID)
        assert "network error mid-stream" in prompt
        assert "output length limit" not in prompt

    def test_real_truncation_uses_length_prompt(self):
        """An ordinary response id yields the output-length prompt."""
        prompt = self._simulate_branch("chatcmpl-abc123")
        assert "output length limit" in prompt
        assert "network error" not in prompt

    def test_no_id_falls_through_to_length_prompt(self):
        """An empty id is not the stub id, so the length prompt is used."""
        prompt = self._simulate_branch("")
        assert "output length limit" in prompt

    def test_dropped_tool_call_uses_chunking_prompt(self):
        """When the stub dropped a tool call, the continuation prompt
        must guide the model to break its output into smaller chunks
        instead of retrying the same large tool call (#31998)."""
        prompt = self._simulate_branch(
            PARTIAL_STREAM_STUB_ID, dropped_tools=["write_file"],
        )
        assert "too large" in prompt
        assert "break" in prompt.lower()
        assert "write_file" in prompt
        assert "network error" not in prompt
        assert "output length limit" not in prompt
|
|
|
|
|
|
# ── Integration: live conversation loop ───────────────────────────────────
|
|
|
|
@pytest.fixture()
def loop_agent():
    """AIAgent with a mocked OpenAI client (mirrors test_run_agent's fixture)
    so we can stage a stub + continuation pair on .chat.completions.create.

    While the agent is constructed, ``run_agent.get_tool_definitions`` is
    patched to return an empty list, ``run_agent.check_toolset_requirements``
    to return an empty dict, and ``run_agent.OpenAI`` is replaced by a mock.
    The returned agent has ``client`` swapped for a ``MagicMock`` and several
    attributes overridden (system prompt, prompt caching, tool delay,
    compression, trajectory saving).
    """
    from run_agent import AIAgent

    with (
        patch("run_agent.get_tool_definitions", return_value=[]),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
    ):
        a = AIAgent(
            api_key="test-key-1234567890",
            base_url="https://openrouter.ai/api/v1",
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
        )
        # Tests stage responses on a.client.chat.completions.create.
        a.client = MagicMock()
        a._cached_system_prompt = "You are helpful."
        a._use_prompt_caching = False
        a.tool_delay = 0
        a.compression_enabled = False
        a.save_trajectories = False
        return a
|
|
|
|
|
|
class TestConversationLoopPartialStreamContinuation:
    """End-to-end: a partial-stream stub feeds the loop and the loop
    asks for continuation instead of exiting with finish_reason=stop."""

    def test_partial_stream_stub_does_not_exit_loop_immediately(self, loop_agent):
        """The stub from chat_completion_helpers used to exit the loop with
        text_response(finish_reason=stop). Now finish_reason=length routes
        through length_continue_retries — the loop persists the partial
        content and asks the model to continue."""

        from tests.run_agent.test_run_agent import _mock_response, _mock_assistant_msg

        # First API call: the partial-stream stub (length on partial-stream-stub id).
        partial_stub = SimpleNamespace(
            id=PARTIAL_STREAM_STUB_ID,
            model="test/model",
            choices=[SimpleNamespace(
                index=0,
                message=_mock_assistant_msg(content="The first half of "),
                finish_reason=FINISH_REASON_LENGTH,
            )],
            usage=None,
        )
        # Second API call: model continues with the rest, clean stop.
        continuation = _mock_response(
            content="the answer is forty-two.", finish_reason="stop",
        )

        loop_agent.client.chat.completions.create.side_effect = [
            partial_stub, continuation,
        ]

        # Replace the three named agent methods with mocks for the run.
        with (
            patch.object(loop_agent, "_persist_session"),
            patch.object(loop_agent, "_save_trajectory"),
            patch.object(loop_agent, "_cleanup_task_resources"),
        ):
            result = loop_agent.run_conversation("ask me something")

        # The loop made TWO API calls (stub + continuation), not one.
        assert loop_agent.client.chat.completions.create.call_count == 2, (
            "Partial-stream-stub must trigger a continuation API call, not "
            "exit the loop after one call."
        )
        # The continuation prompt the loop appended must be the network-error
        # variant, not the "output length limit" lie — otherwise the model
        # no-ops with "I wasn't truncated, I'm done."
        # We assert it indirectly by inspecting the second call's arguments.
        second_call = loop_agent.client.chat.completions.create.call_args_list[1]
        # Messages may arrive as a keyword argument or inside a positional
        # request dict; fail with a clear message if neither carries them
        # instead of raising IndexError/TypeError further down.
        msgs = second_call.kwargs.get("messages")
        if not msgs and second_call.args:
            msgs = second_call.args[0].get("messages")
        assert msgs, "Second API call carried no messages to inspect."
        last_user = next(
            (m for m in reversed(msgs) if m.get("role") == "user"), None,
        )
        assert last_user is not None
        assert "network error mid-stream" in (last_user.get("content") or ""), (
            "Continuation prompt for partial-stream-stub must mention the "
            "network error, not the 'output length limit'."
        )

        # And the final response stitches both halves together.
        assert "first half of" in result["final_response"]
        assert "forty-two" in result["final_response"]
|