fix(api-server): collapse tool start/lifecycle into a single SSE event

Address Copilot review on PR #16666:

1. **Duplicate event on every tool start** — both ``tool_progress_callback``
   and ``tool_start_callback`` fire side-by-side in ``run_agent.py``, so
   wiring both into chat completions emitted *two* ``hermes.tool.progress``
   events per real tool call. Drop the legacy ``_on_tool_progress`` emit
   entirely; ``_on_tool_start`` now produces a single unified event that
   carries the legacy ``tool``/``emoji``/``label`` fields plus the new
   ``toolCallId``/``status`` correlation fields. Label is computed inline
   via ``build_tool_preview`` so callers do not need to pre-format it.

2. **Weak per-event correlation in the regression test** — the previous
   assertion checked that a ``toolCallId`` appeared *somewhere* in the
   aggregate, which would have passed even if ``running`` lacked the id.
   Collect ``(status, toolCallId)`` per event and assert each event
   carries the correct pair, plus exactly two events on the wire (no
   silent duplication regression).

The two existing chat-completions tool-progress tests are updated to fire
``tool_start_callback`` instead of ``tool_progress_callback``, matching
production reality where ``run_agent`` always pairs them.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
briandevans 2026-04-27 11:26:36 -07:00 committed by Teknium
parent 13c238327e
commit e0a03f3f40
2 changed files with 197 additions and 37 deletions

View file

@ -688,17 +688,17 @@ class TestChatCompletionsEndpoint:
@pytest.mark.asyncio
async def test_stream_includes_tool_progress(self, adapter):
"""tool_progress_callback fires → progress appears as custom SSE event, not in delta.content."""
"""tool_start_callback fires → progress appears as custom SSE event, not in delta.content."""
import asyncio
app = _create_app(adapter)
async with TestClient(TestServer(app)) as cli:
async def _mock_run_agent(**kwargs):
cb = kwargs.get("stream_delta_callback")
tp_cb = kwargs.get("tool_progress_callback")
# Simulate tool progress before streaming content
if tp_cb:
tp_cb("tool.started", "terminal", "ls -la", {"command": "ls -la"})
ts_cb = kwargs.get("tool_start_callback")
# Simulate the structured tool start the gateway now consumes.
if ts_cb:
ts_cb("call_terminal_1", "terminal", {"command": "ls -la"})
if cb:
await asyncio.sleep(0.05)
cb("Here are the files.")
@ -724,7 +724,10 @@ class TestChatCompletionsEndpoint:
# markers instead of calling tools (#6972).
assert "event: hermes.tool.progress" in body
assert '"tool": "terminal"' in body
assert '"label": "ls -la"' in body
# ``label`` is now derived by ``build_tool_preview`` from the
# tool args rather than passed by the caller, so we assert
# only that *some* label exists rather than a literal value.
assert '"label":' in body
# The progress marker must NOT appear inside any
# chat.completion.chunk delta.content field.
import json as _json
@ -744,17 +747,17 @@ class TestChatCompletionsEndpoint:
@pytest.mark.asyncio
async def test_stream_tool_progress_skips_internal_events(self, adapter):
"""Internal events (name starting with _) are not streamed."""
"""Internal tool calls (name starting with ``_``) are not streamed."""
import asyncio
app = _create_app(adapter)
async with TestClient(TestServer(app)) as cli:
async def _mock_run_agent(**kwargs):
cb = kwargs.get("stream_delta_callback")
tp_cb = kwargs.get("tool_progress_callback")
if tp_cb:
tp_cb("tool.started", "_thinking", "some internal state", {})
tp_cb("tool.started", "web_search", "Python docs", {"query": "Python docs"})
ts_cb = kwargs.get("tool_start_callback")
if ts_cb:
ts_cb("call_internal_1", "_thinking", {"text": "some internal state"})
ts_cb("call_search_1", "web_search", {"query": "Python docs"})
if cb:
await asyncio.sleep(0.05)
cb("Found it.")
@ -776,10 +779,142 @@ class TestChatCompletionsEndpoint:
body = await resp.text()
# Internal _thinking event should NOT appear anywhere
assert "some internal state" not in body
assert "call_internal_1" not in body
# Real tool progress should appear as custom SSE event
assert "event: hermes.tool.progress" in body
assert '"tool": "web_search"' in body
assert '"label": "Python docs"' in body
# Label is derived from the args dict by build_tool_preview;
# asserting on the structural fact (label exists, call id
# is correlated) rather than a literal preview string keeps
# the test robust against preview-formatter tweaks.
assert '"label":' in body
assert '"toolCallId": "call_search_1"' in body
@pytest.mark.asyncio
async def test_stream_emits_tool_lifecycle_with_call_id(self, adapter):
"""Regression for #16588.
``/v1/chat/completions`` streaming previously emitted only a
``tool.started``-style ``hermes.tool.progress`` event; clients
rendering tool lifecycle UI had no way to mark a tool as finished
because no matching ``status: completed`` event was emitted, and
no ``toolCallId`` was carried for correlation.
The fix adds ``tool_start_callback`` / ``tool_complete_callback``
to the chat completions agent invocation and writes both halves
of the lifecycle pair on the same ``event: hermes.tool.progress``
SSE line, with stable ``toolCallId`` and ``status``.
"""
import asyncio
import json as _json
app = _create_app(adapter)
async with TestClient(TestServer(app)) as cli:
async def _mock_run_agent(**kwargs):
cb = kwargs.get("stream_delta_callback")
ts_cb = kwargs.get("tool_start_callback")
tc_cb = kwargs.get("tool_complete_callback")
# The structured callbacks own the chat-completions SSE
# channel now; ``tool_progress_callback`` is intentionally
# not wired so each tool start emits exactly one event.
if ts_cb:
ts_cb("call_terminal_1", "terminal", {"command": "ls -la"})
if tc_cb:
tc_cb("call_terminal_1", "terminal", {"command": "ls -la"}, "ok")
if cb:
await asyncio.sleep(0.05)
cb("done.")
return (
{"final_response": "done.", "messages": [], "api_calls": 1},
{"input_tokens": 1, "output_tokens": 1, "total_tokens": 2},
)
with patch.object(adapter, "_run_agent", side_effect=_mock_run_agent):
resp = await cli.post(
"/v1/chat/completions",
json={
"model": "test",
"messages": [{"role": "user", "content": "list"}],
"stream": True,
},
)
assert resp.status == 200
body = await resp.text()
# Walk the SSE body and collect *(status, toolCallId)* pairs
# per event so the assertions verify per-event correlation —
# an event missing ``toolCallId`` would not pass even if a
# different event happens to carry the right id.
pairs: list[tuple[str | None, str | None]] = []
lines = body.splitlines()
for i, line in enumerate(lines):
if line.strip() != "event: hermes.tool.progress":
continue
for follow in lines[i + 1: i + 4]:
if follow.startswith("data: "):
try:
payload = _json.loads(follow[len("data: "):])
except _json.JSONDecodeError:
break
pairs.append((payload.get("status"), payload.get("toolCallId")))
break
# Each tool start must emit exactly one event (no duplicate
# legacy + new emit), and each lifecycle pair must carry the
# same toolCallId on every event — not just somewhere in the
# aggregate.
assert len(pairs) == 2, f"expected 2 events (running+completed), got {pairs}"
assert pairs[0] == ("running", "call_terminal_1"), pairs
assert pairs[1] == ("completed", "call_terminal_1"), pairs
@pytest.mark.asyncio
async def test_stream_tool_lifecycle_skips_internal_and_orphan_completes(self, adapter):
"""Internal tools (``_thinking``-style) and ``completed`` events
without a prior matching ``running`` must produce no lifecycle
events on the wire otherwise clients would see orphaned
``status: completed`` updates they cannot correlate."""
import asyncio
app = _create_app(adapter)
async with TestClient(TestServer(app)) as cli:
async def _mock_run_agent(**kwargs):
cb = kwargs.get("stream_delta_callback")
ts_cb = kwargs.get("tool_start_callback")
tc_cb = kwargs.get("tool_complete_callback")
# Internal tool — must be filtered.
if ts_cb:
ts_cb("call_internal_1", "_thinking", {})
if tc_cb:
tc_cb("call_internal_1", "_thinking", {}, "")
# Completion without start — orphan, must be dropped.
if tc_cb:
tc_cb("call_orphan_1", "web_search", {}, "ok")
if cb:
await asyncio.sleep(0.05)
cb("ok.")
return (
{"final_response": "ok.", "messages": [], "api_calls": 1},
{"input_tokens": 1, "output_tokens": 1, "total_tokens": 2},
)
with patch.object(adapter, "_run_agent", side_effect=_mock_run_agent):
resp = await cli.post(
"/v1/chat/completions",
json={
"model": "test",
"messages": [{"role": "user", "content": "ok"}],
"stream": True,
},
)
assert resp.status == 200
body = await resp.text()
# Neither the internal call_id nor the orphan call_id should
# surface as a lifecycle payload on the wire.
assert "call_internal_1" not in body
assert "call_orphan_1" not in body
assert '"status": "running"' not in body
assert '"status": "completed"' not in body
@pytest.mark.asyncio
async def test_no_user_message_returns_400(self, adapter):