fix(gateway): break the restart loop at the source on session resume

When a tool call itself restarts the gateway (docker restart, systemctl
restart, and similar), the process is terminated mid-call — before the
tool result is persisted and before the orderly drain rewind can run. The
transcript tail is left as an assistant(tool_calls) with no matching tool
answer. On resume the model re-issues the unanswered call, taking the
gateway down again — an infinite loop (#49201).

Source fix: _build_gateway_agent_history now strips a trailing
assistant(tool_calls) block that has no tool answers
(_strip_dangling_tool_call_tail), so there is nothing for the model to
re-execute. This complements _strip_interrupted_tool_tails, which only
handles the case where a tool result row exists with an interrupt marker.

Cognitive backstop: the resume-pending system note now states that any
restart command in the history already ran and must not be re-executed or
verified, and the empty-message auto-resume startup turn reports recovery
and asks for instructions instead of the nonsensical "address the user's
NEW message" (there is no new message on that turn).

Reimplements the intent of #49243 by @JoaoMarcos44 at the replay layer.

Fixes #49201
This commit is contained in:
joaomarcos 2026-06-19 16:26:23 -07:00 committed by Teknium
parent 6504f51cd5
commit 75ed07ace8
3 changed files with 208 additions and 12 deletions

View file

@ -805,6 +805,13 @@ def _build_gateway_agent_history(
# tools that were killed mid-flight.
agent_history = _strip_interrupted_tool_tails(agent_history)
# Strip a dangling assistant(tool_calls) tail with no tool answers —
# the signature of a SIGKILL mid-tool-call (e.g. the tool itself ran
# `docker restart`/`kill` and took the gateway down before the result
# was persisted). Without this the model re-issues the unanswered call
# on resume and loops the restart forever (#49201).
agent_history = _strip_dangling_tool_call_tail(agent_history)
observed_context = "\n".join(observed_group_context).strip() or None
return agent_history, observed_context
@ -930,6 +937,50 @@ def _strip_interrupted_tool_tails(
return cleaned
def _strip_dangling_tool_call_tail(
agent_history: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
When a tool call itself kills the gateway process (``docker restart``,
``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
is terminated by SIGKILL *mid-call* before the tool result is ever
written and before the orderly shutdown rewind
(``_drop_trailing_empty_response_scaffolding``) can run. The last thing
persisted is the ``assistant`` message that issued the ``tool_calls``,
with zero matching ``tool`` rows.
On resume the model sees an unanswered tool call at the tail and naturally
re-issues it which restarts the gateway again, producing the infinite
reboot loop in #49201. ``_strip_interrupted_tool_tails`` does not catch
this because there is no tool result to inspect for an interrupt marker.
This strips that dangling tail at the source so there is nothing for the
model to re-execute. It only acts when the tail is an
``assistant(tool_calls)`` whose calls have NO corresponding ``tool``
results a completed assistanttool pair (any tool answers present) is
left untouched so genuine mid-progress tool loops still resume.
"""
if not agent_history:
return agent_history
last = agent_history[-1]
if not (
isinstance(last, dict)
and last.get("role") == "assistant"
and last.get("tool_calls")
):
return agent_history
logger.debug(
"Stripping dangling unanswered assistant(tool_calls) tail "
"(%d call(s)) — process likely killed mid-tool-call by a "
"restart/shutdown command (#49201)",
len(last.get("tool_calls") or []),
)
return agent_history[:-1]
_AUTO_CONTINUE_NOTE_PREFIX = "[System note: Your previous turn"
_AUTO_CONTINUE_FALLBACK_PREFIX = "[System note: A new message"
@ -15701,14 +15752,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
else "a gateway interruption"
)
_persist_user_message_override = message
# The empty-message case is the auto-resume startup turn
# synthesized by _schedule_resume_pending_sessions — there is
# no NEW user message to address, so tell the model to report
# recovery instead of the (nonexistent) "new message".
if message:
_resume_guidance = (
"Address the user's NEW message below FIRST and focus "
"on what the user is asking now."
)
else:
_resume_guidance = (
"Report to the user that the session was restored "
"successfully and ask what they would like to do next."
)
message = (
f"[System note: A new message has arrived. The previous turn "
f"was interrupted by {_reason_phrase}. "
f"Address the user's NEW message below FIRST. "
f"[System note: The previous turn was interrupted by "
f"{_reason_phrase}; the gateway is now back online. "
f"Any restart/shutdown command in the history has already "
f"run — do NOT re-execute or verify it. {_resume_guidance} "
f"Do NOT re-execute old tool calls — skip any unfinished "
f"work from the conversation history and focus on what the "
f"user is asking now.]\n\n"
+ message
f"work from the conversation history.]"
+ (f"\n\n{message}" if message else "")
)
elif _has_fresh_tool_tail:
_persist_user_message_override = message

View file

@ -165,6 +165,86 @@ class TestInterruptedReplayFiltering:
assert agent_history[-1]["role"] == "tool"
assert agent_history[-1]["content"] == "deployed successfully"
def test_dangling_unanswered_tool_call_tail_is_removed(self):
"""A trailing assistant(tool_calls) with NO tool answers is stripped.
This is the SIGKILL signature from #49201: the tool itself ran a
restart/shutdown command and killed the gateway before its result was
persisted. The transcript tail is an assistant message with tool_calls
and zero matching tool rows. Without stripping it, the model re-issues
the unanswered call on resume and loops the restart forever.
"""
from gateway.run import _build_gateway_agent_history
history = [
{"role": "user", "content": "restart the container"},
{
"role": "assistant",
"content": None,
"tool_calls": [
{
"id": "call_1",
"function": {
"name": "terminal",
"arguments": '{"command": "docker restart hermes-agent"}',
},
},
],
},
]
agent_history, _observed_context = _build_gateway_agent_history(history)
assert agent_history == [{"role": "user", "content": "restart the container"}]
def test_dangling_tail_after_completed_pair_is_removed_only_at_tail(self):
"""Only the trailing unanswered tool-call block is stripped.
An earlier completed assistanttool pair must survive we only drop
the final assistant(tool_calls) that has no answers.
"""
from gateway.run import _build_gateway_agent_history
history = [
{"role": "user", "content": "do two things"},
{
"role": "assistant",
"content": None,
"tool_calls": [
{"id": "call_1", "function": {"name": "web_search", "arguments": "{}"}},
],
},
{"role": "tool", "tool_call_id": "call_1", "content": "found it"},
{
"role": "assistant",
"content": None,
"tool_calls": [
{
"id": "call_2",
"function": {
"name": "terminal",
"arguments": '{"command": "systemctl restart hermes"}',
},
},
],
},
]
agent_history, _observed_context = _build_gateway_agent_history(history)
# The completed call_1 pair survives; the dangling call_2 tail is gone.
assert agent_history[-1]["role"] == "tool"
assert agent_history[-1]["content"] == "found it"
# The surviving assistant(tool_calls) is the completed call_1 (which
# has a matching tool answer), not the stripped dangling call_2.
_surviving_calls = [
tc.get("id")
for m in agent_history
if m.get("role") == "assistant" and m.get("tool_calls")
for tc in m["tool_calls"]
]
assert _surviving_calls == ["call_1"]
def test_persisted_auto_continue_note_is_not_replayed(self):
from gateway.run import _build_gateway_agent_history

View file

@ -153,14 +153,24 @@ def _simulate_note_injection(
if reason == "shutdown_timeout"
else "a gateway interruption"
)
if message:
resume_guidance = (
"Address the user's NEW message below FIRST and focus "
"on what the user is asking now."
)
else:
resume_guidance = (
"Report to the user that the session was restored "
"successfully and ask what they would like to do next."
)
message = (
f"[System note: A new message has arrived. The previous turn "
f"was interrupted by {reason_phrase}. "
f"Address the user's NEW message below FIRST. "
f"[System note: The previous turn was interrupted by "
f"{reason_phrase}; the gateway is now back online. "
f"Any restart/shutdown command in the history has already "
f"run — do NOT re-execute or verify it. {resume_guidance} "
f"Do NOT re-execute old tool calls — skip any unfinished "
f"work from the conversation history and focus on what the "
f"user is asking now.]\n\n"
+ message
f"work from the conversation history.]"
+ (f"\n\n{message}" if message else "")
)
elif has_fresh_tool_tail:
message = (
@ -654,6 +664,47 @@ class TestResumePendingSystemNote:
result = _simulate_note_injection(history, "ping", resume_entry=None)
assert result == "ping"
def test_resume_pending_note_warns_against_reexecuting_restart(self):
"""The resume-pending note tells the model any restart/shutdown
command in the history already ran and must not be re-executed or
verified the cognitive backstop to the source-level tail strip.
"""
entry = self._pending_entry(reason="restart_timeout")
result = _simulate_note_injection(
history=[
{"role": "assistant", "content": "in progress", "timestamp": time.time()},
],
user_message="restarted!",
resume_entry=entry,
)
assert "[System note:" in result
assert "back online" in result
assert "already" in result and "do NOT re-execute or verify" in result
assert "restarted!" in result
def test_resume_pending_empty_message_reports_recovery(self):
"""On the empty-message auto-resume startup turn there is no NEW user
message, so the note instructs the model to report recovery and ask
for instructions rather than 'address the user's NEW message'.
"""
entry = self._pending_entry(reason="restart_timeout")
result = _simulate_note_injection(
history=[
{"role": "assistant", "content": "in progress", "timestamp": time.time()},
],
user_message="",
resume_entry=entry,
)
assert "[System note:" in result
assert "gateway restart" in result
assert "restored successfully" in result
assert "ask what they would like to do next" in result
assert "do NOT re-execute or verify" in result
# No phantom "NEW message" instruction when there is no new message.
assert "NEW message" not in result
# Nothing appended after the closing bracket (no empty user text).
assert result.rstrip().endswith("]")
# ---------------------------------------------------------------------------
# Freshness helpers