mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-21 10:22:18 +00:00
fix(gateway): break the restart loop at the source on session resume
When a tool call itself restarts the gateway (docker restart, systemctl restart, and similar), the process is terminated mid-call — before the tool result is persisted and before the orderly drain rewind can run. The transcript tail is left as an assistant(tool_calls) with no matching tool answer. On resume the model re-issues the unanswered call, taking the gateway down again — an infinite loop (#49201). Source fix: _build_gateway_agent_history now strips a trailing assistant(tool_calls) block that has no tool answers (_strip_dangling_tool_call_tail), so there is nothing for the model to re-execute. This complements _strip_interrupted_tool_tails, which only handles the case where a tool result row exists with an interrupt marker. Cognitive backstop: the resume-pending system note now states that any restart command in the history already ran and must not be re-executed or verified, and the empty-message auto-resume startup turn reports recovery and asks for instructions instead of the nonsensical "address the user's NEW message" (there is no new message on that turn). Reimplements the intent of #49243 by @JoaoMarcos44 at the replay layer. Fixes #49201
This commit is contained in:
parent
6504f51cd5
commit
75ed07ace8
3 changed files with 208 additions and 12 deletions
|
|
@ -805,6 +805,13 @@ def _build_gateway_agent_history(
|
|||
# tools that were killed mid-flight.
|
||||
agent_history = _strip_interrupted_tool_tails(agent_history)
|
||||
|
||||
# Strip a dangling assistant(tool_calls) tail with no tool answers —
|
||||
# the signature of a SIGKILL mid-tool-call (e.g. the tool itself ran
|
||||
# `docker restart`/`kill` and took the gateway down before the result
|
||||
# was persisted). Without this the model re-issues the unanswered call
|
||||
# on resume and loops the restart forever (#49201).
|
||||
agent_history = _strip_dangling_tool_call_tail(agent_history)
|
||||
|
||||
observed_context = "\n".join(observed_group_context).strip() or None
|
||||
return agent_history, observed_context
|
||||
|
||||
|
|
@ -930,6 +937,50 @@ def _strip_interrupted_tool_tails(
|
|||
return cleaned
|
||||
|
||||
|
||||
def _strip_dangling_tool_call_tail(
|
||||
agent_history: List[Dict[str, Any]],
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
|
||||
|
||||
When a tool call itself kills the gateway process (``docker restart``,
|
||||
``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
|
||||
is terminated by SIGKILL *mid-call* — before the tool result is ever
|
||||
written and before the orderly shutdown rewind
|
||||
(``_drop_trailing_empty_response_scaffolding``) can run. The last thing
|
||||
persisted is the ``assistant`` message that issued the ``tool_calls``,
|
||||
with zero matching ``tool`` rows.
|
||||
|
||||
On resume the model sees an unanswered tool call at the tail and naturally
|
||||
re-issues it — which restarts the gateway again, producing the infinite
|
||||
reboot loop in #49201. ``_strip_interrupted_tool_tails`` does not catch
|
||||
this because there is no tool result to inspect for an interrupt marker.
|
||||
|
||||
This strips that dangling tail at the source so there is nothing for the
|
||||
model to re-execute. It only acts when the tail is an
|
||||
``assistant(tool_calls)`` whose calls have NO corresponding ``tool``
|
||||
results — a completed assistant→tool pair (any tool answers present) is
|
||||
left untouched so genuine mid-progress tool loops still resume.
|
||||
"""
|
||||
if not agent_history:
|
||||
return agent_history
|
||||
|
||||
last = agent_history[-1]
|
||||
if not (
|
||||
isinstance(last, dict)
|
||||
and last.get("role") == "assistant"
|
||||
and last.get("tool_calls")
|
||||
):
|
||||
return agent_history
|
||||
|
||||
logger.debug(
|
||||
"Stripping dangling unanswered assistant(tool_calls) tail "
|
||||
"(%d call(s)) — process likely killed mid-tool-call by a "
|
||||
"restart/shutdown command (#49201)",
|
||||
len(last.get("tool_calls") or []),
|
||||
)
|
||||
return agent_history[:-1]
|
||||
|
||||
|
||||
# Leading text of a system note that opens with "Your previous turn".
# NOTE(review): the code that consumes this prefix is outside this excerpt —
# presumably it is used to recognise auto-continue notes; confirm.
_AUTO_CONTINUE_NOTE_PREFIX = "[System note: Your previous turn"
|
||||
# Leading text of a system note that opens with "A new message".
# NOTE(review): the resume-pending note built in GatewayRunner appears to open
# with "[System note: The previous turn", which matches neither this prefix
# nor the "Your previous turn" one — confirm whether prefix-based detection
# still needs to recognise that note.
_AUTO_CONTINUE_FALLBACK_PREFIX = "[System note: A new message"
|
||||
|
||||
|
|
@ -15701,14 +15752,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
else "a gateway interruption"
|
||||
)
|
||||
_persist_user_message_override = message
|
||||
# The empty-message case is the auto-resume startup turn
|
||||
# synthesized by _schedule_resume_pending_sessions — there is
|
||||
# no NEW user message to address, so tell the model to report
|
||||
# recovery instead of the (nonexistent) "new message".
|
||||
if message:
|
||||
_resume_guidance = (
|
||||
"Address the user's NEW message below FIRST and focus "
|
||||
"on what the user is asking now."
|
||||
)
|
||||
else:
|
||||
_resume_guidance = (
|
||||
"Report to the user that the session was restored "
|
||||
"successfully and ask what they would like to do next."
|
||||
)
|
||||
message = (
|
||||
f"[System note: A new message has arrived. The previous turn "
|
||||
f"was interrupted by {_reason_phrase}. "
|
||||
f"Address the user's NEW message below FIRST. "
|
||||
f"[System note: The previous turn was interrupted by "
|
||||
f"{_reason_phrase}; the gateway is now back online. "
|
||||
f"Any restart/shutdown command in the history has already "
|
||||
f"run — do NOT re-execute or verify it. {_resume_guidance} "
|
||||
f"Do NOT re-execute old tool calls — skip any unfinished "
|
||||
f"work from the conversation history and focus on what the "
|
||||
f"user is asking now.]\n\n"
|
||||
+ message
|
||||
f"work from the conversation history.]"
|
||||
+ (f"\n\n{message}" if message else "")
|
||||
)
|
||||
elif _has_fresh_tool_tail:
|
||||
_persist_user_message_override = message
|
||||
|
|
|
|||
|
|
@ -165,6 +165,86 @@ class TestInterruptedReplayFiltering:
|
|||
assert agent_history[-1]["role"] == "tool"
|
||||
assert agent_history[-1]["content"] == "deployed successfully"
|
||||
|
||||
def test_dangling_unanswered_tool_call_tail_is_removed(self):
    """An unanswered assistant(tool_calls) at the very end must be dropped.

    Reproduces the transcript shape from #49201: the tool call ran a
    restart/shutdown command that killed the gateway before any tool
    result was persisted, so the history ends on an assistant message with
    tool_calls and no tool rows.  If that tail were replayed, the model
    would re-issue the call on resume and restart the gateway again.
    """
    from gateway.run import _build_gateway_agent_history

    user_turn = {"role": "user", "content": "restart the container"}
    restart_call = {
        "id": "call_1",
        "function": {
            "name": "terminal",
            "arguments": '{"command": "docker restart hermes-agent"}',
        },
    }
    unanswered_turn = {
        "role": "assistant",
        "content": None,
        "tool_calls": [restart_call],
    }
    history = [user_turn, unanswered_turn]

    agent_history, _observed_context = _build_gateway_agent_history(history)

    # Only the user's request survives; the dangling call is gone.
    assert agent_history == [{"role": "user", "content": "restart the container"}]
|
||||
|
||||
def test_dangling_tail_after_completed_pair_is_removed_only_at_tail(self):
    """Stripping is limited to the final unanswered tool-call message.

    A previous assistant→tool exchange that did receive its answer has to
    be preserved; only the last assistant(tool_calls) without any answer
    is removed.
    """
    from gateway.run import _build_gateway_agent_history

    answered_call = {
        "id": "call_1",
        "function": {"name": "web_search", "arguments": "{}"},
    }
    dangling_call = {
        "id": "call_2",
        "function": {
            "name": "terminal",
            "arguments": '{"command": "systemctl restart hermes"}',
        },
    }
    history = [
        {"role": "user", "content": "do two things"},
        {"role": "assistant", "content": None, "tool_calls": [answered_call]},
        {"role": "tool", "tool_call_id": "call_1", "content": "found it"},
        {"role": "assistant", "content": None, "tool_calls": [dangling_call]},
    ]

    agent_history, _observed_context = _build_gateway_agent_history(history)

    # The history now ends on the answer to call_1 — call_2's message is gone.
    final_message = agent_history[-1]
    assert final_message["role"] == "tool"
    assert final_message["content"] == "found it"

    # Collect every tool-call id still present on assistant messages: only
    # the answered call_1 may remain, never the stripped call_2.
    surviving_ids = []
    for entry in agent_history:
        if entry.get("role") == "assistant" and entry.get("tool_calls"):
            surviving_ids.extend(call.get("id") for call in entry["tool_calls"])
    assert surviving_ids == ["call_1"]
|
||||
|
||||
def test_persisted_auto_continue_note_is_not_replayed(self):
|
||||
from gateway.run import _build_gateway_agent_history
|
||||
|
||||
|
|
|
|||
|
|
@ -153,14 +153,24 @@ def _simulate_note_injection(
|
|||
if reason == "shutdown_timeout"
|
||||
else "a gateway interruption"
|
||||
)
|
||||
if message:
|
||||
resume_guidance = (
|
||||
"Address the user's NEW message below FIRST and focus "
|
||||
"on what the user is asking now."
|
||||
)
|
||||
else:
|
||||
resume_guidance = (
|
||||
"Report to the user that the session was restored "
|
||||
"successfully and ask what they would like to do next."
|
||||
)
|
||||
message = (
|
||||
f"[System note: A new message has arrived. The previous turn "
|
||||
f"was interrupted by {reason_phrase}. "
|
||||
f"Address the user's NEW message below FIRST. "
|
||||
f"[System note: The previous turn was interrupted by "
|
||||
f"{reason_phrase}; the gateway is now back online. "
|
||||
f"Any restart/shutdown command in the history has already "
|
||||
f"run — do NOT re-execute or verify it. {resume_guidance} "
|
||||
f"Do NOT re-execute old tool calls — skip any unfinished "
|
||||
f"work from the conversation history and focus on what the "
|
||||
f"user is asking now.]\n\n"
|
||||
+ message
|
||||
f"work from the conversation history.]"
|
||||
+ (f"\n\n{message}" if message else "")
|
||||
)
|
||||
elif has_fresh_tool_tail:
|
||||
message = (
|
||||
|
|
@ -654,6 +664,47 @@ class TestResumePendingSystemNote:
|
|||
result = _simulate_note_injection(history, "ping", resume_entry=None)
|
||||
assert result == "ping"
|
||||
|
||||
def test_resume_pending_note_warns_against_reexecuting_restart(self):
    """The resume-pending note forbids re-running or verifying a restart.

    It must say that any restart/shutdown command found in the history has
    already run — the cognitive backstop that complements stripping the
    dangling tool-call tail — while still carrying the user's new text.
    """
    pending = self._pending_entry(reason="restart_timeout")
    prior_turns = [
        {"role": "assistant", "content": "in progress", "timestamp": time.time()},
    ]

    note = _simulate_note_injection(
        history=prior_turns,
        user_message="restarted!",
        resume_entry=pending,
    )

    required_fragments = (
        "[System note:",
        "back online",
        "already",
        "do NOT re-execute or verify",
        "restarted!",
    )
    for fragment in required_fragments:
        assert fragment in note
|
||||
|
||||
def test_resume_pending_empty_message_reports_recovery(self):
    """With an empty user message the note asks the model to report recovery.

    The auto-resume startup turn carries no NEW user message, so the note
    should tell the model to announce that the session was restored and
    ask for instructions, not to 'address the user's NEW message'.
    """
    pending = self._pending_entry(reason="restart_timeout")
    prior_turns = [
        {"role": "assistant", "content": "in progress", "timestamp": time.time()},
    ]

    note = _simulate_note_injection(
        history=prior_turns,
        user_message="",
        resume_entry=pending,
    )

    required_fragments = (
        "[System note:",
        "gateway restart",
        "restored successfully",
        "ask what they would like to do next",
        "do NOT re-execute or verify",
    )
    for fragment in required_fragments:
        assert fragment in note

    # There is no new message, so the note must not mention one.
    assert "NEW message" not in note
    # The note is the whole result: no empty user text trails the bracket.
    assert note.rstrip().endswith("]")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Freshness helpers
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue