From 75ed07ace82a4bc05458ff827f4ce3750af7a323 Mon Sep 17 00:00:00 2001 From: joaomarcos Date: Fri, 19 Jun 2026 16:26:23 -0700 Subject: [PATCH] fix(gateway): break the restart loop at the source on session resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a tool call itself restarts the gateway (docker restart, systemctl restart, and similar), the process is terminated mid-call — before the tool result is persisted and before the orderly drain rewind can run. The transcript tail is left as an assistant(tool_calls) with no matching tool answer. On resume the model re-issues the unanswered call, taking the gateway down again — an infinite loop (#49201). Source fix: _build_gateway_agent_history now strips a trailing assistant(tool_calls) block that has no tool answers (_strip_dangling_tool_call_tail), so there is nothing for the model to re-execute. This complements _strip_interrupted_tool_tails, which only handles the case where a tool result row exists with an interrupt marker. Cognitive backstop: the resume-pending system note now states that any restart command in the history already ran and must not be re-executed or verified, and the empty-message auto-resume startup turn reports recovery and asks for instructions instead of the nonsensical "address the user's NEW message" (there is no new message on that turn). Reimplements the intent of #49243 by @JoaoMarcos44 at the replay layer. Fixes #49201 --- gateway/run.py | 77 +++++++++++++++++-- tests/gateway/test_auto_continue.py | 80 ++++++++++++++++++++ tests/gateway/test_restart_resume_pending.py | 63 +++++++++++++-- 3 files changed, 208 insertions(+), 12 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 2672ab43e95..673ec3e3994 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -805,6 +805,13 @@ def _build_gateway_agent_history( # tools that were killed mid-flight. agent_history = _strip_interrupted_tool_tails(agent_history) + # Strip a dangling assistant(tool_calls) tail with no tool answers — + # the signature of a SIGKILL mid-tool-call (e.g. the tool itself ran + # `docker restart`/`kill` and took the gateway down before the result + # was persisted). Without this the model re-issues the unanswered call + # on resume and loops the restart forever (#49201). + agent_history = _strip_dangling_tool_call_tail(agent_history) + observed_context = "\n".join(observed_group_context).strip() or None return agent_history, observed_context @@ -930,6 +937,50 @@ def _strip_interrupted_tool_tails( return cleaned +def _strip_dangling_tool_call_tail( + agent_history: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Strip a trailing ``assistant(tool_calls)`` block left with NO answers. + + When a tool call itself kills the gateway process (``docker restart``, + ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process + is terminated by SIGKILL *mid-call* — before the tool result is ever + written and before the orderly shutdown rewind + (``_drop_trailing_empty_response_scaffolding``) can run. The last thing + persisted is the ``assistant`` message that issued the ``tool_calls``, + with zero matching ``tool`` rows. + + On resume the model sees an unanswered tool call at the tail and naturally + re-issues it — which restarts the gateway again, producing the infinite + reboot loop in #49201. ``_strip_interrupted_tool_tails`` does not catch + this because there is no tool result to inspect for an interrupt marker. + + This strips that dangling tail at the source so there is nothing for the + model to re-execute. It only acts when the tail is an + ``assistant(tool_calls)`` whose calls have NO corresponding ``tool`` + results — a completed assistant→tool pair (any tool answers present) is + left untouched so genuine mid-progress tool loops still resume. + """ + if not agent_history: + return agent_history + + last = agent_history[-1] + if not ( + isinstance(last, dict) + and last.get("role") == "assistant" + and last.get("tool_calls") + ): + return agent_history + + logger.debug( + "Stripping dangling unanswered assistant(tool_calls) tail " + "(%d call(s)) — process likely killed mid-tool-call by a " + "restart/shutdown command (#49201)", + len(last.get("tool_calls") or []), + ) + return agent_history[:-1] + + _AUTO_CONTINUE_NOTE_PREFIX = "[System note: Your previous turn" _AUTO_CONTINUE_FALLBACK_PREFIX = "[System note: A new message" @@ -15701,14 +15752,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew else "a gateway interruption" ) _persist_user_message_override = message + # The empty-message case is the auto-resume startup turn + # synthesized by _schedule_resume_pending_sessions — there is + # no NEW user message to address, so tell the model to report + # recovery instead of the (nonexistent) "new message". + if message: + _resume_guidance = ( + "Address the user's NEW message below FIRST and focus " + "on what the user is asking now." + ) + else: + _resume_guidance = ( + "Report to the user that the session was restored " + "successfully and ask what they would like to do next." + ) message = ( - f"[System note: A new message has arrived. The previous turn " - f"was interrupted by {_reason_phrase}. " - f"Address the user's NEW message below FIRST. " + f"[System note: The previous turn was interrupted by " + f"{_reason_phrase}; the gateway is now back online. " + f"Any restart/shutdown command in the history has already " + f"run — do NOT re-execute or verify it. {_resume_guidance} " f"Do NOT re-execute old tool calls — skip any unfinished " - f"work from the conversation history and focus on what the " - f"user is asking now.]\n\n" - + message + f"work from the conversation history.]" + + (f"\n\n{message}" if message else "") ) elif _has_fresh_tool_tail: _persist_user_message_override = message diff --git a/tests/gateway/test_auto_continue.py b/tests/gateway/test_auto_continue.py index de3b738944b..c1917a971a9 100644 --- a/tests/gateway/test_auto_continue.py +++ b/tests/gateway/test_auto_continue.py @@ -165,6 +165,86 @@ class TestInterruptedReplayFiltering: assert agent_history[-1]["role"] == "tool" assert agent_history[-1]["content"] == "deployed successfully" + def test_dangling_unanswered_tool_call_tail_is_removed(self): + """A trailing assistant(tool_calls) with NO tool answers is stripped. + + This is the SIGKILL signature from #49201: the tool itself ran a + restart/shutdown command and killed the gateway before its result was + persisted. The transcript tail is an assistant message with tool_calls + and zero matching tool rows. Without stripping it, the model re-issues + the unanswered call on resume and loops the restart forever. + """ + from gateway.run import _build_gateway_agent_history + + history = [ + {"role": "user", "content": "restart the container"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_1", + "function": { + "name": "terminal", + "arguments": '{"command": "docker restart hermes-agent"}', + }, + }, + ], + }, + ] + + agent_history, _observed_context = _build_gateway_agent_history(history) + + assert agent_history == [{"role": "user", "content": "restart the container"}] + + def test_dangling_tail_after_completed_pair_is_removed_only_at_tail(self): + """Only the trailing unanswered tool-call block is stripped. + + An earlier completed assistant→tool pair must survive — we only drop + the final assistant(tool_calls) that has no answers. + """ + from gateway.run import _build_gateway_agent_history + + history = [ + {"role": "user", "content": "do two things"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + {"id": "call_1", "function": {"name": "web_search", "arguments": "{}"}}, + ], + }, + {"role": "tool", "tool_call_id": "call_1", "content": "found it"}, + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_2", + "function": { + "name": "terminal", + "arguments": '{"command": "systemctl restart hermes"}', + }, + }, + ], + }, + ] + + agent_history, _observed_context = _build_gateway_agent_history(history) + + # The completed call_1 pair survives; the dangling call_2 tail is gone. + assert agent_history[-1]["role"] == "tool" + assert agent_history[-1]["content"] == "found it" + # The surviving assistant(tool_calls) is the completed call_1 (which + # has a matching tool answer), not the stripped dangling call_2. + _surviving_calls = [ + tc.get("id") + for m in agent_history + if m.get("role") == "assistant" and m.get("tool_calls") + for tc in m["tool_calls"] + ] + assert _surviving_calls == ["call_1"] + def test_persisted_auto_continue_note_is_not_replayed(self): from gateway.run import _build_gateway_agent_history diff --git a/tests/gateway/test_restart_resume_pending.py b/tests/gateway/test_restart_resume_pending.py index 0974b26b4ec..0151551695b 100644 --- a/tests/gateway/test_restart_resume_pending.py +++ b/tests/gateway/test_restart_resume_pending.py @@ -153,14 +153,24 @@ def _simulate_note_injection( if reason == "shutdown_timeout" else "a gateway interruption" ) + if message: + resume_guidance = ( + "Address the user's NEW message below FIRST and focus " + "on what the user is asking now." + ) + else: + resume_guidance = ( + "Report to the user that the session was restored " + "successfully and ask what they would like to do next." + ) message = ( - f"[System note: A new message has arrived. The previous turn " - f"was interrupted by {reason_phrase}. " - f"Address the user's NEW message below FIRST. " + f"[System note: The previous turn was interrupted by " + f"{reason_phrase}; the gateway is now back online. " + f"Any restart/shutdown command in the history has already " + f"run — do NOT re-execute or verify it. {resume_guidance} " f"Do NOT re-execute old tool calls — skip any unfinished " - f"work from the conversation history and focus on what the " - f"user is asking now.]\n\n" - + message + f"work from the conversation history.]" + + (f"\n\n{message}" if message else "") ) elif has_fresh_tool_tail: message = ( @@ -654,6 +664,47 @@ class TestResumePendingSystemNote: result = _simulate_note_injection(history, "ping", resume_entry=None) assert result == "ping" + def test_resume_pending_note_warns_against_reexecuting_restart(self): + """The resume-pending note tells the model any restart/shutdown + command in the history already ran and must not be re-executed or + verified — the cognitive backstop to the source-level tail strip. + """ + entry = self._pending_entry(reason="restart_timeout") + result = _simulate_note_injection( + history=[ + {"role": "assistant", "content": "in progress", "timestamp": time.time()}, + ], + user_message="restarted!", + resume_entry=entry, + ) + assert "[System note:" in result + assert "back online" in result + assert "already" in result and "do NOT re-execute or verify" in result + assert "restarted!" in result + + def test_resume_pending_empty_message_reports_recovery(self): + """On the empty-message auto-resume startup turn there is no NEW user + message, so the note instructs the model to report recovery and ask + for instructions rather than 'address the user's NEW message'. + """ + entry = self._pending_entry(reason="restart_timeout") + result = _simulate_note_injection( + history=[ + {"role": "assistant", "content": "in progress", "timestamp": time.time()}, + ], + user_message="", + resume_entry=entry, + ) + assert "[System note:" in result + assert "gateway restart" in result + assert "restored successfully" in result + assert "ask what they would like to do next" in result + assert "do NOT re-execute or verify" in result + # No phantom "NEW message" instruction when there is no new message. + assert "NEW message" not in result + # Nothing appended after the closing bracket (no empty user text). + assert result.rstrip().endswith("]") + # --------------------------------------------------------------------------- # Freshness helpers