fix(gateway): break the restart loop at the source on session resume

When a tool call itself restarts the gateway (docker restart, systemctl restart, and similar), the process is terminated mid-call — before the tool result is persisted and before the orderly drain rewind can run. The transcript tail is left as an assistant(tool_calls) with no matching tool answer. On resume the model re-issues the unanswered call, taking the gateway down again — an infinite loop (#49201). Source fix: _build_gateway_agent_history now strips a trailing assistant(tool_calls) block that has no tool answers (_strip_dangling_tool_call_tail), so there is nothing for the model to re-execute. This complements _strip_interrupted_tool_tails, which only handles the case where a tool result row exists with an interrupt marker. Cognitive backstop: the resume-pending system note now states that any restart command in the history already ran and must not be re-executed or verified, and the empty-message auto-resume startup turn reports recovery and asks for instructions instead of the nonsensical "address the user's NEW message" (there is no new message on that turn). Reimplements the intent of #49243 by @JoaoMarcos44 at the replay layer. Fixes #49201
2026-06-21 10:22:18 +00:00 · 2026-06-19 16:26:23 -07:00 · 2026-06-19 16:26:23 -07:00 · 75ed07ace8
commit 75ed07ace8
parent 6504f51cd5
3 changed files with 208 additions and 12 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -805,6 +805,13 @@ def _build_gateway_agent_history(
    # tools that were killed mid-flight.
    agent_history = _strip_interrupted_tool_tails(agent_history)

+    # Strip a dangling assistant(tool_calls) tail with no tool answers —
+    # the signature of a SIGKILL mid-tool-call (e.g. the tool itself ran
+    # `docker restart`/`kill` and took the gateway down before the result
+    # was persisted). Without this the model re-issues the unanswered call
+    # on resume and loops the restart forever (#49201).
+    agent_history = _strip_dangling_tool_call_tail(agent_history)
+
    observed_context = "\n".join(observed_group_context).strip() or None
    return agent_history, observed_context

@ -930,6 +937,50 @@ def _strip_interrupted_tool_tails(
    return cleaned


+def _strip_dangling_tool_call_tail(
+    agent_history: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Strip a trailing ``assistant(tool_calls)`` block left with NO answers.
+
+    When a tool call itself kills the gateway process (``docker restart``,
+    ``systemctl restart``, ``kill``, ``hermes gateway restart``), the process
+    dies *mid-call* — before the tool result is written and before the
+    orderly shutdown rewind (``_drop_trailing_empty_response_scaffolding``)
+    can run.  The last row persisted is the ``assistant`` message that issued
+    the ``tool_calls``, with zero matching ``tool`` rows.  On resume the model
+    re-issues that unanswered call and restarts the gateway again (#49201).
+    ``_strip_interrupted_tool_tails`` does not catch this because there is no
+    tool result to inspect for an interrupt marker.
+
+    Only the final entry is inspected: it is dropped when it is a dict with
+    ``role == "assistant"`` and a non-empty ``tool_calls``.  Any text content
+    on that message goes with it.  A tail that ends in a ``tool`` row is left
+    untouched so genuine mid-progress tool loops still resume.
+
+    Returns:
+        ``agent_history`` itself when nothing is stripped; otherwise a new
+        list without the final entry.  The input list is never mutated.
+    """
+    if not agent_history:
+        return agent_history
+
+    last = agent_history[-1]  # only the final entry is ever examined
+    if not (
+        isinstance(last, dict)
+        and last.get("role") == "assistant"
+        and last.get("tool_calls")
+    ):
+        return agent_history
+
+    logger.debug(
+        "Stripping dangling unanswered assistant(tool_calls) tail "
+        "(%d call(s)) — process likely killed mid-tool-call by a "
+        "restart/shutdown command (#49201)",
+        len(last.get("tool_calls") or []),
+    )
+    return agent_history[:-1]  # slice copy; the caller's list is left intact
+
+
 _AUTO_CONTINUE_NOTE_PREFIX = "[System note: Your previous turn"
 _AUTO_CONTINUE_FALLBACK_PREFIX = "[System note: A new message"

@ -15701,14 +15752,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                    else "a gateway interruption"
                )
                _persist_user_message_override = message
+                # The empty-message case is the auto-resume startup turn
+                # synthesized by _schedule_resume_pending_sessions — there is
+                # no NEW user message to address, so tell the model to report
+                # recovery instead of the (nonexistent) "new message".
+                if message:
+                    _resume_guidance = (
+                        "Address the user's NEW message below FIRST and focus "
+                        "on what the user is asking now."
+                    )
+                else:
+                    _resume_guidance = (
+                        "Report to the user that the session was restored "
+                        "successfully and ask what they would like to do next."
+                    )
                message = (
-                    f"[System note: A new message has arrived. The previous turn "
-                    f"was interrupted by {_reason_phrase}. "
-                    f"Address the user's NEW message below FIRST. "
+                    f"[System note: The previous turn was interrupted by "
+                    f"{_reason_phrase}; the gateway is now back online. "
+                    f"Any restart/shutdown command in the history has already "
+                    f"run — do NOT re-execute or verify it. {_resume_guidance} "
                    f"Do NOT re-execute old tool calls — skip any unfinished "
-                    f"work from the conversation history and focus on what the "
-                    f"user is asking now.]\n\n"
-                    + message
+                    f"work from the conversation history.]"
+                    + (f"\n\n{message}" if message else "")
                )
            elif _has_fresh_tool_tail:
                _persist_user_message_override = message
--- a/tests/gateway/test_auto_continue.py
+++ b/tests/gateway/test_auto_continue.py
@ -165,6 +165,86 @@ class TestInterruptedReplayFiltering:
        assert agent_history[-1]["role"] == "tool"
        assert agent_history[-1]["content"] == "deployed successfully"

+    def test_dangling_unanswered_tool_call_tail_is_removed(self):
+        """A trailing assistant(tool_calls) with NO tool answers is stripped.
+
+        This is the SIGKILL signature from #49201: the tool itself ran a
+        restart/shutdown command and killed the gateway before its result was
+        persisted. The transcript tail is an assistant message with tool_calls
+        and zero matching tool rows. Without stripping it, the model re-issues
+        the unanswered call on resume and loops the restart forever.
+        """
+        from gateway.run import _build_gateway_agent_history
+        # Fixture: a user turn, then an assistant tool call with no tool row.
+        history = [
+            {"role": "user", "content": "restart the container"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_1",
+                        "function": {
+                            "name": "terminal",
+                            "arguments": '{"command": "docker restart hermes-agent"}',
+                        },
+                    },
+                ],
+            },
+        ]
+
+        agent_history, _observed_context = _build_gateway_agent_history(history)
+        # Only the user turn is expected back; the unanswered call is dropped.
+        assert agent_history == [{"role": "user", "content": "restart the container"}]
+
+    def test_dangling_tail_after_completed_pair_is_removed_only_at_tail(self):
+        """Only the trailing unanswered tool-call block is stripped.
+
+        An earlier completed assistant→tool pair must survive — we only drop
+        the final assistant(tool_calls) that has no answers.
+        """
+        from gateway.run import _build_gateway_agent_history
+        # Fixture: call_1 has a tool row after it; call_2 is the unanswered tail.
+        history = [
+            {"role": "user", "content": "do two things"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {"id": "call_1", "function": {"name": "web_search", "arguments": "{}"}},
+                ],
+            },
+            {"role": "tool", "tool_call_id": "call_1", "content": "found it"},
+            {
+                "role": "assistant",
+                "content": None,
+                "tool_calls": [
+                    {
+                        "id": "call_2",
+                        "function": {
+                            "name": "terminal",
+                            "arguments": '{"command": "systemctl restart hermes"}',
+                        },
+                    },
+                ],
+            },
+        ]
+
+        agent_history, _observed_context = _build_gateway_agent_history(history)
+
+        # The completed call_1 pair survives; the dangling call_2 tail is gone.
+        assert agent_history[-1]["role"] == "tool"
+        assert agent_history[-1]["content"] == "found it"
+        # The surviving assistant(tool_calls) is the completed call_1 (which
+        # has a matching tool answer), not the stripped dangling call_2.
+        _surviving_calls = [
+            tc.get("id")
+            for m in agent_history
+            if m.get("role") == "assistant" and m.get("tool_calls")
+            for tc in m["tool_calls"]
+        ]
+        assert _surviving_calls == ["call_1"]
+
    def test_persisted_auto_continue_note_is_not_replayed(self):
        from gateway.run import _build_gateway_agent_history

--- a/tests/gateway/test_restart_resume_pending.py
+++ b/tests/gateway/test_restart_resume_pending.py
@ -153,14 +153,24 @@ def _simulate_note_injection(
            if reason == "shutdown_timeout"
            else "a gateway interruption"
        )
+        if message:
+            resume_guidance = (
+                "Address the user's NEW message below FIRST and focus "
+                "on what the user is asking now."
+            )
+        else:
+            resume_guidance = (
+                "Report to the user that the session was restored "
+                "successfully and ask what they would like to do next."
+            )
        message = (
-            f"[System note: A new message has arrived. The previous turn "
-            f"was interrupted by {reason_phrase}. "
-            f"Address the user's NEW message below FIRST. "
+            f"[System note: The previous turn was interrupted by "
+            f"{reason_phrase}; the gateway is now back online. "
+            f"Any restart/shutdown command in the history has already "
+            f"run — do NOT re-execute or verify it. {resume_guidance} "
            f"Do NOT re-execute old tool calls — skip any unfinished "
-            f"work from the conversation history and focus on what the "
-            f"user is asking now.]\n\n"
-            + message
+            f"work from the conversation history.]"
+            + (f"\n\n{message}" if message else "")
        )
    elif has_fresh_tool_tail:
        message = (
@ -654,6 +664,47 @@ class TestResumePendingSystemNote:
        result = _simulate_note_injection(history, "ping", resume_entry=None)
        assert result == "ping"

+    def test_resume_pending_note_warns_against_reexecuting_restart(self):
+        """The resume-pending note tells the model any restart/shutdown
+        command in the history already ran and must not be re-executed or
+        verified — the cognitive backstop to the source-level tail strip.
+        """
+        entry = self._pending_entry(reason="restart_timeout")
+        result = _simulate_note_injection(
+            history=[
+                {"role": "assistant", "content": "in progress", "timestamp": time.time()},
+            ],
+            user_message="restarted!",
+            resume_entry=entry,
+        )
+        assert "[System note:" in result  # a note was injected at all
+        assert "back online" in result  # recovery is stated explicitly
+        assert "already" in result and "do NOT re-execute or verify" in result
+        assert "restarted!" in result  # the user's own text is still present
+
+    def test_resume_pending_empty_message_reports_recovery(self):
+        """On the empty-message auto-resume startup turn there is no NEW user
+        message, so the note instructs the model to report recovery and ask
+        for instructions rather than 'address the user's NEW message'.
+        """
+        entry = self._pending_entry(reason="restart_timeout")
+        result = _simulate_note_injection(
+            history=[
+                {"role": "assistant", "content": "in progress", "timestamp": time.time()},
+            ],
+            user_message="",  # empty text stands in for the auto-resume turn
+            resume_entry=entry,
+        )
+        assert "[System note:" in result
+        assert "gateway restart" in result  # presumably the restart_timeout phrase; mapping not shown here
+        assert "restored successfully" in result
+        assert "ask what they would like to do next" in result
+        assert "do NOT re-execute or verify" in result
+        # No phantom "NEW message" instruction when there is no new message.
+        assert "NEW message" not in result
+        # Nothing appended after the closing bracket (no empty user text).
+        assert result.rstrip().endswith("]")
+

 # ---------------------------------------------------------------------------
 # Freshness helpers