From e7475b15829faa47bf99dd1ebc8d7370e81ddf6a Mon Sep 17 00:00:00 2001
From: Teknium <teknium1@gmail.com>
Date: Tue, 14 Apr 2026 16:55:30 -0700
Subject: [PATCH] feat: auto-continue interrupted agent work after gateway
 restart (#4493)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the gateway restarts mid-agent-work, the session transcript ends
on a tool result the agent never processed. Previously, the user had
to type 'continue' or use /retry (which replays from scratch, losing
all prior work).

Now, when the next user message arrives and the loaded history ends
with role='tool', a system note is prepended to that message:

  [System note: Your previous turn was interrupted before you could
  process the last tool result(s). The conversation history contains
  tool outputs you haven't responded to yet. Please finish processing
  those results and summarize what was accomplished, then address the
  user's new message below.]

This is injected in _run_agent()'s run_sync closure, right before
calling agent.run_conversation(). The agent sees the full history
(including the pending tool results) and the system note, so it can
summarize what was accomplished and then handle the user's new input.

Design decisions:
- No new session flags or schema changes — purely detects trailing
  tool messages in the loaded history
- Works for any restart scenario (clean, crash, SIGTERM, drain timeout)
  as long as the session wasn't suspended (suspended = fresh start)
- The user's actual message is preserved after the note
- If the session WAS suspended (unclean shutdown), the old history is
  abandoned and the user starts fresh — no false auto-continue

Also updates the shutdown notification message from 'Use /retry after
restart to continue' to 'Send any message after restart to resume
where it left off' — which is now accurate.

Test plan:
- 6 new auto-continue tests (trailing tool detection, no false
  positives for assistant/user/empty history, multi-tool, message
  preservation)
- All 13 restart drain tests pass (updated /retry assertion)
---
 gateway/run.py                      | 17 +++++-
 tests/gateway/test_auto_continue.py | 95 +++++++++++++++++++++++++++++
 tests/gateway/test_restart_drain.py |  2 +-
 3 files changed, 112 insertions(+), 2 deletions(-)
 create mode 100644 tests/gateway/test_auto_continue.py

diff --git a/gateway/run.py b/gateway/run.py
index 5c3e5f13c..a83fa2eed 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1405,7 +1405,7 @@ class GatewayRunner:
         action = "restarting" if self._restart_requested else "shutting down"
         hint = (
             "Your current task will be interrupted. "
-            "Use /retry after restart to continue."
+            "Send any message after restart to resume where it left off."
             if self._restart_requested
             else "Your current task will be interrupted."
         )
@@ -8450,6 +8450,21 @@ class GatewayRunner:
             if _msn:
                 message = _msn + "\n\n" + message
 
+            # Auto-continue: if the loaded history ends with a tool result,
+            # the previous agent turn was interrupted mid-work (gateway
+            # restart, crash, SIGTERM).  Prepend a system note so the model
+            # finishes processing the pending tool results before addressing
+            # the user's new message.  (#4493)
+            if agent_history and agent_history[-1].get("role") == "tool":
+                message = (
+                    "[System note: Your previous turn was interrupted before you could "
+                    "process the last tool result(s). The conversation history contains "
+                    "tool outputs you haven't responded to yet. Please finish processing "
+                    "those results and summarize what was accomplished, then address the "
+                    "user's new message below.]\n\n"
+                    + message
+                )
+
             _approval_session_key = session_key or ""
             _approval_session_token = set_current_session_key(_approval_session_key)
             register_gateway_notify(_approval_session_key, _approval_notify_sync)
diff --git a/tests/gateway/test_auto_continue.py b/tests/gateway/test_auto_continue.py
new file mode 100644
index 000000000..1f44fa6ab
--- /dev/null
+++ b/tests/gateway/test_auto_continue.py
@@ -0,0 +1,95 @@
+"""Tests for the auto-continue feature (#4493).
+
+When the gateway restarts mid-agent-work, the session transcript ends on a
+tool result that the agent never processed.  The auto-continue logic detects
+this and prepends a system note to the next user message so the model
+finishes the interrupted work before addressing the new input.
+"""
+
+import pytest
+
+
+def _simulate_auto_continue(agent_history: list, user_message: str) -> str:
+    """Reproduce the auto-continue injection logic from _run_agent().
+
+    This mirrors the exact code in gateway/run.py so we can test the
+    detection and message transformation without spinning up a full
+    gateway runner.
+    """
+    message = user_message
+    if agent_history and agent_history[-1].get("role") == "tool":
+        message = (
+            "[System note: Your previous turn was interrupted before you could "
+            "process the last tool result(s). The conversation history contains "
+            "tool outputs you haven't responded to yet. Please finish processing "
+            "those results and summarize what was accomplished, then address the "
+            "user's new message below.]\n\n"
+            + message
+        )
+    return message
+
+
+class TestAutoDetection:
+    """Test that trailing tool results are correctly detected."""
+
+    def test_trailing_tool_result_triggers_note(self):
+        history = [
+            {"role": "user", "content": "deploy the app"},
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"id": "call_1", "function": {"name": "terminal", "arguments": "{}"}}
+            ]},
+            {"role": "tool", "tool_call_id": "call_1", "content": "deployed successfully"},
+        ]
+        result = _simulate_auto_continue(history, "what happened?")
+        assert "[System note:" in result
+        assert "interrupted" in result
+        assert "what happened?" in result
+
+    def test_trailing_assistant_message_no_note(self):
+        history = [
+            {"role": "user", "content": "hello"},
+            {"role": "assistant", "content": "Hi there!"},
+        ]
+        result = _simulate_auto_continue(history, "how are you?")
+        assert "[System note:" not in result
+        assert result == "how are you?"
+
+    def test_empty_history_no_note(self):
+        result = _simulate_auto_continue([], "hello")
+        assert result == "hello"
+
+    def test_trailing_user_message_no_note(self):
+        """Shouldn't happen in practice, but ensure no false positive."""
+        history = [
+            {"role": "user", "content": "hello"},
+        ]
+        result = _simulate_auto_continue(history, "hello again")
+        assert result == "hello again"
+
+    def test_multiple_tool_results_still_triggers(self):
+        """Multiple tool calls in a row — last one is still role=tool."""
+        history = [
+            {"role": "user", "content": "search and read"},
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"id": "call_1", "function": {"name": "search", "arguments": "{}"}},
+                {"id": "call_2", "function": {"name": "read", "arguments": "{}"}},
+            ]},
+            {"role": "tool", "tool_call_id": "call_1", "content": "found it"},
+            {"role": "tool", "tool_call_id": "call_2", "content": "file content here"},
+        ]
+        result = _simulate_auto_continue(history, "continue")
+        assert "[System note:" in result
+
+    def test_original_message_preserved_after_note(self):
+        """The user's actual message must appear after the system note."""
+        history = [
+            {"role": "assistant", "content": None, "tool_calls": [
+                {"id": "c1", "function": {"name": "t", "arguments": "{}"}}
+            ]},
+            {"role": "tool", "tool_call_id": "c1", "content": "done"},
+        ]
+        result = _simulate_auto_continue(history, "now do X")
+        # System note comes first, then user's message
+        note_end = result.index("]\n\n")
+        user_msg_start = result.index("now do X")
+        assert user_msg_start > note_end
diff --git a/tests/gateway/test_restart_drain.py b/tests/gateway/test_restart_drain.py
index 732470c12..3607b1e39 100644
--- a/tests/gateway/test_restart_drain.py
+++ b/tests/gateway/test_restart_drain.py
@@ -193,7 +193,7 @@ async def test_shutdown_notification_says_restarting_when_restart_requested():
 
     assert len(adapter.sent) == 1
     assert "restarting" in adapter.sent[0]
-    assert "/retry" in adapter.sent[0]
+    assert "resume" in adapter.sent[0]
 
 
 @pytest.mark.asyncio