From c8e4dcf412e65b58334ebf9a024e4e7444162828 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 10 Apr 2026 03:52:16 -0700
Subject: [PATCH] fix: prevent duplicate completion notifications on process
 kill (#7124)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When kill_process() sends SIGTERM, both it and the reader thread race
to call _move_to_finished() — kill_process sets exit_code=-15 and
enqueues a notification, then the reader thread's process.wait()
returns with exit_code=143 (128+SIGTERM) and enqueues a second one.

Fix: make _move_to_finished() idempotent by tracking whether the
session was actually removed from _running. The second call sees it
was already moved and skips the completion_queue.put().

Adds regression test: test_move_to_finished_idempotent_no_duplicate
---
 tests/tools/test_notify_on_complete.py | 20 ++++++++++++++++++++
 tools/process_registry.py              | 16 +++++++++++-----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/tests/tools/test_notify_on_complete.py b/tests/tools/test_notify_on_complete.py
index 8cf17bfbf6..ff6f14922f 100644
--- a/tests/tools/test_notify_on_complete.py
+++ b/tests/tools/test_notify_on_complete.py
@@ -120,6 +120,26 @@ class TestCompletionQueue:
         assert completion["exit_code"] == 1
         assert "FAILED" in completion["output"]
 
+    def test_move_to_finished_idempotent_no_duplicate(self, registry):
+        """Calling _move_to_finished twice must NOT enqueue two notifications.
+
+        Regression test: kill_process() and the reader thread can both call
+        _move_to_finished() for the same session, producing duplicate
+        [SYSTEM: Background process ...] messages.
+        """
+        s = _make_session(notify_on_complete=True, output="done", exit_code=-15)
+        s.exited = True
+        s.exit_code = -15
+        registry._running[s.id] = s
+        with patch.object(registry, "_write_checkpoint"):
+            registry._move_to_finished(s)  # first call — should enqueue
+            s.exit_code = 143  # reader thread updates exit code
+            registry._move_to_finished(s)  # second call — should be no-op
+
+        assert registry.completion_queue.qsize() == 1
+        completion = registry.completion_queue.get_nowait()
+        assert completion["exit_code"] == -15  # from the first (kill) call
+
     def test_output_truncated_to_2000(self, registry):
         """Long output is truncated to last 2000 chars."""
         long_output = "x" * 5000
diff --git a/tools/process_registry.py b/tools/process_registry.py
index 7f55ae6db6..39d3704b18 100644
--- a/tools/process_registry.py
+++ b/tools/process_registry.py
@@ -484,15 +484,21 @@ class ProcessRegistry:
         self._move_to_finished(session)
 
     def _move_to_finished(self, session: ProcessSession):
-        """Move a session from running to finished."""
+        """Move a session from running to finished.
+
+        Idempotent: if the session was already moved (e.g. kill_process raced
+        with the reader thread), the second call is a no-op — no duplicate
+        completion notification is enqueued.
+        """
         with self._lock:
-            self._running.pop(session.id, None)
+            was_running = self._running.pop(session.id, None) is not None
             self._finished[session.id] = session
         self._write_checkpoint()
 
-        # If the caller requested agent notification, enqueue the completion
-        # so the CLI/gateway can auto-trigger a new agent turn.
-        if session.notify_on_complete:
+        # Only enqueue completion notification on the FIRST move.  Without
+        # this guard, kill_process() and the reader thread can both call
+        # _move_to_finished(), producing duplicate [SYSTEM: ...] messages.
+        if was_running and session.notify_on_complete:
             from tools.ansi_strip import strip_ansi
             output_tail = strip_ansi(session.output_buffer[-2000:]) if session.output_buffer else ""
             self.completion_queue.put({