From c8e4dcf412e65b58334ebf9a024e4e7444162828 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 10 Apr 2026 03:52:16 -0700 Subject: [PATCH] fix: prevent duplicate completion notifications on process kill (#7124) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When kill_process() sends SIGTERM, both it and the reader thread race to call _move_to_finished() — kill_process sets exit_code=-15 and enqueues a notification, then the reader thread's process.wait() returns with exit_code=143 (128+SIGTERM) and enqueues a second one. Fix: make _move_to_finished() idempotent by tracking whether the session was actually removed from _running. The second call sees it was already moved and skips the completion_queue.put(). Adds regression test: test_move_to_finished_idempotent_no_duplicate --- tests/tools/test_notify_on_complete.py | 20 ++++++++++++++++++++ tools/process_registry.py | 16 +++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/tests/tools/test_notify_on_complete.py b/tests/tools/test_notify_on_complete.py index 8cf17bfbf6..ff6f14922f 100644 --- a/tests/tools/test_notify_on_complete.py +++ b/tests/tools/test_notify_on_complete.py @@ -120,6 +120,26 @@ class TestCompletionQueue: assert completion["exit_code"] == 1 assert "FAILED" in completion["output"] + def test_move_to_finished_idempotent_no_duplicate(self, registry): + """Calling _move_to_finished twice must NOT enqueue two notifications. + + Regression test: kill_process() and the reader thread can both call + _move_to_finished() for the same session, producing duplicate + [SYSTEM: Background process ...] messages. + """ + s = _make_session(notify_on_complete=True, output="done", exit_code=-15) + s.exited = True + s.exit_code = -15 + registry._running[s.id] = s + with patch.object(registry, "_write_checkpoint"): + registry._move_to_finished(s) # first call — should enqueue + s.exit_code = 143 # reader thread updates exit code + registry._move_to_finished(s) # second call — should be no-op + + assert registry.completion_queue.qsize() == 1 + completion = registry.completion_queue.get_nowait() + assert completion["exit_code"] == -15 # from the first (kill) call + def test_output_truncated_to_2000(self, registry): """Long output is truncated to last 2000 chars.""" long_output = "x" * 5000 diff --git a/tools/process_registry.py b/tools/process_registry.py index 7f55ae6db6..39d3704b18 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -484,15 +484,21 @@ class ProcessRegistry: self._move_to_finished(session) def _move_to_finished(self, session: ProcessSession): - """Move a session from running to finished.""" + """Move a session from running to finished. + + Idempotent: if the session was already moved (e.g. kill_process raced + with the reader thread), the second call is a no-op — no duplicate + completion notification is enqueued. + """ with self._lock: - self._running.pop(session.id, None) + was_running = self._running.pop(session.id, None) is not None self._finished[session.id] = session self._write_checkpoint() - # If the caller requested agent notification, enqueue the completion - # so the CLI/gateway can auto-trigger a new agent turn. - if session.notify_on_complete: + # Only enqueue completion notification on the FIRST move. Without + # this guard, kill_process() and the reader thread can both call + # _move_to_finished(), producing duplicate [SYSTEM: ...] messages. + if was_running and session.notify_on_complete: from tools.ansi_strip import strip_ansi output_tail = strip_ansi(session.output_buffer[-2000:]) if session.output_buffer else "" self.completion_queue.put({