fix: prevent duplicate completion notifications on process kill (#7124)

When kill_process() sends SIGTERM, kill_process() and the reader thread race to call _move_to_finished(): kill_process() sets exit_code=-15 and enqueues a notification, then the reader thread's process.wait() returns with exit_code=143 (128 + SIGTERM) and enqueues a second one. Fix: make _move_to_finished() idempotent by tracking whether the session was actually removed from _running; a second call sees that the session was already moved and skips the completion_queue.put(). Adds regression test test_move_to_finished_idempotent_no_duplicate.
2026-04-25 00:51:20 +00:00 · 2026-04-10 03:52:16 -07:00 · 2026-04-10 03:52:16 -07:00 · c8e4dcf412
commit c8e4dcf412
parent 00dd5cc491
2 changed files with 31 additions and 5 deletions
--- a/tests/tools/test_notify_on_complete.py
+++ b/tests/tools/test_notify_on_complete.py
@ -120,6 +120,26 @@ class TestCompletionQueue:
        assert completion["exit_code"] == 1
        assert "FAILED" in completion["output"]

+    def test_move_to_finished_idempotent_no_duplicate(self, registry):
+        """Calling _move_to_finished twice must NOT enqueue two notifications.
+
+        Regression test: kill_process() and the reader thread can both call
+        _move_to_finished() for the same session, producing duplicate
+        [SYSTEM: Background process ...] messages. Only the first call counts.
+        """
+        s = _make_session(notify_on_complete=True, output="done", exit_code=-15)
+        s.exited = True
+        s.exit_code = -15  # the value kill_process() records after SIGTERM
+        registry._running[s.id] = s  # session must start out in _running
+        with patch.object(registry, "_write_checkpoint"):  # mocked for both calls
+            registry._move_to_finished(s)  # first call — should enqueue
+            s.exit_code = 143  # reader thread updates exit code (128 + SIGTERM)
+            registry._move_to_finished(s)  # second call — should be no-op
+
+        assert registry.completion_queue.qsize() == 1  # exactly one notification
+        completion = registry.completion_queue.get_nowait()
+        assert completion["exit_code"] == -15  # from the first (kill) call
+
    def test_output_truncated_to_2000(self, registry):
        """Long output is truncated to last 2000 chars."""
        long_output = "x" * 5000