test(run_agent): fix racy ordering in test_concurrent_handles_tool_error (#42356)

The test keyed the 'which call raises' decision on a shared invocation counter (first call → raise, second → success), then asserted the error landed in messages[0] (c1) and success in messages[1] (c2). But _execute_tool_calls_concurrent runs the two web_search calls on a thread pool with no ordering guarantee — c2's handler can be invoked first, take the 'first call raises' branch, and the error ends up in messages[1]. Results are ordered by tool_call_id, so messages[0] (c1) was then 'success' and the assertion failed. It passed in isolation but reliably failed under CI's full parallel slice (8 xdist workers) where the scheduler actually interleaves the two handlers. Fix: tie the raise to a specific tool call via its arguments (q=boom raises, q=ok succeeds) instead of invocation order, and assert tool_call_id ↔ content pairing explicitly. Deterministic regardless of thread scheduling — verified 10/10 in isolation and the full TestConcurrentToolExecution class (32) green.
2026-07-24 16:54:43 +00:00 · 2026-06-08 14:40:39 -07:00 · 2026-06-08 14:40:39 -07:00 · d6c11a4575
commit d6c11a4575
parent 3f1758d2e4
1 changed files with 13 additions and 6 deletions
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@ -2402,15 +2402,20 @@ class TestConcurrentToolExecution:

    def test_concurrent_handles_tool_error(self, agent):
        """If one tool raises, others should still complete."""
-        tc1 = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
-        tc2 = _mock_tool_call(name="web_search", arguments='{}', call_id="c2")
+        # Distinguish the two calls by their arguments so the error is tied to
+        # a SPECIFIC tool call rather than invocation order. Concurrent
+        # execution gives no guarantee that c1's handler runs before c2's, so
+        # keying the raise on a call-order counter is racy: under thread-pool
+        # scheduling c2 could be invoked first, take the "first call raises"
+        # branch, and the error would land in messages[1] instead of
+        # messages[0]. Keying on args makes the assertion deterministic.
+        tc1 = _mock_tool_call(name="web_search", arguments='{"q": "boom"}', call_id="c1")
+        tc2 = _mock_tool_call(name="web_search", arguments='{"q": "ok"}', call_id="c2")
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc1, tc2])
        messages = []

-        call_count = [0]
        def fake_handle(name, args, task_id, **kwargs):
-            call_count[0] += 1
-            if call_count[0] == 1:
+            if args.get("q") == "boom":
                raise RuntimeError("boom")
            return "success"

@ -2418,9 +2423,11 @@ class TestConcurrentToolExecution:
            agent._execute_tool_calls_concurrent(mock_msg, messages, "task-1")

        assert len(messages) == 2
-        # First tool should have error
+        # Results are ordered by tool_call_id; c1 raised, c2 succeeded.
+        assert messages[0]["tool_call_id"] == "c1"
        assert "Error" in messages[0]["content"] or "boom" in messages[0]["content"]
        # Second tool should succeed
+        assert messages[1]["tool_call_id"] == "c2"
        assert "success" in messages[1]["content"]

    def test_concurrent_interrupt_before_start(self, agent):