From d6c11a4575bc99ffdf2a75212398122fa4aff383 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 8 Jun 2026 14:40:39 -0700
Subject: [PATCH] test(run_agent): fix racy ordering in
 test_concurrent_handles_tool_error (#42356)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test keyed the 'which call raises' decision on a shared invocation
counter (first call → raise, second → success), then asserted the error
landed in messages[0] (c1) and success in messages[1] (c2). But
_execute_tool_calls_concurrent runs the two web_search calls on a thread
pool with no ordering guarantee — c2's handler can be invoked first, take
the 'first call raises' branch, and the error ends up in messages[1].
Results are ordered by tool_call_id, so messages[0] (c1) was then 'success'
and the assertion failed.

It passed in isolation but reliably failed under CI's full parallel slice
(8 xdist workers) where the scheduler actually interleaves the two handlers.

Fix: tie the raise to a specific tool call via its arguments (q=boom raises,
q=ok succeeds) instead of invocation order, and assert tool_call_id ↔ content
pairing explicitly. Deterministic regardless of thread scheduling — verified
10/10 in isolation and the full TestConcurrentToolExecution class (32) green.
---
 tests/run_agent/test_run_agent.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py
index 72363176d61..d215c7b193a 100644
--- a/tests/run_agent/test_run_agent.py
+++ b/tests/run_agent/test_run_agent.py
@@ -2402,15 +2402,20 @@ class TestConcurrentToolExecution:
 
     def test_concurrent_handles_tool_error(self, agent):
         """If one tool raises, others should still complete."""
-        tc1 = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
-        tc2 = _mock_tool_call(name="web_search", arguments='{}', call_id="c2")
+        # Distinguish the two calls by their arguments so the error is tied to
+        # a SPECIFIC tool call rather than invocation order. Concurrent
+        # execution gives no guarantee that c1's handler runs before c2's, so
+        # keying the raise on a call-order counter is racy: under thread-pool
+        # scheduling c2 could be invoked first, take the "first call raises"
+        # branch, and the error would land in messages[1] instead of
+        # messages[0]. Keying on args makes the assertion deterministic.
+        tc1 = _mock_tool_call(name="web_search", arguments='{"q": "boom"}', call_id="c1")
+        tc2 = _mock_tool_call(name="web_search", arguments='{"q": "ok"}', call_id="c2")
         mock_msg = _mock_assistant_msg(content="", tool_calls=[tc1, tc2])
         messages = []
 
-        call_count = [0]
         def fake_handle(name, args, task_id, **kwargs):
-            call_count[0] += 1
-            if call_count[0] == 1:
+            if args.get("q") == "boom":
                 raise RuntimeError("boom")
             return "success"
 
@@ -2418,9 +2423,11 @@ class TestConcurrentToolExecution:
             agent._execute_tool_calls_concurrent(mock_msg, messages, "task-1")
 
         assert len(messages) == 2
-        # First tool should have error
+        # Results are ordered by tool_call_id; c1 raised, c2 succeeded.
+        assert messages[0]["tool_call_id"] == "c1"
         assert "Error" in messages[0]["content"] or "boom" in messages[0]["content"]
         # Second tool should succeed
+        assert messages[1]["tool_call_id"] == "c2"
         assert "success" in messages[1]["content"]
 
     def test_concurrent_interrupt_before_start(self, agent):