From d6c11a4575bc99ffdf2a75212398122fa4aff383 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Mon, 8 Jun 2026 14:40:39 -0700 Subject: [PATCH] test(run_agent): fix racy ordering in test_concurrent_handles_tool_error (#42356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test keyed the 'which call raises' decision on a shared invocation counter (first call → raise, second → success), then asserted the error landed in messages[0] (c1) and success in messages[1] (c2). But _execute_tool_calls_concurrent runs the two web_search calls on a thread pool with no ordering guarantee — c2's handler can be invoked first, take the 'first call raises' branch, and the error ends up in messages[1]. Results are ordered by tool_call_id, so messages[0] (c1) was then 'success' and the assertion failed. It passed in isolation but reliably failed under CI's full parallel slice (8 xdist workers) where the scheduler actually interleaves the two handlers. Fix: tie the raise to a specific tool call via its arguments (q=boom raises, q=ok succeeds) instead of invocation order, and assert tool_call_id ↔ content pairing explicitly. Deterministic regardless of thread scheduling — verified 10/10 in isolation and the full TestConcurrentToolExecution class (32) green. --- tests/run_agent/test_run_agent.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/run_agent/test_run_agent.py b/tests/run_agent/test_run_agent.py index 72363176d61..d215c7b193a 100644 --- a/tests/run_agent/test_run_agent.py +++ b/tests/run_agent/test_run_agent.py @@ -2402,15 +2402,20 @@ class TestConcurrentToolExecution: def test_concurrent_handles_tool_error(self, agent): """If one tool raises, others should still complete.""" - tc1 = _mock_tool_call(name="web_search", arguments='{}', call_id="c1") - tc2 = _mock_tool_call(name="web_search", arguments='{}', call_id="c2") + # Distinguish the two calls by their arguments so the error is tied to + # a SPECIFIC tool call rather than invocation order. Concurrent + # execution gives no guarantee that c1's handler runs before c2's, so + # keying the raise on a call-order counter is racy: under thread-pool + # scheduling c2 could be invoked first, take the "first call raises" + # branch, and the error would land in messages[1] instead of + # messages[0]. Keying on args makes the assertion deterministic. + tc1 = _mock_tool_call(name="web_search", arguments='{"q": "boom"}', call_id="c1") + tc2 = _mock_tool_call(name="web_search", arguments='{"q": "ok"}', call_id="c2") mock_msg = _mock_assistant_msg(content="", tool_calls=[tc1, tc2]) messages = [] - call_count = [0] def fake_handle(name, args, task_id, **kwargs): - call_count[0] += 1 - if call_count[0] == 1: + if args.get("q") == "boom": raise RuntimeError("boom") return "success" @@ -2418,9 +2423,11 @@ class TestConcurrentToolExecution: agent._execute_tool_calls_concurrent(mock_msg, messages, "task-1") assert len(messages) == 2 - # First tool should have error + # Results are ordered by tool_call_id; c1 raised, c2 succeeded. + assert messages[0]["tool_call_id"] == "c1" assert "Error" in messages[0]["content"] or "boom" in messages[0]["content"] # Second tool should succeed + assert messages[1]["tool_call_id"] == "c2" assert "success" in messages[1]["content"] def test_concurrent_interrupt_before_start(self, agent):