fix: use per-thread persistent event loops in worker threads

Replace asyncio.run() with thread-local persistent event loops for worker threads (e.g., delegate_task's ThreadPoolExecutor). asyncio.run() creates and closes a fresh loop on every call, leaving cached httpx/AsyncOpenAI clients bound to a dead loop — causing 'Event loop is closed' errors during GC when parallel subagents clean up connections. The fix mirrors the main thread's _get_tool_loop() pattern but uses threading.local() so each worker thread gets its own long-lived loop, avoiding both cross-thread contention and the create-destroy lifecycle. Added 4 regression tests covering worker loop persistence, reuse, per-thread isolation, and separation from the main thread's loop.
2026-04-25 00:51:20 +00:00 · 2026-03-20 15:41:06 -04:00 · 2026-03-20 15:41:06 -04:00 · ab6abc2c13
commit ab6abc2c13
parent aafe86d81a
2 changed files with 130 additions and 7 deletions
--- a/model_tools.py
+++ b/model_tools.py
@ -39,6 +39,7 @@ logger = logging.getLogger(__name__)

 _tool_loop = None          # persistent loop for the main (CLI) thread
 _tool_loop_lock = threading.Lock()
+_worker_thread_local = threading.local()  # per-worker-thread persistent loops


 def _get_tool_loop():
@ -56,6 +57,28 @@ def _get_tool_loop():
        return _tool_loop


+def _get_worker_loop():
+    """Return a persistent event loop for the current worker thread.
+
+    Each worker thread (e.g., delegate_task's ThreadPoolExecutor threads)
+    gets its own long-lived loop stored in thread-local storage.  This
+    prevents the "Event loop is closed" errors that occurred when
+    asyncio.run() was used per-call: asyncio.run() creates a loop, runs
+    the coroutine, then *closes* the loop — but cached httpx/AsyncOpenAI
+    clients remain bound to that now-dead loop and raise RuntimeError
+    during garbage collection or subsequent use.
+
+    By keeping the loop alive for the thread's lifetime, cached clients
+    stay valid and their cleanup runs on a live loop.
+    """
+    loop = getattr(_worker_thread_local, 'loop', None)
+    if loop is None or loop.is_closed():
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        _worker_thread_local.loop = loop
+    return loop
+
+
 def _run_async(coro):
    """Run an async coroutine from a sync context.

@ -68,9 +91,10 @@ def _run_async(coro):
    loop so that cached async clients (httpx / AsyncOpenAI) remain bound
    to a live loop and don't trigger "Event loop is closed" on GC.

-    When called from a worker thread (parallel tool execution), we detect
-    that we're NOT on the main thread and use asyncio.run() with a fresh
-    loop to avoid contention on the shared persistent loop.
+    When called from a worker thread (parallel tool execution), we use a
+    per-thread persistent loop to avoid both contention with the main
+    thread's shared loop AND the "Event loop is closed" errors caused by
+    asyncio.run()'s create-and-destroy lifecycle.

    This is the single source of truth for sync->async bridging in tool
    handlers. The RL paths (agent_loop.py, tool_context.py) also provide
@ -89,11 +113,14 @@ def _run_async(coro):
            future = pool.submit(asyncio.run, coro)
            return future.result(timeout=300)

-    # If we're on a worker thread (e.g., parallel tool execution),
-    # use asyncio.run() with its own loop to avoid contending with the
-    # shared persistent loop from another parallel worker.
+    # If we're on a worker thread (e.g., parallel tool execution in
+    # delegate_task), use a per-thread persistent loop.  This avoids
+    # contention with the main thread's shared loop while keeping cached
+    # httpx/AsyncOpenAI clients bound to a live loop for the thread's
+    # lifetime — preventing "Event loop is closed" on GC cleanup.
    if threading.current_thread() is not threading.main_thread():
-        return asyncio.run(coro)
+        worker_loop = _get_worker_loop()
+        return worker_loop.run_until_complete(coro)

    tool_loop = _get_tool_loop()
    return tool_loop.run_until_complete(coro)