fix(async): close unscheduled coroutines in all threadsafe bridges (#26584)

Wraps every sync->async coroutine-scheduling site in the codebase with a
new agent.async_utils.safe_schedule_threadsafe() helper that closes the
coroutine on scheduling failure (closed loop, shutdown race, etc.)
instead of leaving it unawaited, which produced 'coroutine was never
awaited' RuntimeWarnings and reference leaks.

22 production call sites migrated across the codebase:
- acp_adapter/events.py, acp_adapter/permissions.py
- agent/lsp/manager.py
- cron/scheduler.py (media + text delivery paths)
- gateway/platforms/feishu.py (5 sites, via existing _submit_on_loop helper
  which now delegates to safe_schedule_threadsafe)
- gateway/run.py (10 sites: telegram rename, agent:step hook, status
  callback, interim+bg-review, clarify send, exec-approval button+text,
  temp-bubble cleanup, channel-directory refresh)
- plugins/memory/hindsight, plugins/platforms/google_chat
- tools/browser_supervisor.py (3), browser_cdp_tool.py,
  computer_use/cua_backend.py, slash_confirm.py
- tools/environments/modal.py (_AsyncWorker)
- tools/mcp_tool.py (2 + 8 _run_on_mcp_loop callers converted to
  factory-style so the coroutine is never constructed on a dead loop)
- tui_gateway/ws.py

Tests: new tests/agent/test_async_utils.py covers helper behavior under
live loop, dead loop, None loop, and scheduling exceptions. Regression
tests were added at three of the original PR's sites (acp events, acp
permissions, mcp loop runner), mirroring the contributor's intent.

Live-tested end-to-end:
- Helper stress test: 1500 schedules across live/dead/race scenarios,
  zero leaked coroutines
- Race exercised: 5000 schedules with loop killed mid-flight, 100 ok /
  4900 None returns, zero leaks
- hermes chat -q with terminal tool call (exercises step_callback bridge)
- MCP probe against failing subprocess servers + factory path
- Real gateway daemon boot + SIGINT shutdown across multiple platform
  adapter inits
- WSTransport 100 live + 50 dead-loop writes
- Cron delivery path live + dead loop

Salvages PR #2657 — adopts the contributor's intent, but applies it to a
much wider site list and through a single centralized helper instead of
inline try/except at each site. 3 of the original PR's 6 sites no longer
exist on main (environments/patches.py deleted, DingTalk refactored to
native async); the equivalent fix lives in tools/environments/modal.py
instead.

Co-authored-by: JithendraNara <jithendranaidunara@gmail.com>
This commit is contained in:
Teknium 2026-05-15 14:00:01 -07:00 committed by GitHub
parent 931caf2b2d
commit 4e89c53082
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 690 additions and 186 deletions

View file

@ -69,7 +69,8 @@ class TestProbeMcpServerTools:
patch("tools.mcp_tool._stop_mcp_loop"):
# Simulate running the async probe
def run_coro(coro, timeout=120):
def run_coro(coro_or_factory, timeout=120):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)
@ -110,7 +111,8 @@ class TestProbeMcpServerTools:
patch("tools.mcp_tool._run_on_mcp_loop") as mock_run, \
patch("tools.mcp_tool._stop_mcp_loop"):
def run_coro(coro, timeout=120):
def run_coro(coro_or_factory, timeout=120):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)
@ -144,7 +146,8 @@ class TestProbeMcpServerTools:
patch("tools.mcp_tool._run_on_mcp_loop") as mock_run, \
patch("tools.mcp_tool._stop_mcp_loop"):
def run_coro(coro, timeout=120):
def run_coro(coro_or_factory, timeout=120):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)
@ -198,7 +201,8 @@ class TestProbeMcpServerTools:
patch("tools.mcp_tool._run_on_mcp_loop") as mock_run, \
patch("tools.mcp_tool._stop_mcp_loop"):
def run_coro(coro, timeout=120):
def run_coro(coro_or_factory, timeout=120):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
loop = asyncio.new_event_loop()
try:
return loop.run_until_complete(coro)

View file

@ -31,7 +31,8 @@ class _FakeCallToolResult:
self.structuredContent = structuredContent
def _fake_run_on_mcp_loop(coro, timeout=30):
def _fake_run_on_mcp_loop(coro_or_factory, timeout=30):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
"""Run an MCP coroutine directly in a fresh event loop."""
loop = asyncio.new_event_loop()
try:

View file

@ -397,6 +397,77 @@ class TestCheckFunction:
_servers.pop("test_server", None)
# ---------------------------------------------------------------------------
# MCP loop runner
# ---------------------------------------------------------------------------
# Regression tests for _run_on_mcp_loop: a coroutine that cannot be scheduled
# must end up closed instead of being left unawaited.
class TestRunOnMcpLoop:
def test_scheduler_failure_closes_factory_coroutine(self):
"""If run_coroutine_threadsafe raises, the factory's coroutine is closed."""
import gc
import warnings
import tools.mcp_tool as mcp
# Holds the coroutine built by the factory so it can be inspected afterwards.
created = {"coro": None}
async def _sample():
return "ok"
def factory():
created["coro"] = _sample()
return created["coro"]
# Stand-in loop that reports itself as running; presumably this lets
# _run_on_mcp_loop get as far as the scheduling call — confirm against
# tools/mcp_tool.py.
fake_loop = MagicMock()
fake_loop.is_running.return_value = True
with patch.object(mcp, "_mcp_loop", fake_loop):
with warnings.catch_warnings(record=True) as caught:
warnings.simplefilter("always")
# Make the scheduling call itself fail.
with patch(
"agent.async_utils.asyncio.run_coroutine_threadsafe",
side_effect=RuntimeError("scheduler down"),
):
with pytest.raises(RuntimeError):
mcp._run_on_mcp_loop(factory)
# Force a collection so that an unclosed coroutine, if any, would emit its
# "was never awaited" warning now rather than at some later point.
gc.collect()
# The factory must have been invoked, and the coroutine it produced must no
# longer hold a frame (cr_frame is None once a coroutine has been closed).
assert created["coro"] is not None
assert created["coro"].cr_frame is None
# Only warnings about this test's _sample coroutine are considered; other
# RuntimeWarnings are ignored.
runtime_warnings = [
w for w in caught
if issubclass(w.category, RuntimeWarning)
and "was never awaited" in str(w.message)
and "_sample" in str(w.message)
]
assert runtime_warnings == []
def test_dead_loop_closes_passed_coroutine(self):
"""If loop is None, a passed coroutine (not factory) is closed."""
import gc
import warnings
import tools.mcp_tool as mcp
async def _sample():
return "ok"
# An already-constructed coroutine object is passed here, not a factory.
coro = _sample()
# With _mcp_loop patched to None the call is expected to raise a
# RuntimeError whose message matches "not running".
with patch.object(mcp, "_mcp_loop", None):
with warnings.catch_warnings(record=True) as caught:
warnings.simplefilter("always")
with pytest.raises(RuntimeError, match="not running"):
mcp._run_on_mcp_loop(coro)
# Force a collection so that an unclosed coroutine, if any, would emit its
# "was never awaited" warning now rather than at some later point.
gc.collect()
# A closed coroutine no longer holds a frame.
assert coro.cr_frame is None
# Only warnings about this test's _sample coroutine are considered; other
# RuntimeWarnings are ignored.
runtime_warnings = [
w for w in caught
if issubclass(w.category, RuntimeWarning)
and "was never awaited" in str(w.message)
and "_sample" in str(w.message)
]
assert runtime_warnings == []
# ---------------------------------------------------------------------------
# Tool handler
# ---------------------------------------------------------------------------
@ -406,7 +477,8 @@ class TestToolHandler:
def _patch_mcp_loop(self, coro_side_effect=None):
"""Return a patch for _run_on_mcp_loop that runs the coroutine directly."""
def fake_run(coro, timeout=30):
def fake_run(coro_or_factory, timeout=30):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
return asyncio.run(coro)
if coro_side_effect:
return patch("tools.mcp_tool._run_on_mcp_loop", side_effect=coro_side_effect)
@ -485,7 +557,8 @@ class TestToolHandler:
try:
handler = _make_tool_handler("test_srv", "greet", 120)
def _interrupting_run(coro, timeout=30):
def _interrupting_run(coro_or_factory, timeout=30):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
coro.close()
raise InterruptedError("User sent a new message")
with patch(
@ -1792,7 +1865,8 @@ class TestUtilityHandlers:
def _patch_mcp_loop(self):
"""Return a patch for _run_on_mcp_loop that runs the coroutine directly."""
def fake_run(coro, timeout=30):
def fake_run(coro_or_factory, timeout=30):
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
return asyncio.run(coro)
return patch("tools.mcp_tool._run_on_mcp_loop", side_effect=fake_run)