mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(async): close unscheduled coroutines in all threadsafe bridges (#26584)
Wraps every sync->async coroutine-scheduling site in the codebase with a new agent.async_utils.safe_schedule_threadsafe() helper. When scheduling fails (closed loop, shutdown race, etc.), the helper closes the coroutine instead of leaking it as 'coroutine was never awaited' RuntimeWarnings plus reference leaks.

22 production call sites migrated across the codebase:
- acp_adapter/events.py, acp_adapter/permissions.py
- agent/lsp/manager.py
- cron/scheduler.py (media + text delivery paths)
- gateway/platforms/feishu.py (5 sites, via the existing _submit_on_loop helper, which now delegates to safe_schedule_threadsafe)
- gateway/run.py (10 sites: telegram rename, agent:step hook, status callback, interim+bg-review, clarify send, exec-approval button+text, temp-bubble cleanup, channel-directory refresh)
- plugins/memory/hindsight, plugins/platforms/google_chat
- tools/browser_supervisor.py (3), browser_cdp_tool.py, computer_use/cua_backend.py, slash_confirm.py
- tools/environments/modal.py (_AsyncWorker)
- tools/mcp_tool.py (2 + 8 _run_on_mcp_loop callers converted to factory-style so the coroutine is never constructed on a dead loop)
- tui_gateway/ws.py

Tests: the new tests/agent/test_async_utils.py covers helper behavior under a live loop, a dead loop, a None loop, and scheduling exceptions. Regression tests were added at three PR-original sites (acp events, acp permissions, mcp loop runner), mirroring the contributor's intent.
Live-tested end-to-end:
- Helper stress test: 1500 schedules across live/dead/race scenarios, zero leaked coroutines
- Race exercised: 5000 schedules with the loop killed mid-flight, 100 ok / 4900 None returns, zero leaks
- hermes chat -q with a terminal tool call (exercises the step_callback bridge)
- MCP probe against failing subprocess servers + factory path
- Real gateway daemon boot + SIGINT shutdown across multiple platform adapter inits
- WSTransport: 100 live + 50 dead-loop writes
- Cron delivery path: live + dead loop

Salvages PR #2657 — adopts the contributor's intent over a much wider site list, using a single centralized helper instead of an inline try/except at each site. 3 of the original PR's 6 sites no longer exist on main (environments/patches.py deleted, DingTalk refactored to native async); the equivalent fix lives in tools/environments/modal.py instead.

Co-authored-by: JithendraNara <jithendranaidunara@gmail.com>
This commit is contained in:
parent
931caf2b2d
commit
4e89c53082
23 changed files with 690 additions and 186 deletions
|
|
@ -274,7 +274,13 @@ def _browser_cdp_via_supervisor(
|
|||
)
|
||||
|
||||
try:
|
||||
fut = _asyncio.run_coroutine_threadsafe(_do_cdp(), loop)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
fut = safe_schedule_threadsafe(_do_cdp(), loop)
|
||||
if fut is None:
|
||||
return tool_error(
|
||||
"CDP call via supervisor failed: loop unavailable",
|
||||
cdp_docs=CDP_DOCS_URL,
|
||||
)
|
||||
result_msg = fut.result(timeout=timeout + 2)
|
||||
except Exception as exc:
|
||||
return tool_error(
|
||||
|
|
|
|||
|
|
@ -368,11 +368,13 @@ class CDPSupervisor:
|
|||
pass
|
||||
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(_close_ws(), loop)
|
||||
try:
|
||||
fut.result(timeout=2.0)
|
||||
except Exception:
|
||||
pass
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
fut = safe_schedule_threadsafe(_close_ws(), loop)
|
||||
if fut is not None:
|
||||
try:
|
||||
fut.result(timeout=2.0)
|
||||
except Exception:
|
||||
pass
|
||||
except RuntimeError:
|
||||
pass # loop already shutting down
|
||||
if self._thread is not None:
|
||||
|
|
@ -451,7 +453,10 @@ class CDPSupervisor:
|
|||
)
|
||||
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(_do_respond(), loop)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
fut = safe_schedule_threadsafe(_do_respond(), loop)
|
||||
if fut is None:
|
||||
return {"ok": False, "error": "Browser supervisor loop unavailable"}
|
||||
fut.result(timeout=timeout)
|
||||
except Exception as e:
|
||||
return {"ok": False, "error": f"{type(e).__name__}: {e}"}
|
||||
|
|
@ -507,7 +512,10 @@ class CDPSupervisor:
|
|||
)
|
||||
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(_do_eval(), loop)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
fut = safe_schedule_threadsafe(_do_eval(), loop)
|
||||
if fut is None:
|
||||
return {"ok": False, "error": "Browser supervisor loop unavailable"}
|
||||
response = fut.result(timeout=timeout + 1)
|
||||
except Exception as exc:
|
||||
return {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
|
||||
|
|
|
|||
|
|
@ -183,9 +183,14 @@ class _AsyncBridge:
|
|||
raise RuntimeError("cua-driver asyncio bridge failed to start")
|
||||
|
||||
def run(self, coro, timeout: Optional[float] = 30.0) -> Any:
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
if not self._loop or not self._thread or not self._thread.is_alive():
|
||||
if asyncio.iscoroutine(coro):
|
||||
coro.close()
|
||||
raise RuntimeError("cua-driver bridge not started")
|
||||
fut = safe_schedule_threadsafe(coro, self._loop)
|
||||
if fut is None:
|
||||
raise RuntimeError("cua-driver bridge not started")
|
||||
fut: Future = asyncio.run_coroutine_threadsafe(coro, self._loop)
|
||||
return fut.result(timeout=timeout)
|
||||
|
||||
def stop(self) -> None:
|
||||
|
|
|
|||
|
|
@ -144,9 +144,14 @@ class _AsyncWorker:
|
|||
self._loop.run_forever()
|
||||
|
||||
def run_coroutine(self, coro, timeout=600):
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
if self._loop is None or self._loop.is_closed():
|
||||
if asyncio.iscoroutine(coro):
|
||||
coro.close()
|
||||
raise RuntimeError("AsyncWorker loop is not running")
|
||||
future = safe_schedule_threadsafe(coro, self._loop)
|
||||
if future is None:
|
||||
raise RuntimeError("AsyncWorker loop is not running")
|
||||
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
|
||||
return future.result(timeout=timeout)
|
||||
|
||||
def stop(self):
|
||||
|
|
|
|||
|
|
@ -1781,7 +1781,7 @@ def _handle_auth_error_and_retry(
|
|||
return await manager.handle_401(server_name, None)
|
||||
|
||||
try:
|
||||
recovered = _run_on_mcp_loop(_recover(), timeout=10)
|
||||
recovered = _run_on_mcp_loop(_recover, timeout=10)
|
||||
except Exception as rec_exc:
|
||||
logger.warning(
|
||||
"MCP OAuth '%s': recovery attempt failed: %s",
|
||||
|
|
@ -2054,19 +2054,35 @@ def _ensure_mcp_loop():
|
|||
_mcp_thread.start()
|
||||
|
||||
|
||||
def _run_on_mcp_loop(coro, timeout: float = 30):
|
||||
def _run_on_mcp_loop(coro_or_factory, timeout: float = 30):
|
||||
"""Schedule a coroutine on the MCP event loop and block until done.
|
||||
|
||||
Accepts either a coroutine object or a zero-arg callable that returns one.
|
||||
Callers can pass a factory to avoid constructing coroutine objects when
|
||||
the MCP loop is unavailable (which would otherwise leak the coroutine
|
||||
frame and emit ``"coroutine was never awaited"`` warnings).
|
||||
|
||||
Poll in short intervals so the calling agent thread can honor user
|
||||
interrupts while the MCP work is still running on the background loop.
|
||||
"""
|
||||
from tools.interrupt import is_interrupted
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
|
||||
with _lock:
|
||||
loop = _mcp_loop
|
||||
if loop is None or not loop.is_running():
|
||||
if asyncio.iscoroutine(coro_or_factory):
|
||||
coro_or_factory.close()
|
||||
raise RuntimeError("MCP event loop is not running")
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
|
||||
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
|
||||
future = safe_schedule_threadsafe(
|
||||
coro, loop,
|
||||
logger=logger,
|
||||
log_message="MCP scheduling failed",
|
||||
)
|
||||
if future is None:
|
||||
raise RuntimeError("MCP event loop unavailable (failed to schedule)")
|
||||
start_time = time.monotonic()
|
||||
deadline = None if timeout is None else start_time + timeout
|
||||
|
||||
|
|
@ -2263,7 +2279,7 @@ def _make_tool_handler(server_name: str, tool_name: str, tool_timeout: float):
|
|||
return json.dumps({"result": text_result}, ensure_ascii=False)
|
||||
|
||||
def _call_once():
|
||||
return _run_on_mcp_loop(_call(), timeout=tool_timeout)
|
||||
return _run_on_mcp_loop(_call, timeout=tool_timeout)
|
||||
|
||||
try:
|
||||
result = _call_once()
|
||||
|
|
@ -2343,7 +2359,7 @@ def _make_list_resources_handler(server_name: str, tool_timeout: float):
|
|||
return json.dumps({"resources": resources}, ensure_ascii=False)
|
||||
|
||||
def _call_once():
|
||||
return _run_on_mcp_loop(_call(), timeout=tool_timeout)
|
||||
return _run_on_mcp_loop(_call, timeout=tool_timeout)
|
||||
|
||||
try:
|
||||
return _call_once()
|
||||
|
|
@ -2403,7 +2419,7 @@ def _make_read_resource_handler(server_name: str, tool_timeout: float):
|
|||
return json.dumps({"result": "\n".join(parts) if parts else ""}, ensure_ascii=False)
|
||||
|
||||
def _call_once():
|
||||
return _run_on_mcp_loop(_call(), timeout=tool_timeout)
|
||||
return _run_on_mcp_loop(_call, timeout=tool_timeout)
|
||||
|
||||
try:
|
||||
return _call_once()
|
||||
|
|
@ -2466,7 +2482,7 @@ def _make_list_prompts_handler(server_name: str, tool_timeout: float):
|
|||
return json.dumps({"prompts": prompts}, ensure_ascii=False)
|
||||
|
||||
def _call_once():
|
||||
return _run_on_mcp_loop(_call(), timeout=tool_timeout)
|
||||
return _run_on_mcp_loop(_call, timeout=tool_timeout)
|
||||
|
||||
try:
|
||||
return _call_once()
|
||||
|
|
@ -2537,7 +2553,7 @@ def _make_get_prompt_handler(server_name: str, tool_timeout: float):
|
|||
return json.dumps(resp, ensure_ascii=False)
|
||||
|
||||
def _call_once():
|
||||
return _run_on_mcp_loop(_call(), timeout=tool_timeout)
|
||||
return _run_on_mcp_loop(_call, timeout=tool_timeout)
|
||||
|
||||
try:
|
||||
return _call_once()
|
||||
|
|
@ -3121,7 +3137,7 @@ def register_mcp_servers(servers: Dict[str, dict]) -> List[str]:
|
|||
if _was_interrupted:
|
||||
_set_interrupt(False)
|
||||
try:
|
||||
_run_on_mcp_loop(_discover_all(), timeout=120)
|
||||
_run_on_mcp_loop(_discover_all, timeout=120)
|
||||
finally:
|
||||
if _was_interrupted:
|
||||
_set_interrupt(True)
|
||||
|
|
@ -3289,7 +3305,7 @@ def probe_mcp_server_tools() -> Dict[str, List[tuple]]:
|
|||
)
|
||||
|
||||
try:
|
||||
_run_on_mcp_loop(_probe_all(), timeout=120)
|
||||
_run_on_mcp_loop(_probe_all, timeout=120)
|
||||
except Exception as exc:
|
||||
logger.debug("MCP probe failed: %s", exc)
|
||||
finally:
|
||||
|
|
@ -3329,11 +3345,17 @@ def shutdown_mcp_servers():
|
|||
with _lock:
|
||||
loop = _mcp_loop
|
||||
if loop is not None and loop.is_running():
|
||||
try:
|
||||
future = asyncio.run_coroutine_threadsafe(_shutdown(), loop)
|
||||
future.result(timeout=15)
|
||||
except Exception as exc:
|
||||
logger.debug("Error during MCP shutdown: %s", exc)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
future = safe_schedule_threadsafe(
|
||||
_shutdown(), loop,
|
||||
logger=logger,
|
||||
log_message="MCP shutdown: failed to schedule",
|
||||
)
|
||||
if future is not None:
|
||||
try:
|
||||
future.result(timeout=15)
|
||||
except Exception as exc:
|
||||
logger.debug("Error during MCP shutdown: %s", exc)
|
||||
|
||||
_stop_mcp_loop()
|
||||
|
||||
|
|
|
|||
|
|
@ -153,9 +153,14 @@ def resolve_sync_compat(
|
|||
Prefer the async ``resolve()`` from an async context.
|
||||
"""
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
fut = safe_schedule_threadsafe(
|
||||
resolve(session_key, confirm_id, choice), loop,
|
||||
logger=logger,
|
||||
log_message="resolve_sync_compat scheduling failed",
|
||||
)
|
||||
if fut is None:
|
||||
return None
|
||||
return fut.result(timeout=30)
|
||||
except Exception as exc:
|
||||
logger.error("resolve_sync_compat failed: %s", exc)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue