mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-19 04:52:06 +00:00
fix(async): close unscheduled coroutines in all threadsafe bridges (#26584)
Wraps every sync->async coroutine-scheduling site in the codebase with a new agent.async_utils.safe_schedule_threadsafe() helper that closes the coroutine on scheduling failure (closed loop, shutdown race, etc.) instead of leaking it as 'coroutine was never awaited' RuntimeWarnings plus reference leaks. 22 production call sites migrated across the codebase: - acp_adapter/events.py, acp_adapter/permissions.py - agent/lsp/manager.py - cron/scheduler.py (media + text delivery paths) - gateway/platforms/feishu.py (5 sites, via existing _submit_on_loop helper which now delegates to safe_schedule_threadsafe) - gateway/run.py (10 sites: telegram rename, agent:step hook, status callback, interim+bg-review, clarify send, exec-approval button+text, temp-bubble cleanup, channel-directory refresh) - plugins/memory/hindsight, plugins/platforms/google_chat - tools/browser_supervisor.py (3), browser_cdp_tool.py, computer_use/cua_backend.py, slash_confirm.py - tools/environments/modal.py (_AsyncWorker) - tools/mcp_tool.py (2 + 8 _run_on_mcp_loop callers converted to factory-style so the coroutine is never constructed on a dead loop) - tui_gateway/ws.py Tests: new tests/agent/test_async_utils.py covers helper behavior under live loop, dead loop, None loop, and scheduling exceptions. Regression tests added at three PR-original sites (acp events, acp permissions, mcp loop runner) mirroring contributor's intent. Live-tested end-to-end: - Helper stress test: 1500 schedules across live/dead/race scenarios, zero leaked coroutines - Race exercised: 5000 schedules with loop killed mid-flight, 100 ok / 4900 None returns, zero leaks - hermes chat -q with terminal tool call (exercises step_callback bridge) - MCP probe against failing subprocess servers + factory path - Real gateway daemon boot + SIGINT shutdown across multiple platform adapter inits - WSTransport 100 live + 50 dead-loop writes - Cron delivery path live + dead loop Salvages PR #2657 — adopts contributor's intent over a much wider site list and a single centralized helper instead of inline try/except at each site. 3 of the original PR's 6 sites no longer exist on main (environments/patches.py deleted, DingTalk refactored to native async); the equivalent fix lives in tools/environments/modal.py instead. Co-authored-by: JithendraNara <jithendranaidunara@gmail.com>
This commit is contained in:
parent
931caf2b2d
commit
4e89c53082
23 changed files with 690 additions and 186 deletions
|
|
@ -2273,11 +2273,7 @@ class FeishuAdapter(BasePlatformAdapter):
|
|||
daemon=True,
|
||||
).start()
|
||||
return
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._handle_message_event_data(data),
|
||||
loop,
|
||||
)
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
self._submit_on_loop(loop, self._handle_message_event_data(data))
|
||||
|
||||
def _enqueue_pending_inbound_event(self, data: Any) -> bool:
|
||||
"""Append an event to the pending-inbound queue.
|
||||
|
|
@ -2353,16 +2349,12 @@ class FeishuAdapter(BasePlatformAdapter):
|
|||
dispatched = 0
|
||||
requeue: List[Any] = []
|
||||
for event in batch:
|
||||
try:
|
||||
fut = asyncio.run_coroutine_threadsafe(
|
||||
self._handle_message_event_data(event),
|
||||
loop,
|
||||
)
|
||||
fut.add_done_callback(self._log_background_failure)
|
||||
if self._submit_on_loop(
|
||||
loop, self._handle_message_event_data(event)
|
||||
):
|
||||
dispatched += 1
|
||||
except RuntimeError:
|
||||
# Loop closed between check and submit — requeue
|
||||
# and poll again.
|
||||
else:
|
||||
# Loop closed/unavailable — requeue and poll again.
|
||||
requeue.append(event)
|
||||
if requeue:
|
||||
with self._pending_inbound_lock:
|
||||
|
|
@ -2466,11 +2458,10 @@ class FeishuAdapter(BasePlatformAdapter):
|
|||
if not self._loop_accepts_callbacks(loop):
|
||||
logger.warning("[Feishu] Dropping drive comment event before adapter loop is ready")
|
||||
return
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
|
||||
self._submit_on_loop(
|
||||
loop,
|
||||
handle_drive_comment_event(self._client, data, self_open_id=self._bot_open_id),
|
||||
)
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
|
||||
def _on_reaction_event(self, event_type: str, data: Any) -> None:
|
||||
"""Route user reactions on bot messages as synthetic text events."""
|
||||
|
|
@ -2498,11 +2489,7 @@ class FeishuAdapter(BasePlatformAdapter):
|
|||
or bool(getattr(loop, "is_closed", lambda: False)())
|
||||
):
|
||||
return
|
||||
future = asyncio.run_coroutine_threadsafe(
|
||||
self._handle_reaction_event(event_type, data),
|
||||
loop,
|
||||
)
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
self._submit_on_loop(loop, self._handle_reaction_event(event_type, data))
|
||||
|
||||
def _on_card_action_trigger(self, data: Any) -> Any:
|
||||
"""Handle card-action callback from the Feishu SDK (synchronous).
|
||||
|
|
@ -2548,11 +2535,14 @@ class FeishuAdapter(BasePlatformAdapter):
|
|||
|
||||
def _submit_on_loop(self, loop: Any, coro: Any) -> bool:
|
||||
"""Schedule background work on the adapter loop with shared failure logging."""
|
||||
try:
|
||||
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
||||
except Exception:
|
||||
coro.close()
|
||||
logger.warning("[Feishu] Failed to schedule background callback work", exc_info=True)
|
||||
from agent.async_utils import safe_schedule_threadsafe
|
||||
future = safe_schedule_threadsafe(
|
||||
coro, loop,
|
||||
logger=logger,
|
||||
log_message="[Feishu] Failed to schedule background callback work",
|
||||
log_level=logging.WARNING,
|
||||
)
|
||||
if future is None:
|
||||
return False
|
||||
future.add_done_callback(self._log_background_failure)
|
||||
return True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue