mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-04 02:21:47 +00:00
fix(gateway): drain pending messages via fresh task, not recursion (#17758)
`_process_message_background` finished a turn, found a queued follow-up, and drained it via `await self._process_message_background(pending_event, session_key)`. Each chained follow-up added a frame to the call stack instead of starting fresh. Under sustained pending-queue activity (e.g. a user sending follow-ups faster than the agent finishes turns) the C stack would exhaust at ~2000 nested frames and SIGSEGV the process. Mirror the late-arrival drain pattern that already exists in the same function: spawn a new `asyncio.create_task(...)` for the pending event and return so the current frame can unwind. The new task takes ownership via `_session_tasks[session_key]`. The late-arrival drain in `finally` could now race with the in-band drain across the `await typing_task` / `await stop_typing` window, so add a guard: if `_session_tasks[session_key]` is no longer the current task, an in-band drain already spawned a follow-up task — re-queue the late-arrival event so that task picks it up after its current event, instead of spawning a second concurrent task for the same session_key. Regression test (`test_pending_drain_no_recursion.py`) chains 12 follow-ups and asserts the recorded `_process_message_background` stack depth stays bounded at handler entry. Pre-fix: depths grow linearly `[1,2,3,…,12]`. Post-fix: all depths are `1`. `test_duplicate_reply_suppression::test_stale_response_suppressed_when_interrupted` called `_process_message_background` directly and implicitly relied on the old recursive `await` semantic — updated to wait for the spawned drain task before checking the sent list. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
cb130bf776
commit
663ba9a58f
3 changed files with 194 additions and 22 deletions
|
|
@@ -2708,9 +2708,27 @@ class BasePlatformAdapter(ABC):
|
|||
if _active is not None:
|
||||
_active.clear()
|
||||
await _stop_typing_task()
|
||||
# Process pending message in new background task
|
||||
await self._process_message_background(pending_event, session_key)
|
||||
return # Already cleaned up
|
||||
# Spawn a fresh task for the pending message instead of
|
||||
# recursing. Issue #17758: `await
|
||||
# self._process_message_background(...)` here grew the
|
||||
# call stack one frame per chained follow-up, and under
|
||||
# sustained pending-queue activity the C stack would
|
||||
# exhaust at ~2000 frames and SIGSEGV the process.
|
||||
# Mirror the late-arrival drain pattern below: hand off
|
||||
# to a new task and return so this frame can unwind.
|
||||
drain_task = asyncio.create_task(
|
||||
self._process_message_background(pending_event, session_key)
|
||||
)
|
||||
# Hand ownership of the session to the drain task so
|
||||
# stale-lock detection keeps working while it runs.
|
||||
self._session_tasks[session_key] = drain_task
|
||||
try:
|
||||
self._background_tasks.add(drain_task)
|
||||
drain_task.add_done_callback(self._background_tasks.discard)
|
||||
except TypeError:
|
||||
# Tests stub create_task() with non-hashable sentinels; tolerate.
|
||||
pass
|
||||
return # Drain task owns the session now.
|
||||
|
||||
except asyncio.CancelledError:
|
||||
current_task = asyncio.current_task()
|
||||
|
|
@@ -2772,25 +2790,41 @@ class BasePlatformAdapter(ABC):
|
|||
# dropped (user never gets a reply).
|
||||
late_pending = self._pending_messages.pop(session_key, None)
|
||||
if late_pending is not None:
|
||||
logger.debug(
|
||||
"[%s] Late-arrival pending message during cleanup — spawning drain task",
|
||||
self.name,
|
||||
)
|
||||
_active = self._active_sessions.get(session_key)
|
||||
if _active is not None:
|
||||
_active.clear()
|
||||
drain_task = asyncio.create_task(
|
||||
self._process_message_background(late_pending, session_key)
|
||||
)
|
||||
# Hand ownership of the session to the drain task so stale-lock
|
||||
# detection keeps working while it runs.
|
||||
self._session_tasks[session_key] = drain_task
|
||||
try:
|
||||
self._background_tasks.add(drain_task)
|
||||
drain_task.add_done_callback(self._background_tasks.discard)
|
||||
except TypeError:
|
||||
# Tests stub create_task() with non-hashable sentinels; tolerate.
|
||||
pass
|
||||
current_task = asyncio.current_task()
|
||||
existing_task = self._session_tasks.get(session_key)
|
||||
if (
|
||||
existing_task is not None
|
||||
and existing_task is not current_task
|
||||
):
|
||||
# The in-band drain (or an earlier late-arrival drain)
|
||||
# already spawned a follow-up task that owns this
|
||||
# session. Re-queue the late-arrival event so that
|
||||
# task picks it up — avoids spawning two concurrent
|
||||
# _process_message_background tasks for the same key
|
||||
# (#17758 follow-up: prevents the create_task path
|
||||
# from racing with itself across the in-band/finally
|
||||
# boundary).
|
||||
self._pending_messages[session_key] = late_pending
|
||||
else:
|
||||
logger.debug(
|
||||
"[%s] Late-arrival pending message during cleanup — spawning drain task",
|
||||
self.name,
|
||||
)
|
||||
_active = self._active_sessions.get(session_key)
|
||||
if _active is not None:
|
||||
_active.clear()
|
||||
drain_task = asyncio.create_task(
|
||||
self._process_message_background(late_pending, session_key)
|
||||
)
|
||||
# Hand ownership of the session to the drain task so stale-lock
|
||||
# detection keeps working while it runs.
|
||||
self._session_tasks[session_key] = drain_task
|
||||
try:
|
||||
self._background_tasks.add(drain_task)
|
||||
drain_task.add_done_callback(self._background_tasks.discard)
|
||||
except TypeError:
|
||||
# Tests stub create_task() with non-hashable sentinels; tolerate.
|
||||
pass
|
||||
# Leave _active_sessions[session_key] populated — the drain
|
||||
# task's own lifecycle will clean it up.
|
||||
else:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue