mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(gateway): hold _run_restart on _restart_task + explicit cancel-loop skip
Follow-up on the cherry-picked #13173 fix. Holds the _run_restart task in self._restart_task (a bare asyncio.create_task keeps only a weak reference, so a still-pending task can be GC'd mid-flight) and explicitly skips it in the _stop_impl cancel loop alongside _stop_task. Adds AUTHOR_MAP entry for the contributor and a regression test that fails when the task is cancellable. Refs #12875
This commit is contained in:
parent
1ce5d6d974
commit
f2ca3e3d84
3 changed files with 67 additions and 14 deletions
|
|
@ -2559,6 +2559,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
_restart_via_service: bool = False
|
||||
_restart_command_source: Optional[SessionSource] = None
|
||||
_stop_task: Optional[asyncio.Task] = None
|
||||
_restart_task: Optional[asyncio.Task] = None
|
||||
_session_model_overrides: Dict[str, Dict[str, str]] = {}
|
||||
_session_reasoning_overrides: Dict[str, Dict[str, Any]] = {}
|
||||
_startup_restore_in_progress: bool = False
|
||||
|
|
@ -2639,6 +2640,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
self._restart_via_service = False
|
||||
self._restart_command_source: Optional[SessionSource] = None
|
||||
self._stop_task: Optional[asyncio.Task] = None
|
||||
self._restart_task: Optional[asyncio.Task] = None
|
||||
|
||||
# Track running agents per session for interrupt support
|
||||
# Key: session_key, Value: AIAgent instance
|
||||
|
|
@ -5526,7 +5528,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
# while it's awaiting _stop_task, propagating CancelledError into
|
||||
# _stop_impl and preventing _shutdown_event.set() / _exit_code = 75.
|
||||
# See #12875.
|
||||
asyncio.create_task(_run_restart())
|
||||
#
|
||||
# We still hold a strong reference in self._restart_task: a bare
|
||||
# asyncio.create_task() keeps only a weak reference, so the event
|
||||
# loop may garbage-collect a still-pending task mid-flight. The
|
||||
# cancel loop in _stop_impl explicitly skips _restart_task for the
|
||||
# same reason it skips _stop_task.
|
||||
self._restart_task = asyncio.create_task(_run_restart())
|
||||
return True
|
||||
|
||||
# Drain-timeout reasons set by _stop_impl() when a still-running turn is
|
||||
|
|
@ -7234,6 +7242,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
for _task in list(self._background_tasks):
|
||||
if _task is self._stop_task:
|
||||
continue
|
||||
if _task is self._restart_task:
|
||||
# The restart orchestration task is awaiting _stop_task
|
||||
# right now; cancelling it would propagate CancelledError
|
||||
# into this _stop_impl and skip _shutdown_event.set() /
|
||||
# _exit_code = 75 (#12875). It self-terminates anyway.
|
||||
continue
|
||||
_task.cancel()
|
||||
self._background_tasks.clear()
|
||||
|
||||
|
|
|
|||
|
|
@ -153,6 +153,7 @@ AUTHOR_MAP = {
|
|||
"290859878+synapsesx@users.noreply.github.com": "synapsesx",
|
||||
"157689911+itsflownium@users.noreply.github.com": "itsflownium",
|
||||
"dirtyren@users.noreply.github.com": "dirtyren",
|
||||
"--email": "andryypaez@gmail.com",
|
||||
"mucio@mucio.net": "francescomucio",
|
||||
"291572938+thestral123@users.noreply.github.com": "thestral123",
|
||||
"tkwong@inspiresynergy.com": "tkwong",
|
||||
|
|
|
|||
|
|
@ -181,26 +181,64 @@ async def test_request_restart_is_idempotent():
|
|||
runner, _adapter = make_restart_runner()
|
||||
runner.stop = AsyncMock()
|
||||
|
||||
# Patch create_task to capture the restart task (it's no longer in
|
||||
# _background_tasks — see #12875).
|
||||
_captured = []
|
||||
_orig_create_task = asyncio.create_task
|
||||
def _capture(coro, **kw):
|
||||
t = _orig_create_task(coro, **kw)
|
||||
_captured.append(t)
|
||||
return t
|
||||
with pytest.MonkeyPatch.context() as mp:
|
||||
mp.setattr(asyncio, "create_task", _capture)
|
||||
assert runner.request_restart(detached=True, via_service=False) is True
|
||||
assert runner.request_restart(detached=True, via_service=False) is False
|
||||
# _run_restart is held on self._restart_task and is intentionally NOT in
|
||||
# _background_tasks, so _stop_impl's cancel loop can't abort it mid-await
|
||||
# (see #12875).
|
||||
assert runner.request_restart(detached=True, via_service=False) is True
|
||||
assert runner._restart_task is not None
|
||||
assert runner._restart_task not in runner._background_tasks
|
||||
assert runner.request_restart(detached=True, via_service=False) is False
|
||||
|
||||
await _captured[0]
|
||||
await runner._restart_task
|
||||
|
||||
runner.stop.assert_awaited_once_with(
|
||||
restart=True, detached_restart=True, service_restart=False
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_run_restart_excluded_from_stop_cancel_loop():
|
||||
"""Regression for #12875: _run_restart is held on self._restart_task and
|
||||
kept OUT of _background_tasks, and the _stop_impl cancel loop explicitly
|
||||
skips it. If it were in _background_tasks, the cancel loop (which fires
|
||||
while _run_restart is awaiting _stop_task) would propagate CancelledError
|
||||
into _stop_impl and skip _shutdown_event.set() / _exit_code = 75."""
|
||||
runner, _adapter = make_restart_runner()
|
||||
runner.stop = AsyncMock()
|
||||
|
||||
# A decoy background task that SHOULD be cancelled, plus the restart task
|
||||
# that must NOT be.
|
||||
async def _decoy():
|
||||
await asyncio.sleep(60)
|
||||
|
||||
decoy = asyncio.create_task(_decoy())
|
||||
runner._background_tasks.add(decoy)
|
||||
decoy.add_done_callback(runner._background_tasks.discard)
|
||||
|
||||
assert runner.request_restart(detached=False, via_service=True) is True
|
||||
restart_task = runner._restart_task
|
||||
assert restart_task is not None
|
||||
assert restart_task not in runner._background_tasks
|
||||
|
||||
# Run the real cancel loop body in isolation (mirrors _stop_impl:7234).
|
||||
runner._stop_task = None
|
||||
for _task in list(runner._background_tasks):
|
||||
if _task is runner._stop_task:
|
||||
continue
|
||||
if _task is runner._restart_task:
|
||||
continue
|
||||
_task.cancel()
|
||||
|
||||
await asyncio.sleep(0) # let cancellation settle
|
||||
assert decoy.cancelled()
|
||||
assert not restart_task.cancelled()
|
||||
|
||||
await restart_task
|
||||
runner.stop.assert_awaited_once_with(
|
||||
restart=True, detached_restart=False, service_restart=True
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_launch_detached_restart_command_uses_setsid(monkeypatch):
|
||||
runner, _adapter = make_restart_runner()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue