mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(gateway): exclude _run_restart from _background_tasks to prevent zombie on /restart
When request_restart() adds the _run_restart task to _background_tasks, _stop_impl later cancels every entry in that set, including _run_restart itself. Because _run_restart is awaiting _stop_task at that point, the CancelledError propagates into _stop_impl and interrupts cleanup before _shutdown_event.set() and _exit_code = 75 execute. The gateway is then left either as a zombie (alive but disconnected) or exiting with code 0 instead of 75, so systemd Restart=on-failure never restarts the service. Fix: don't add _run_restart to _background_tasks — it is self-terminating (it sleeps ~50ms, awaits stop(), then returns) and needs no lifecycle management. Fixes #12875.
This commit is contained in:
parent
08e131f77c
commit
1ce5d6d974
2 changed files with 20 additions and 7 deletions
|
|
@ -5520,9 +5520,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
await asyncio.sleep(0.05)
|
||||
await self.stop(restart=True, detached_restart=detached, service_restart=via_service)
|
||||
|
||||
task = asyncio.create_task(_run_restart())
|
||||
self._background_tasks.add(task)
|
||||
task.add_done_callback(self._background_tasks.discard)
|
||||
# _run_restart is a short-lived self-terminating task (calls stop()
|
||||
# then returns). Don't add it to _background_tasks — _stop_impl
|
||||
# cancels all entries in that set, which would cancel _run_restart
|
||||
# while it's awaiting _stop_task, propagating CancelledError into
|
||||
# _stop_impl and preventing _shutdown_event.set() / _exit_code = 75.
|
||||
# See #12875.
|
||||
asyncio.create_task(_run_restart())
|
||||
return True
|
||||
|
||||
# Drain-timeout reasons set by _stop_impl() when a still-running turn is
|
||||
|
|
|
|||
|
|
@ -181,11 +181,20 @@ async def test_request_restart_is_idempotent():
|
|||
runner, _adapter = make_restart_runner()
|
||||
runner.stop = AsyncMock()
|
||||
|
||||
assert runner.request_restart(detached=True, via_service=False) is True
|
||||
first_task = next(iter(runner._background_tasks))
|
||||
assert runner.request_restart(detached=True, via_service=False) is False
|
||||
# Patch create_task to capture the restart task (it's no longer in
|
||||
# _background_tasks — see #12875).
|
||||
_captured = []
|
||||
_orig_create_task = asyncio.create_task
|
||||
def _capture(coro, **kw):
|
||||
t = _orig_create_task(coro, **kw)
|
||||
_captured.append(t)
|
||||
return t
|
||||
with pytest.MonkeyPatch.context() as mp:
|
||||
mp.setattr(asyncio, "create_task", _capture)
|
||||
assert runner.request_restart(detached=True, via_service=False) is True
|
||||
assert runner.request_restart(detached=True, via_service=False) is False
|
||||
|
||||
await first_task
|
||||
await _captured[0]
|
||||
|
||||
runner.stop.assert_awaited_once_with(
|
||||
restart=True, detached_restart=True, service_restart=False
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue