fix(gateway): exclude _run_restart from _background_tasks to prevent zombie on /restart

When request_restart() adds _run_restart to _background_tasks, _stop_impl
later cancels all entries in that set. Since _run_restart is awaiting
_stop_task at that point, the CancelledError propagates into _stop_impl,
interrupting cleanup before _shutdown_event.set() and _exit_code = 75
execute. This leaves the gateway either running as a zombie (alive but
disconnected) or exiting with code 0 instead of 75, which prevents
systemd Restart=on-failure from restarting the service.

Fix: don't add _run_restart to _background_tasks — after a ~50ms delay it
awaits stop() and returns on its own, so it needs no lifecycle management.

Fixes #12875
This commit is contained in:
zeapsu 2026-04-20 12:53:13 -07:00 committed by Teknium
parent 08e131f77c
commit 1ce5d6d974
2 changed files with 20 additions and 7 deletions

View file

@ -5520,9 +5520,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
await asyncio.sleep(0.05)
await self.stop(restart=True, detached_restart=detached, service_restart=via_service)
task = asyncio.create_task(_run_restart())
self._background_tasks.add(task)
task.add_done_callback(self._background_tasks.discard)
# _run_restart is a short-lived self-terminating task (calls stop()
# then returns). Don't add it to _background_tasks — _stop_impl
# cancels all entries in that set, which would cancel _run_restart
# while it's awaiting _stop_task, propagating CancelledError into
# _stop_impl and preventing _shutdown_event.set() / _exit_code = 75.
# See #12875.
asyncio.create_task(_run_restart())
return True
# Drain-timeout reasons set by _stop_impl() when a still-running turn is

View file

@ -181,11 +181,20 @@ async def test_request_restart_is_idempotent():
runner, _adapter = make_restart_runner()
runner.stop = AsyncMock()
assert runner.request_restart(detached=True, via_service=False) is True
first_task = next(iter(runner._background_tasks))
assert runner.request_restart(detached=True, via_service=False) is False
# Patch create_task to capture the restart task (it's no longer in
# _background_tasks — see #12875).
_captured = []
_orig_create_task = asyncio.create_task
def _capture(coro, **kw):
t = _orig_create_task(coro, **kw)
_captured.append(t)
return t
with pytest.MonkeyPatch.context() as mp:
mp.setattr(asyncio, "create_task", _capture)
assert runner.request_restart(detached=True, via_service=False) is True
assert runner.request_restart(detached=True, via_service=False) is False
await first_task
await _captured[0]
runner.stop.assert_awaited_once_with(
restart=True, detached_restart=True, service_restart=False