From 1ce5d6d974fef0e6089bcbd1f2320f6955ae3215 Mon Sep 17 00:00:00 2001 From: zeapsu Date: Mon, 20 Apr 2026 12:53:13 -0700 Subject: [PATCH] fix(gateway): exclude _run_restart from _background_tasks to prevent zombie on /restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When request_restart() adds _run_restart to _background_tasks, _stop_impl later cancels all entries in that set. Since _run_restart is awaiting _stop_task at that point, the CancelledError propagates into _stop_impl, interrupting cleanup before _shutdown_event.set() and _exit_code = 75 execute. This leaves the gateway as a zombie (alive but disconnected) or exiting with code 0 instead of 75, preventing systemd Restart=on-failure from restarting the service. Fix: don't add _run_restart to _background_tasks — it is short-lived (a 50 ms delay, then stop()) and returns on its own, so it must not be cancelled by _stop_impl. Fixes #12875 --- gateway/run.py | 10 +++++++--- tests/gateway/test_restart_drain.py | 17 +++++++++++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index e8c82a0ccd4..e2595f880aa 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -5520,9 +5520,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew await asyncio.sleep(0.05) await self.stop(restart=True, detached_restart=detached, service_restart=via_service) - task = asyncio.create_task(_run_restart()) - self._background_tasks.add(task) - task.add_done_callback(self._background_tasks.discard) + # _run_restart is a short-lived self-terminating task (calls stop() + # then returns). Don't add it to _background_tasks — _stop_impl + # cancels all entries in that set, which would cancel _run_restart + # while it's awaiting _stop_task, propagating CancelledError into + # _stop_impl and preventing _shutdown_event.set() / _exit_code = 75. + # See #12875. 
+ self._restart_task = asyncio.create_task(_run_restart()) return True # Drain-timeout reasons set by _stop_impl() when a still-running turn is diff --git a/tests/gateway/test_restart_drain.py b/tests/gateway/test_restart_drain.py index 15b948a4f79..07077539b47 100644 --- a/tests/gateway/test_restart_drain.py +++ b/tests/gateway/test_restart_drain.py @@ -181,11 +181,20 @@ async def test_request_restart_is_idempotent(): runner, _adapter = make_restart_runner() runner.stop = AsyncMock() - assert runner.request_restart(detached=True, via_service=False) is True - first_task = next(iter(runner._background_tasks)) - assert runner.request_restart(detached=True, via_service=False) is False + # Patch create_task to capture the restart task (it's no longer in + # _background_tasks — see #12875). + _captured = [] + _orig_create_task = asyncio.create_task + def _capture(coro, **kw): + t = _orig_create_task(coro, **kw) + _captured.append(t) + return t + with pytest.MonkeyPatch.context() as mp: + mp.setattr(asyncio, "create_task", _capture) + assert runner.request_restart(detached=True, via_service=False) is True + assert runner.request_restart(detached=True, via_service=False) is False - await first_task + await _captured[0] runner.stop.assert_awaited_once_with( restart=True, detached_restart=True, service_restart=False