diff --git a/gateway/run.py b/gateway/run.py index e2595f880aa..84a5c61acdb 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2559,6 +2559,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew _restart_via_service: bool = False _restart_command_source: Optional[SessionSource] = None _stop_task: Optional[asyncio.Task] = None + _restart_task: Optional[asyncio.Task] = None _session_model_overrides: Dict[str, Dict[str, str]] = {} _session_reasoning_overrides: Dict[str, Dict[str, Any]] = {} _startup_restore_in_progress: bool = False @@ -2639,6 +2640,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._restart_via_service = False self._restart_command_source: Optional[SessionSource] = None self._stop_task: Optional[asyncio.Task] = None + self._restart_task: Optional[asyncio.Task] = None # Track running agents per session for interrupt support # Key: session_key, Value: AIAgent instance @@ -5526,7 +5528,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew # while it's awaiting _stop_task, propagating CancelledError into # _stop_impl and preventing _shutdown_event.set() / _exit_code = 75. # See #12875. - asyncio.create_task(_run_restart()) + # + # We still hold a strong reference in self._restart_task: a bare + # asyncio.create_task() keeps only a weak reference, so the event + # loop may garbage-collect a still-pending task mid-flight. The + # cancel loop in _stop_impl explicitly skips _restart_task for the + # same reason it skips _stop_task. + self._restart_task = asyncio.create_task(_run_restart()) return True # Drain-timeout reasons set by _stop_impl() when a still-running turn is @@ -7234,6 +7242,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew for _task in list(self._background_tasks): if _task is self._stop_task: continue + if _task is self._restart_task: + # The restart orchestration task is awaiting _stop_task + # right now; cancelling it would propagate CancelledError + # into this _stop_impl and skip _shutdown_event.set() / + # _exit_code = 75 (#12875). It self-terminates anyway. + continue _task.cancel() self._background_tasks.clear() diff --git a/scripts/release.py b/scripts/release.py index 155ffdb69b2..2e2fd3deb6c 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -153,6 +153,7 @@ AUTHOR_MAP = { "290859878+synapsesx@users.noreply.github.com": "synapsesx", "157689911+itsflownium@users.noreply.github.com": "itsflownium", "dirtyren@users.noreply.github.com": "dirtyren", + "--email": "andryypaez@gmail.com", "mucio@mucio.net": "francescomucio", "291572938+thestral123@users.noreply.github.com": "thestral123", "tkwong@inspiresynergy.com": "tkwong", diff --git a/tests/gateway/test_restart_drain.py b/tests/gateway/test_restart_drain.py index 07077539b47..fbdb29dc18d 100644 --- a/tests/gateway/test_restart_drain.py +++ b/tests/gateway/test_restart_drain.py @@ -181,26 +181,64 @@ async def test_request_restart_is_idempotent(): runner, _adapter = make_restart_runner() runner.stop = AsyncMock() - # Patch create_task to capture the restart task (it's no longer in - # _background_tasks — see #12875). - _captured = [] - _orig_create_task = asyncio.create_task - def _capture(coro, **kw): - t = _orig_create_task(coro, **kw) - _captured.append(t) - return t - with pytest.MonkeyPatch.context() as mp: - mp.setattr(asyncio, "create_task", _capture) - assert runner.request_restart(detached=True, via_service=False) is True - assert runner.request_restart(detached=True, via_service=False) is False + # _run_restart is held on self._restart_task and is intentionally NOT in + # _background_tasks, so _stop_impl's cancel loop can't abort it mid-await + # (see #12875). + assert runner.request_restart(detached=True, via_service=False) is True + assert runner._restart_task is not None + assert runner._restart_task not in runner._background_tasks + assert runner.request_restart(detached=True, via_service=False) is False - await _captured[0] + await runner._restart_task runner.stop.assert_awaited_once_with( restart=True, detached_restart=True, service_restart=False ) +@pytest.mark.asyncio +async def test_run_restart_excluded_from_stop_cancel_loop(): + """Regression for #12875: _run_restart is held on self._restart_task and + kept OUT of _background_tasks, and the _stop_impl cancel loop explicitly + skips it. If it were in _background_tasks, the cancel loop (which fires + while _run_restart is awaiting _stop_task) would propagate CancelledError + into _stop_impl and skip _shutdown_event.set() / _exit_code = 75.""" + runner, _adapter = make_restart_runner() + runner.stop = AsyncMock() + + # A decoy background task that SHOULD be cancelled, plus the restart task + # that must NOT be. + async def _decoy(): + await asyncio.sleep(60) + + decoy = asyncio.create_task(_decoy()) + runner._background_tasks.add(decoy) + decoy.add_done_callback(runner._background_tasks.discard) + + assert runner.request_restart(detached=False, via_service=True) is True + restart_task = runner._restart_task + assert restart_task is not None + assert restart_task not in runner._background_tasks + + # Run the real cancel loop body in isolation (mirrors _stop_impl:7234). + runner._stop_task = None + for _task in list(runner._background_tasks): + if _task is runner._stop_task: + continue + if _task is runner._restart_task: + continue + _task.cancel() + + await asyncio.sleep(0) # let cancellation settle + assert decoy.cancelled() + assert not restart_task.cancelled() + + await restart_task + runner.stop.assert_awaited_once_with( + restart=True, detached_restart=False, service_restart=True + ) + + @pytest.mark.asyncio async def test_launch_detached_restart_command_uses_setsid(monkeypatch): runner, _adapter = make_restart_runner()