fix(gateway): hold _run_restart on _restart_task + explicit cancel-loop skip

Follow-up on the cherry-picked #13173 fix. Holds the _run_restart task in
self._restart_task (a bare asyncio.create_task keeps only a weak reference,
so a still-pending task can be GC'd mid-flight) and explicitly skips it in
the _stop_impl cancel loop alongside _stop_task. Adds AUTHOR_MAP entry for
the contributor and a regression test that fails when the task is cancellable.

Refs #12875
This commit is contained in:
teknium1 2026-06-27 03:45:07 -07:00 committed by Teknium
parent 1ce5d6d974
commit f2ca3e3d84
3 changed files with 67 additions and 14 deletions

View file

@ -2559,6 +2559,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
_restart_via_service: bool = False
_restart_command_source: Optional[SessionSource] = None
_stop_task: Optional[asyncio.Task] = None
_restart_task: Optional[asyncio.Task] = None
_session_model_overrides: Dict[str, Dict[str, str]] = {}
_session_reasoning_overrides: Dict[str, Dict[str, Any]] = {}
_startup_restore_in_progress: bool = False
@ -2639,6 +2640,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._restart_via_service = False
self._restart_command_source: Optional[SessionSource] = None
self._stop_task: Optional[asyncio.Task] = None
self._restart_task: Optional[asyncio.Task] = None
# Track running agents per session for interrupt support
# Key: session_key, Value: AIAgent instance
@ -5526,7 +5528,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
# while it's awaiting _stop_task, propagating CancelledError into
# _stop_impl and preventing _shutdown_event.set() / _exit_code = 75.
# See #12875.
asyncio.create_task(_run_restart())
#
# We still hold a strong reference in self._restart_task: a bare
# asyncio.create_task() keeps only a weak reference, so the event
# loop may garbage-collect a still-pending task mid-flight. The
# cancel loop in _stop_impl explicitly skips _restart_task for the
# same reason it skips _stop_task.
self._restart_task = asyncio.create_task(_run_restart())
return True
# Drain-timeout reasons set by _stop_impl() when a still-running turn is
@ -7234,6 +7242,12 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
for _task in list(self._background_tasks):
if _task is self._stop_task:
continue
if _task is self._restart_task:
# The restart orchestration task is awaiting _stop_task
# right now; cancelling it would propagate CancelledError
# into this _stop_impl and skip _shutdown_event.set() /
# _exit_code = 75 (#12875). It self-terminates anyway.
continue
_task.cancel()
self._background_tasks.clear()

View file

@ -153,6 +153,7 @@ AUTHOR_MAP = {
"290859878+synapsesx@users.noreply.github.com": "synapsesx",
"157689911+itsflownium@users.noreply.github.com": "itsflownium",
"dirtyren@users.noreply.github.com": "dirtyren",
"--email": "andryypaez@gmail.com",
"mucio@mucio.net": "francescomucio",
"291572938+thestral123@users.noreply.github.com": "thestral123",
"tkwong@inspiresynergy.com": "tkwong",

View file

@ -181,26 +181,64 @@ async def test_request_restart_is_idempotent():
runner, _adapter = make_restart_runner()
runner.stop = AsyncMock()
# Patch create_task to capture the restart task (it's no longer in
# _background_tasks — see #12875).
_captured = []
_orig_create_task = asyncio.create_task
def _capture(coro, **kw):
t = _orig_create_task(coro, **kw)
_captured.append(t)
return t
with pytest.MonkeyPatch.context() as mp:
mp.setattr(asyncio, "create_task", _capture)
assert runner.request_restart(detached=True, via_service=False) is True
assert runner.request_restart(detached=True, via_service=False) is False
# _run_restart is held on self._restart_task and is intentionally NOT in
# _background_tasks, so _stop_impl's cancel loop can't abort it mid-await
# (see #12875).
assert runner.request_restart(detached=True, via_service=False) is True
assert runner._restart_task is not None
assert runner._restart_task not in runner._background_tasks
assert runner.request_restart(detached=True, via_service=False) is False
await _captured[0]
await runner._restart_task
runner.stop.assert_awaited_once_with(
restart=True, detached_restart=True, service_restart=False
)
@pytest.mark.asyncio
async def test_run_restart_excluded_from_stop_cancel_loop():
"""Regression for #12875: _run_restart is held on self._restart_task and
kept OUT of _background_tasks, and the _stop_impl cancel loop explicitly
skips it. If it were in _background_tasks, the cancel loop (which fires
while _run_restart is awaiting _stop_task) would propagate CancelledError
into _stop_impl and skip _shutdown_event.set() / _exit_code = 75."""
runner, _adapter = make_restart_runner()
runner.stop = AsyncMock()
# A decoy background task that SHOULD be cancelled, plus the restart task
# that must NOT be.
async def _decoy():
await asyncio.sleep(60)
decoy = asyncio.create_task(_decoy())
runner._background_tasks.add(decoy)
decoy.add_done_callback(runner._background_tasks.discard)
assert runner.request_restart(detached=False, via_service=True) is True
restart_task = runner._restart_task
assert restart_task is not None
assert restart_task not in runner._background_tasks
# Run the real cancel loop body in isolation (mirrors _stop_impl:7234).
runner._stop_task = None
for _task in list(runner._background_tasks):
if _task is runner._stop_task:
continue
if _task is runner._restart_task:
continue
_task.cancel()
await asyncio.sleep(0) # let cancellation settle
assert decoy.cancelled()
assert not restart_task.cancelled()
await restart_task
runner.stop.assert_awaited_once_with(
restart=True, detached_restart=False, service_restart=True
)
@pytest.mark.asyncio
async def test_launch_detached_restart_command_uses_setsid(monkeypatch):
runner, _adapter = make_restart_runner()