From 4e0788783b10dc7adb2689ab6d8b01b0485ed4e6 Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 03:24:51 -0700 Subject: [PATCH] refactor(gateway): extract MoA one-shot restore helper; restore #28686 comment; real-method tests Follow-up on the salvaged MoA restore fix: - Extract the finally-block restore into _restore_moa_one_shot() so the behavior is unit-testable without re-implementing it, and so the gateway /moa handler and the finally block share one implementation. - Restore the load-bearing #28686 zombie-eviction comment above _release_running_agent_state that the original diff dropped. - Rewrite the tests to call the real _restore_moa_one_shot helper (the originals re-implemented the restore logic inline, so they passed regardless of the production code). --- gateway/run.py | 47 ++++-- tests/gateway/test_moa_one_shot_restore.py | 187 +++++++++------------ 2 files changed, 117 insertions(+), 117 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index bff170533f8..e8c82a0ccd4 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9050,18 +9050,45 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew logger.debug("goal continuation hook failed: %s", _goal_exc) return _agent_result finally: - if getattr(event, "_moa_disable_after_turn", False): - try: - _restore = getattr(event, "_moa_restore_override", None) - if _restore is None: - self._session_model_overrides.pop(_quick_key, None) - else: - self._session_model_overrides[_quick_key] = _restore - self._evict_cached_agent(_quick_key) - except Exception: - pass + # MoA one-shot restore must run on EVERY exit path, not just + # success. The restore data lives on the per-turn event object + # (_moa_restore_override), which is discarded once the event goes + # out of scope — so if _handle_message_with_agent raises, a restore + # in the try block would be skipped and the MoA override would leak + # permanently (every later message silently fans out through MoA). + # Putting it in finally guarantees the revert on success, exception, + # and interrupt alike. + self._restore_moa_one_shot(event, _quick_key) + # Unconditional release covers every exit path. _release_running_agent_state + # is idempotent (pop-on-absent is harmless) and, called without a + # run_generation guard, always clears the slot regardless of which + # generation it holds. This evicts the zombie left when session_reset + # bumps the generation (N -> N+1) mid-flight: gen-N's guarded release + # inside _run_agent returns False, and the old sentinel-only check here + # missed the leftover real agent — locking the session out forever (#28686). self._release_running_agent_state(_quick_key) + def _restore_moa_one_shot(self, event: "MessageEvent", quick_key: str) -> None: + """Revert a ``/moa `` one-shot model override after its turn. + + Called from the ``finally`` of the message-handling path so the revert + fires whether the turn succeeded, raised, or was interrupted. A no-op + unless ``event._moa_disable_after_turn`` is set. ``_moa_restore_override`` + carries the prior per-session override (``None`` means the user had no + override, so the MoA override is cleared outright). + """ + if not getattr(event, "_moa_disable_after_turn", False): + return + try: + _restore = getattr(event, "_moa_restore_override", None) + if _restore is None: + self._session_model_overrides.pop(quick_key, None) + else: + self._session_model_overrides[quick_key] = _restore + self._evict_cached_agent(quick_key) + except Exception: + pass + async def _prepare_inbound_message_text( self, *, diff --git a/tests/gateway/test_moa_one_shot_restore.py b/tests/gateway/test_moa_one_shot_restore.py index 5d4ce707a6b..25e9e54fa06 100644 --- a/tests/gateway/test_moa_one_shot_restore.py +++ b/tests/gateway/test_moa_one_shot_restore.py @@ -1,130 +1,103 @@ -"""MoA one-shot model override must be restored on both success and failure.""" +"""MoA one-shot model override must be restored on both success and failure. + +These exercise the real ``GatewayRunner._restore_moa_one_shot`` helper that the +message-handling ``finally`` block calls, so they prove the production logic — +not a re-implementation of it. The bug being guarded: the restore used to live +in the ``try`` block, so a turn that raised skipped it and the MoA override +leaked permanently (every later message silently fanned out through MoA). +""" from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import MagicMock -import pytest +from gateway.run import GatewayRunner def _make_runner(): - """Build a minimal GatewayRunner-like object with the fields the - MoA one-shot restore path reads/writes.""" - runner = MagicMock() + """Minimal GatewayRunner with only the fields _restore_moa_one_shot reads.""" + runner = object.__new__(GatewayRunner) runner._session_model_overrides = {} - runner._release_running_agent_state = MagicMock() runner._evict_cached_agent = MagicMock() - runner._begin_session_run_generation = MagicMock(return_value=1) return runner -def _make_event(text="hello", moa_disable=False, moa_restore=None): +def _make_event(moa_disable=False, moa_restore=None): event = SimpleNamespace() - event.text = text if moa_disable: event._moa_disable_after_turn = True event._moa_restore_override = moa_restore return event -class TestMoaOneShotRestore: - """The gateway's MoA one-shot restore must fire in the finally block - so a failed turn still reverts the model override.""" +def test_restore_reverts_to_previous_override(): + """A one-shot turn restores the prior per-session model override.""" + runner = _make_runner() + key = "agent:main:telegram:dm:123" + runner._session_model_overrides[key] = {"provider": "moa", "model": "default"} + event = _make_event( + moa_disable=True, + moa_restore={"provider": "openrouter", "model": "gpt-4"}, + ) - def test_restore_fires_on_success(self): - """Normal successful turn restores the previous model override.""" - runner = _make_runner() - key = "agent:main:telegram:dm:123" - runner._session_model_overrides[key] = { - "provider": "moa", "model": "default", - } - event = _make_event( - moa_disable=True, - moa_restore={"provider": "openrouter", "model": "gpt-4"}, - ) + runner._restore_moa_one_shot(event, key) - # Simulate: try block succeeds, finally runs + assert runner._session_model_overrides[key] == { + "provider": "openrouter", + "model": "gpt-4", + } + runner._evict_cached_agent.assert_called_once_with(key) + + +def test_restore_none_clears_override(): + """If the user had no override before /moa, the MoA override is removed.""" + runner = _make_runner() + key = "agent:main:discord:guild:456" + runner._session_model_overrides[key] = {"provider": "moa", "model": "default"} + event = _make_event(moa_disable=True, moa_restore=None) + + runner._restore_moa_one_shot(event, key) + + assert key not in runner._session_model_overrides + runner._evict_cached_agent.assert_called_once_with(key) + + +def test_no_restore_for_non_one_shot_turn(): + """Normal (non-MoA) turns must not touch model overrides or evict agents.""" + runner = _make_runner() + key = "agent:main:slack:channel:789" + runner._session_model_overrides[key] = {"provider": "openrouter", "model": "gpt-4"} + event = _make_event() # no _moa_disable_after_turn + + runner._restore_moa_one_shot(event, key) + + assert runner._session_model_overrides[key] == { + "provider": "openrouter", + "model": "gpt-4", + } + runner._evict_cached_agent.assert_not_called() + + +def test_restore_runs_from_finally_even_when_turn_raises(): + """The whole point of the fix: a raising turn still reverts the override. + + Mirrors the real call site — the restore is invoked from a ``finally`` block, + so it fires after an exception propagates out of the turn body. + """ + runner = _make_runner() + key = "agent:main:telegram:dm:999" + runner._session_model_overrides[key] = {"provider": "moa", "model": "default"} + event = _make_event( + moa_disable=True, + moa_restore={"provider": "openrouter", "model": "gpt-4"}, + ) + + with __import__("pytest").raises(RuntimeError): try: - pass # _handle_message_with_agent succeeds + raise RuntimeError("provider error mid-turn") finally: - if getattr(event, "_moa_disable_after_turn", False): - _restore = getattr(event, "_moa_restore_override", None) - if _restore is None: - runner._session_model_overrides.pop(key, None) - else: - runner._session_model_overrides[key] = _restore + runner._restore_moa_one_shot(event, key) - assert runner._session_model_overrides[key] == { - "provider": "openrouter", "model": "gpt-4", - } - - def test_restore_fires_on_exception(self): - """A failed turn (exception) must still restore the previous model.""" - runner = _make_runner() - key = "agent:main:telegram:dm:123" - runner._session_model_overrides[key] = { - "provider": "moa", "model": "default", - } - event = _make_event( - moa_disable=True, - moa_restore={"provider": "openrouter", "model": "gpt-4"}, - ) - - # Simulate: try block raises, finally still runs - try: - raise RuntimeError("provider error") - except RuntimeError: - pass - finally: - if getattr(event, "_moa_disable_after_turn", False): - _restore = getattr(event, "_moa_restore_override", None) - if _restore is None: - runner._session_model_overrides.pop(key, None) - else: - runner._session_model_overrides[key] = _restore - - assert runner._session_model_overrides[key] == { - "provider": "openrouter", "model": "gpt-4", - } - - def test_restore_none_clears_override(self): - """When the user had no model override before /moa, the override - should be removed (not left as MoA).""" - runner = _make_runner() - key = "agent:main:discord:guild:456" - runner._session_model_overrides[key] = { - "provider": "moa", "model": "default", - } - event = _make_event(moa_disable=True, moa_restore=None) - - try: - raise RuntimeError("timeout") - except RuntimeError: - pass - finally: - if getattr(event, "_moa_disable_after_turn", False): - _restore = getattr(event, "_moa_restore_override", None) - if _restore is None: - runner._session_model_overrides.pop(key, None) - else: - runner._session_model_overrides[key] = _restore - - assert key not in runner._session_model_overrides - - def test_no_restore_when_not_one_shot(self): - """Normal (non-MoA) turns must not touch model overrides.""" - runner = _make_runner() - key = "agent:main:slack:channel:789" - runner._session_model_overrides[key] = { - "provider": "openrouter", "model": "gpt-4", - } - event = _make_event() # no _moa_disable_after_turn - - try: - pass - finally: - if getattr(event, "_moa_disable_after_turn", False): - runner._session_model_overrides.pop(key, None) - - assert runner._session_model_overrides[key] == { - "provider": "openrouter", "model": "gpt-4", - } + assert runner._session_model_overrides[key] == { + "provider": "openrouter", + "model": "gpt-4", + }