From 2f29e3cfc584d48eabafc8516cff66d4952cd2b8 Mon Sep 17 00:00:00 2001 From: srojk34 <286497132+srojk34@users.noreply.github.com> Date: Sat, 27 Jun 2026 12:08:50 +0300 Subject: [PATCH] fix(gateway): restore MoA one-shot model override on failed turns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MoA one-shot restore ran inside the try block after _handle_message_with_agent returned. When that call raised an exception (agent init failure, interpreter shutdown, OOM), the restore was skipped and the MoA model override stayed permanently on _session_model_overrides — silently routing all subsequent messages through the MoA reference fan-out with no user-visible indication. Move the restore to the finally block so it fires on every exit path (success, exception, interrupt). The restore data lives on the per-turn event object and would be lost if not consumed here. --- gateway/run.py | 27 ++--- tests/gateway/test_moa_one_shot_restore.py | 130 +++++++++++++++++++++ 2 files changed, 140 insertions(+), 17 deletions(-) create mode 100644 tests/gateway/test_moa_one_shot_restore.py diff --git a/gateway/run.py b/gateway/run.py index e013e102941..bff170533f8 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9020,16 +9020,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew try: _agent_result = await self._handle_message_with_agent(event, source, _quick_key, _run_generation) - if getattr(event, "_moa_disable_after_turn", False): - try: - _restore = getattr(event, "_moa_restore_override", None) - if _restore is None: - self._session_model_overrides.pop(_quick_key, None) - else: - self._session_model_overrides[_quick_key] = _restore - self._evict_cached_agent(_quick_key) - except Exception: - pass # Goal continuation: after the agent returns a final response # for this turn, check any standing /goal — the judge will # either mark it done, pause it (budget), or enqueue a @@ -9060,13 +9050,16 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew logger.debug("goal continuation hook failed: %s", _goal_exc) return _agent_result finally: - # Unconditional release covers every exit path. _release_running_agent_state - # is idempotent (pop-on-absent is harmless) and, called without a - # run_generation guard, always clears the slot regardless of which - # generation it holds. This evicts the zombie left when session_reset - # bumps the generation (N -> N+1) mid-flight: gen-N's guarded release - # inside _run_agent returns False, and the old sentinel-only check here - # missed the leftover real agent — locking the session out forever (#28686). + if getattr(event, "_moa_disable_after_turn", False): + try: + _restore = getattr(event, "_moa_restore_override", None) + if _restore is None: + self._session_model_overrides.pop(_quick_key, None) + else: + self._session_model_overrides[_quick_key] = _restore + self._evict_cached_agent(_quick_key) + except Exception: + pass self._release_running_agent_state(_quick_key) async def _prepare_inbound_message_text( diff --git a/tests/gateway/test_moa_one_shot_restore.py b/tests/gateway/test_moa_one_shot_restore.py new file mode 100644 index 00000000000..5d4ce707a6b --- /dev/null +++ b/tests/gateway/test_moa_one_shot_restore.py @@ -0,0 +1,130 @@ +"""MoA one-shot model override must be restored on both success and failure.""" + +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + + +def _make_runner(): + """Build a minimal GatewayRunner-like object with the fields the + MoA one-shot restore path reads/writes.""" + runner = MagicMock() + runner._session_model_overrides = {} + runner._release_running_agent_state = MagicMock() + runner._evict_cached_agent = MagicMock() + runner._begin_session_run_generation = MagicMock(return_value=1) + return runner + + +def _make_event(text="hello", moa_disable=False, moa_restore=None): + event = SimpleNamespace() + event.text = text + if moa_disable: + event._moa_disable_after_turn = True + event._moa_restore_override = moa_restore + return event + + +class TestMoaOneShotRestore: + """The gateway's MoA one-shot restore must fire in the finally block + so a failed turn still reverts the model override.""" + + def test_restore_fires_on_success(self): + """Normal successful turn restores the previous model override.""" + runner = _make_runner() + key = "agent:main:telegram:dm:123" + runner._session_model_overrides[key] = { + "provider": "moa", "model": "default", + } + event = _make_event( + moa_disable=True, + moa_restore={"provider": "openrouter", "model": "gpt-4"}, + ) + + # Simulate: try block succeeds, finally runs + try: + pass # _handle_message_with_agent succeeds + finally: + if getattr(event, "_moa_disable_after_turn", False): + _restore = getattr(event, "_moa_restore_override", None) + if _restore is None: + runner._session_model_overrides.pop(key, None) + else: + runner._session_model_overrides[key] = _restore + + assert runner._session_model_overrides[key] == { + "provider": "openrouter", "model": "gpt-4", + } + + def test_restore_fires_on_exception(self): + """A failed turn (exception) must still restore the previous model.""" + runner = _make_runner() + key = "agent:main:telegram:dm:123" + runner._session_model_overrides[key] = { + "provider": "moa", "model": "default", + } + event = _make_event( + moa_disable=True, + moa_restore={"provider": "openrouter", "model": "gpt-4"}, + ) + + # Simulate: try block raises, finally still runs + try: + raise RuntimeError("provider error") + except RuntimeError: + pass + finally: + if getattr(event, "_moa_disable_after_turn", False): + _restore = getattr(event, "_moa_restore_override", None) + if _restore is None: + runner._session_model_overrides.pop(key, None) + else: + runner._session_model_overrides[key] = _restore + + assert runner._session_model_overrides[key] == { + "provider": "openrouter", "model": "gpt-4", + } + + def test_restore_none_clears_override(self): + """When the user had no model override before /moa, the override + should be removed (not left as MoA).""" + runner = _make_runner() + key = "agent:main:discord:guild:456" + runner._session_model_overrides[key] = { + "provider": "moa", "model": "default", + } + event = _make_event(moa_disable=True, moa_restore=None) + + try: + raise RuntimeError("timeout") + except RuntimeError: + pass + finally: + if getattr(event, "_moa_disable_after_turn", False): + _restore = getattr(event, "_moa_restore_override", None) + if _restore is None: + runner._session_model_overrides.pop(key, None) + else: + runner._session_model_overrides[key] = _restore + + assert key not in runner._session_model_overrides + + def test_no_restore_when_not_one_shot(self): + """Normal (non-MoA) turns must not touch model overrides.""" + runner = _make_runner() + key = "agent:main:slack:channel:789" + runner._session_model_overrides[key] = { + "provider": "openrouter", "model": "gpt-4", + } + event = _make_event() # no _moa_disable_after_turn + + try: + pass + finally: + if getattr(event, "_moa_disable_after_turn", False): + runner._session_model_overrides.pop(key, None) + + assert runner._session_model_overrides[key] == { + "provider": "openrouter", "model": "gpt-4", + }