fix(gateway): restore MoA one-shot model override on failed turns

The MoA one-shot restore ran inside the try block after
_handle_message_with_agent returned. When that call raised an
exception (agent init failure, interpreter shutdown, OOM), the
restore was skipped and the MoA model override stayed permanently
on _session_model_overrides — silently routing all subsequent
messages through the MoA reference fan-out with no user-visible
indication.

Move the restore to the finally block so it fires on every exit
path (success, exception, interrupt). The restore data lives on
the per-turn event object and would be lost if not consumed here.
This commit is contained in:
srojk34 2026-06-27 12:08:50 +03:00 committed by Teknium
parent 17cb829991
commit 2f29e3cfc5
2 changed files with 140 additions and 17 deletions

View file

@ -9020,16 +9020,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
try:
_agent_result = await self._handle_message_with_agent(event, source, _quick_key, _run_generation)
if getattr(event, "_moa_disable_after_turn", False):
try:
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
self._session_model_overrides.pop(_quick_key, None)
else:
self._session_model_overrides[_quick_key] = _restore
self._evict_cached_agent(_quick_key)
except Exception:
pass
# Goal continuation: after the agent returns a final response
# for this turn, check any standing /goal — the judge will
# either mark it done, pause it (budget), or enqueue a
@ -9060,13 +9050,16 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
logger.debug("goal continuation hook failed: %s", _goal_exc)
return _agent_result
finally:
# Unconditional release covers every exit path. _release_running_agent_state
# is idempotent (pop-on-absent is harmless) and, called without a
# run_generation guard, always clears the slot regardless of which
# generation it holds. This evicts the zombie left when session_reset
# bumps the generation (N -> N+1) mid-flight: gen-N's guarded release
# inside _run_agent returns False, and the old sentinel-only check here
# missed the leftover real agent — locking the session out forever (#28686).
if getattr(event, "_moa_disable_after_turn", False):
try:
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
self._session_model_overrides.pop(_quick_key, None)
else:
self._session_model_overrides[_quick_key] = _restore
self._evict_cached_agent(_quick_key)
except Exception:
pass
self._release_running_agent_state(_quick_key)
async def _prepare_inbound_message_text(

View file

@ -0,0 +1,130 @@
"""MoA one-shot model override must be restored on both success and failure."""
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
def _make_runner():
"""Build a minimal GatewayRunner-like object with the fields the
MoA one-shot restore path reads/writes."""
runner = MagicMock()
runner._session_model_overrides = {}
runner._release_running_agent_state = MagicMock()
runner._evict_cached_agent = MagicMock()
runner._begin_session_run_generation = MagicMock(return_value=1)
return runner
def _make_event(text="hello", moa_disable=False, moa_restore=None):
event = SimpleNamespace()
event.text = text
if moa_disable:
event._moa_disable_after_turn = True
event._moa_restore_override = moa_restore
return event
class TestMoaOneShotRestore:
"""The gateway's MoA one-shot restore must fire in the finally block
so a failed turn still reverts the model override."""
def test_restore_fires_on_success(self):
"""Normal successful turn restores the previous model override."""
runner = _make_runner()
key = "agent:main:telegram:dm:123"
runner._session_model_overrides[key] = {
"provider": "moa", "model": "default",
}
event = _make_event(
moa_disable=True,
moa_restore={"provider": "openrouter", "model": "gpt-4"},
)
# Simulate: try block succeeds, finally runs
try:
pass # _handle_message_with_agent succeeds
finally:
if getattr(event, "_moa_disable_after_turn", False):
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
runner._session_model_overrides.pop(key, None)
else:
runner._session_model_overrides[key] = _restore
assert runner._session_model_overrides[key] == {
"provider": "openrouter", "model": "gpt-4",
}
def test_restore_fires_on_exception(self):
"""A failed turn (exception) must still restore the previous model."""
runner = _make_runner()
key = "agent:main:telegram:dm:123"
runner._session_model_overrides[key] = {
"provider": "moa", "model": "default",
}
event = _make_event(
moa_disable=True,
moa_restore={"provider": "openrouter", "model": "gpt-4"},
)
# Simulate: try block raises, finally still runs
try:
raise RuntimeError("provider error")
except RuntimeError:
pass
finally:
if getattr(event, "_moa_disable_after_turn", False):
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
runner._session_model_overrides.pop(key, None)
else:
runner._session_model_overrides[key] = _restore
assert runner._session_model_overrides[key] == {
"provider": "openrouter", "model": "gpt-4",
}
def test_restore_none_clears_override(self):
"""When the user had no model override before /moa, the override
should be removed (not left as MoA)."""
runner = _make_runner()
key = "agent:main:discord:guild:456"
runner._session_model_overrides[key] = {
"provider": "moa", "model": "default",
}
event = _make_event(moa_disable=True, moa_restore=None)
try:
raise RuntimeError("timeout")
except RuntimeError:
pass
finally:
if getattr(event, "_moa_disable_after_turn", False):
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
runner._session_model_overrides.pop(key, None)
else:
runner._session_model_overrides[key] = _restore
assert key not in runner._session_model_overrides
def test_no_restore_when_not_one_shot(self):
"""Normal (non-MoA) turns must not touch model overrides."""
runner = _make_runner()
key = "agent:main:slack:channel:789"
runner._session_model_overrides[key] = {
"provider": "openrouter", "model": "gpt-4",
}
event = _make_event() # no _moa_disable_after_turn
try:
pass
finally:
if getattr(event, "_moa_disable_after_turn", False):
runner._session_model_overrides.pop(key, None)
assert runner._session_model_overrides[key] == {
"provider": "openrouter", "model": "gpt-4",
}