refactor(gateway): extract MoA one-shot restore helper; restore #28686 comment; real-method tests

Follow-up on the salvaged MoA restore fix:
- Extract the finally-block restore into _restore_moa_one_shot() so the
  behavior is unit-testable without re-implementing it, and so the gateway
  /moa handler and the finally block share one implementation.
- Restore the load-bearing #28686 zombie-eviction comment above
  _release_running_agent_state that the original diff dropped.
- Rewrite the tests to call the real _restore_moa_one_shot helper (the
  originals re-implemented the restore logic inline, so they passed
  regardless of the production code).
This commit is contained in:
teknium1 2026-06-27 03:24:51 -07:00 committed by Teknium
parent 2f29e3cfc5
commit 4e0788783b
2 changed files with 117 additions and 117 deletions

View file

@ -9050,18 +9050,45 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
logger.debug("goal continuation hook failed: %s", _goal_exc)
return _agent_result
finally:
if getattr(event, "_moa_disable_after_turn", False):
try:
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
self._session_model_overrides.pop(_quick_key, None)
else:
self._session_model_overrides[_quick_key] = _restore
self._evict_cached_agent(_quick_key)
except Exception:
pass
# MoA one-shot restore must run on EVERY exit path, not just
# success. The restore data lives on the per-turn event object
# (_moa_restore_override), which is discarded once the event goes
# out of scope — so if _handle_message_with_agent raises, a restore
# in the try block would be skipped and the MoA override would leak
# permanently (every later message silently fans out through MoA).
# Putting it in finally guarantees the revert on success, exception,
# and interrupt alike.
self._restore_moa_one_shot(event, _quick_key)
# Unconditional release covers every exit path. _release_running_agent_state
# is idempotent (pop-on-absent is harmless) and, called without a
# run_generation guard, always clears the slot regardless of which
# generation it holds. This evicts the zombie left when session_reset
# bumps the generation (N -> N+1) mid-flight: gen-N's guarded release
# inside _run_agent returns False, and the old sentinel-only check here
# missed the leftover real agent — locking the session out forever (#28686).
self._release_running_agent_state(_quick_key)
def _restore_moa_one_shot(self, event: "MessageEvent", quick_key: str) -> None:
"""Revert a ``/moa <prompt>`` one-shot model override after its turn.
Called from the ``finally`` of the message-handling path so the revert
fires whether the turn succeeded, raised, or was interrupted. A no-op
unless ``event._moa_disable_after_turn`` is set. ``_moa_restore_override``
carries the prior per-session override (``None`` means the user had no
override, so the MoA override is cleared outright).
"""
if not getattr(event, "_moa_disable_after_turn", False):
return
try:
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
self._session_model_overrides.pop(quick_key, None)
else:
self._session_model_overrides[quick_key] = _restore
self._evict_cached_agent(quick_key)
except Exception:
pass
async def _prepare_inbound_message_text(
self,
*,

View file

@ -1,130 +1,103 @@
"""MoA one-shot model override must be restored on both success and failure."""
"""MoA one-shot model override must be restored on both success and failure.
These exercise the real ``GatewayRunner._restore_moa_one_shot`` helper that the
message-handling ``finally`` block calls, so they prove the production logic
not a re-implementation of it. The bug being guarded: the restore used to live
in the ``try`` block, so a turn that raised skipped it and the MoA override
leaked permanently (every later message silently fanned out through MoA).
"""
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import MagicMock
import pytest
from gateway.run import GatewayRunner
def _make_runner():
"""Build a minimal GatewayRunner-like object with the fields the
MoA one-shot restore path reads/writes."""
runner = MagicMock()
"""Minimal GatewayRunner with only the fields _restore_moa_one_shot reads."""
runner = object.__new__(GatewayRunner)
runner._session_model_overrides = {}
runner._release_running_agent_state = MagicMock()
runner._evict_cached_agent = MagicMock()
runner._begin_session_run_generation = MagicMock(return_value=1)
return runner
def _make_event(text="hello", moa_disable=False, moa_restore=None):
def _make_event(moa_disable=False, moa_restore=None):
event = SimpleNamespace()
event.text = text
if moa_disable:
event._moa_disable_after_turn = True
event._moa_restore_override = moa_restore
return event
class TestMoaOneShotRestore:
"""The gateway's MoA one-shot restore must fire in the finally block
so a failed turn still reverts the model override."""
def test_restore_reverts_to_previous_override():
"""A one-shot turn restores the prior per-session model override."""
runner = _make_runner()
key = "agent:main:telegram:dm:123"
runner._session_model_overrides[key] = {"provider": "moa", "model": "default"}
event = _make_event(
moa_disable=True,
moa_restore={"provider": "openrouter", "model": "gpt-4"},
)
def test_restore_fires_on_success(self):
"""Normal successful turn restores the previous model override."""
runner = _make_runner()
key = "agent:main:telegram:dm:123"
runner._session_model_overrides[key] = {
"provider": "moa", "model": "default",
}
event = _make_event(
moa_disable=True,
moa_restore={"provider": "openrouter", "model": "gpt-4"},
)
runner._restore_moa_one_shot(event, key)
# Simulate: try block succeeds, finally runs
assert runner._session_model_overrides[key] == {
"provider": "openrouter",
"model": "gpt-4",
}
runner._evict_cached_agent.assert_called_once_with(key)
def test_restore_none_clears_override():
"""If the user had no override before /moa, the MoA override is removed."""
runner = _make_runner()
key = "agent:main:discord:guild:456"
runner._session_model_overrides[key] = {"provider": "moa", "model": "default"}
event = _make_event(moa_disable=True, moa_restore=None)
runner._restore_moa_one_shot(event, key)
assert key not in runner._session_model_overrides
runner._evict_cached_agent.assert_called_once_with(key)
def test_no_restore_for_non_one_shot_turn():
"""Normal (non-MoA) turns must not touch model overrides or evict agents."""
runner = _make_runner()
key = "agent:main:slack:channel:789"
runner._session_model_overrides[key] = {"provider": "openrouter", "model": "gpt-4"}
event = _make_event() # no _moa_disable_after_turn
runner._restore_moa_one_shot(event, key)
assert runner._session_model_overrides[key] == {
"provider": "openrouter",
"model": "gpt-4",
}
runner._evict_cached_agent.assert_not_called()
def test_restore_runs_from_finally_even_when_turn_raises():
"""The whole point of the fix: a raising turn still reverts the override.
Mirrors the real call site the restore is invoked from a ``finally`` block,
so it fires after an exception propagates out of the turn body.
"""
runner = _make_runner()
key = "agent:main:telegram:dm:999"
runner._session_model_overrides[key] = {"provider": "moa", "model": "default"}
event = _make_event(
moa_disable=True,
moa_restore={"provider": "openrouter", "model": "gpt-4"},
)
with __import__("pytest").raises(RuntimeError):
try:
pass # _handle_message_with_agent succeeds
raise RuntimeError("provider error mid-turn")
finally:
if getattr(event, "_moa_disable_after_turn", False):
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
runner._session_model_overrides.pop(key, None)
else:
runner._session_model_overrides[key] = _restore
runner._restore_moa_one_shot(event, key)
assert runner._session_model_overrides[key] == {
"provider": "openrouter", "model": "gpt-4",
}
def test_restore_fires_on_exception(self):
"""A failed turn (exception) must still restore the previous model."""
runner = _make_runner()
key = "agent:main:telegram:dm:123"
runner._session_model_overrides[key] = {
"provider": "moa", "model": "default",
}
event = _make_event(
moa_disable=True,
moa_restore={"provider": "openrouter", "model": "gpt-4"},
)
# Simulate: try block raises, finally still runs
try:
raise RuntimeError("provider error")
except RuntimeError:
pass
finally:
if getattr(event, "_moa_disable_after_turn", False):
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
runner._session_model_overrides.pop(key, None)
else:
runner._session_model_overrides[key] = _restore
assert runner._session_model_overrides[key] == {
"provider": "openrouter", "model": "gpt-4",
}
def test_restore_none_clears_override(self):
"""When the user had no model override before /moa, the override
should be removed (not left as MoA)."""
runner = _make_runner()
key = "agent:main:discord:guild:456"
runner._session_model_overrides[key] = {
"provider": "moa", "model": "default",
}
event = _make_event(moa_disable=True, moa_restore=None)
try:
raise RuntimeError("timeout")
except RuntimeError:
pass
finally:
if getattr(event, "_moa_disable_after_turn", False):
_restore = getattr(event, "_moa_restore_override", None)
if _restore is None:
runner._session_model_overrides.pop(key, None)
else:
runner._session_model_overrides[key] = _restore
assert key not in runner._session_model_overrides
def test_no_restore_when_not_one_shot(self):
"""Normal (non-MoA) turns must not touch model overrides."""
runner = _make_runner()
key = "agent:main:slack:channel:789"
runner._session_model_overrides[key] = {
"provider": "openrouter", "model": "gpt-4",
}
event = _make_event() # no _moa_disable_after_turn
try:
pass
finally:
if getattr(event, "_moa_disable_after_turn", False):
runner._session_model_overrides.pop(key, None)
assert runner._session_model_overrides[key] == {
"provider": "openrouter", "model": "gpt-4",
}
assert runner._session_model_overrides[key] == {
"provider": "openrouter",
"model": "gpt-4",
}