From 99f3072aa06ac9a858ea5f1a753a801c15d76d5e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 21 Jun 2026 13:33:23 -0700 Subject: [PATCH] fix(model-switch): a failed in-place swap must be a no-op, not a dead session (#50375) When a /model switch resolves a valid model but the in-place agent swap fails mid-conversation (expired key, unreachable base_url), the agent rolls itself back to the old working model+client and re-raises. The callers caught that re-raise, logged a warning, then committed the broken switch anyway: wrote the failed model to the session DB, set _session_model_overrides to the broken model/provider/key, and (gateway direct path) evicted the working cached agent. The next message then rebuilt a dead agent from the broken override -> permanently unusable conversation (#50163). Fix the whole caller class so a failed swap aborts the commit entirely: - gateway/slash_commands.py (picker + direct /model paths): on swap failure, early-return an error message; skip DB persist, session override, cache eviction, and config write. - cli.py (both /model handlers): snapshot CLI-level credential/runtime fields before mutating, restore them on swap failure, and abort the note + success print. - tui_gateway/server.py: wrap the previously-unguarded swap; on failure raise a clean error and skip worker restart, runtime persist, switch marker, session model_override, and config persist. The no-cached-agent path (apply-on-next-session) is unaffected. Adds a gateway regression test that fails on the pre-fix behavior. --- cli.py | 49 +++++++++++++++++- gateway/slash_commands.py | 33 +++++++++++- .../test_model_command_expensive_confirm.py | 50 +++++++++++++++++++ tui_gateway/server.py | 28 ++++++++--- 4 files changed, 150 insertions(+), 10 deletions(-) diff --git a/cli.py b/cli.py index e15b54b6815..10846775fc2 100644 --- a/cli.py +++ b/cli.py @@ -7054,6 +7054,21 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin): logger.debug("preflight-compression switch warning failed: %s", exc) old_model = self.model + # Snapshot the CLI-level credential/runtime fields BEFORE mutating them + # so a failed in-place agent swap can roll the whole CLI back to the old + # working model. Otherwise the broken credentials staged below leak into + # the next turn's resolution even though the agent itself rolled back + # (#50163). + _cli_snapshot = { + "model": self.model, + "provider": self.provider, + "requested_provider": self.requested_provider, + "_explicit_api_key": getattr(self, "_explicit_api_key", None), + "_explicit_base_url": getattr(self, "_explicit_base_url", None), + "api_key": self.api_key, + "base_url": self.base_url, + "api_mode": self.api_mode, + } self.model = result.new_model self.provider = result.target_provider self.requested_provider = result.target_provider @@ -7079,7 +7094,17 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin): api_mode=result.api_mode, ) except Exception as exc: - _cprint(f" ⚠ Agent swap failed ({exc}); change applied to next session.") + # The agent rolled itself back to the old working model/client. + # Roll the CLI's own staged fields back too and abort the rest + # of the commit (note + success print) so a failed switch is a + # no-op rather than a dead session (#50163). + for _k, _v in _cli_snapshot.items(): + setattr(self, _k, _v) + _cprint( + f" ⚠ Model switch to {result.new_model} failed ({exc}); " + f"staying on {old_model}." + ) + return self._pending_model_switch_note = ( f"[Note: model was just switched from {old_model} to {result.new_model} " @@ -7340,6 +7365,18 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin): # Update requested_provider so _ensure_runtime_credentials() doesn't # overwrite the switch on the next turn (it re-resolves from this). old_model = self.model + # Snapshot CLI-level fields before mutation so a failed in-place swap + # rolls the whole CLI back to the old working model (#50163). + _cli_snapshot = { + "model": self.model, + "provider": self.provider, + "requested_provider": self.requested_provider, + "_explicit_api_key": getattr(self, "_explicit_api_key", None), + "_explicit_base_url": getattr(self, "_explicit_base_url", None), + "api_key": self.api_key, + "base_url": self.base_url, + "api_mode": self.api_mode, + } self.model = result.new_model self.provider = result.target_provider self.requested_provider = result.target_provider @@ -7366,7 +7403,15 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin): api_mode=result.api_mode, ) except Exception as exc: - _cprint(f" ⚠ Agent swap failed ({exc}); change applied to next session.") + # Agent rolled itself back; roll the CLI back too and abort so a + # failed switch is a no-op rather than a dead session (#50163). + for _k, _v in _cli_snapshot.items(): + setattr(self, _k, _v) + _cprint( + f" ⚠ Model switch to {result.new_model} failed ({exc}); " + f"staying on {old_model}." + ) + return # Store a note to prepend to the next user message so the model # knows a switch occurred (avoids injecting system messages mid-history diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py index e5baf8693b2..ca519413a07 100644 --- a/gateway/slash_commands.py +++ b/gateway/slash_commands.py @@ -1193,7 +1193,25 @@ class GatewaySlashCommandsMixin: api_mode=result.api_mode, ) except Exception as exc: - logger.warning("Picker model switch failed for cached agent: %s", exc) + # The in-place swap rolled the agent back to the + # OLD working model/client and re-raised. Abort + # the rest of the commit: do NOT persist the + # failed model to the DB, do NOT set a session + # override pointing at the broken model, and do + # NOT evict the working cached agent. Otherwise + # the next message rebuilds a dead agent from the + # broken override and the conversation is lost + # (#50163). A failed switch must be a no-op. + logger.warning( + "Picker model switch failed for cached agent: %s", exc + ) + return t( + "gateway.model.error_prefix", + error=( + f"Model switch to {result.new_model} failed ({exc}); " + f"staying on {_cur_model}." + ), + ) # Persist the new model to the session DB so the # dashboard shows the updated model (#34850). @@ -1399,7 +1417,20 @@ class GatewaySlashCommandsMixin: api_mode=result.api_mode, ) except Exception as exc: + # In-place swap rolled the agent back to the OLD working + # model/client and re-raised. Abort the commit: skip DB + # persist, session override, cache eviction, and config + # write so a failed switch is a no-op rather than a dead + # conversation (#50163). Without this early return the + # next message rebuilds a broken agent from the override. logger.warning("In-place model switch failed for cached agent: %s", exc) + return t( + "gateway.model.error_prefix", + error=( + f"Model switch to {result.new_model} failed ({exc}); " + f"staying on {current_model}." + ), + ) # Persist the new model to the session DB so the dashboard # shows the updated model (#34850). diff --git a/tests/gateway/test_model_command_expensive_confirm.py b/tests/gateway/test_model_command_expensive_confirm.py index c78ae3818af..e2ecc72678b 100644 --- a/tests/gateway/test_model_command_expensive_confirm.py +++ b/tests/gateway/test_model_command_expensive_confirm.py @@ -184,3 +184,53 @@ async def test_typed_model_cheap_switches_without_prompt(tmp_path, monkeypatch): assert "gpt-5.5-pro" in result overrides = list(runner._session_model_overrides.values()) assert len(overrides) == 1 + + +@pytest.mark.asyncio +async def test_failed_inplace_swap_aborts_commit(tmp_path, monkeypatch): + """A failed in-place agent swap must be a no-op, not a dead session. + + Regression for #50163: the resolution pipeline succeeds (valid model name) + but the cached agent's ``switch_model()`` raises mid-conversation (bad key / + unreachable URL). The agent rolls itself back to the old working model; the + gateway must NOT then commit the broken model as a session override or evict + the working cached agent — otherwise the next message rebuilds a dead agent + and the conversation is lost. + """ + _setup_isolated_home(tmp_path, monkeypatch, warn=False) + runner = _make_runner() + + # Working cached agent whose in-place swap fails (and rolls itself back). + class _FailingAgent: + def __init__(self): + self.model = "old-model" + self.provider = "openrouter" + + def switch_model(self, **kwargs): + # Mirrors agent_runtime_helpers.switch_model: the real method + # restores old state then re-raises. We keep model unchanged. + raise RuntimeError("connection refused: bad base_url") + + import threading + + agent = _FailingAgent() + runner._agent_cache = {} + runner._agent_cache_lock = threading.Lock() + session_key = runner._session_key_for_source(_make_event("/model x").source) + runner._agent_cache[session_key] = [agent, None] + runner._session_db = None + + evicted = [] + runner._evict_cached_agent = lambda sk: evicted.append(sk) + + result = await runner._handle_model_command(_make_event("/model openai/gpt-5.5-pro")) + + # Error surfaced to the user, not a success confirmation. + assert result is not None + assert "failed" in result.lower() + # The broken switch must NOT have been committed anywhere. + assert runner._session_model_overrides == {} + # The working cached agent must NOT have been evicted. + assert evicted == [] + # The agent stayed on its old model (rolled back). + assert agent.model == "old-model" diff --git a/tui_gateway/server.py b/tui_gateway/server.py index e822855db37..861e60bc743 100644 --- a/tui_gateway/server.py +++ b/tui_gateway/server.py @@ -2344,13 +2344,27 @@ def _apply_model_switch( } if agent: - agent.switch_model( - new_model=result.new_model, - new_provider=result.target_provider, - api_key=result.api_key, - base_url=result.base_url, - api_mode=result.api_mode, - ) + try: + agent.switch_model( + new_model=result.new_model, + new_provider=result.target_provider, + api_key=result.api_key, + base_url=result.base_url, + api_mode=result.api_mode, + ) + except Exception as exc: + # The in-place swap rolled the agent back to the old working + # model/client and re-raised. Abort the commit: do NOT restart the + # slash worker, persist runtime, append the switch marker, set a + # session model_override, or persist to config — all of which would + # otherwise leave the session pinned to a broken model and kill the + # conversation on the next turn (#50163). A failed switch is a + # no-op; surface a clean error to the client. + logger.warning("In-place model switch failed for TUI agent: %s", exc) + raise ValueError( + f"Model switch to {result.new_model} failed ({exc}); " + f"staying on {getattr(agent, 'model', current_model)}." + ) from exc _restart_slash_worker(sid, session) _persist_live_session_runtime(session) _persist_live_session_system_prompt(session)