fix(model-switch): a failed in-place swap must be a no-op, not a dead session (#50375)

When a /model switch resolves a valid model but the in-place agent swap fails mid-conversation (expired key, unreachable base_url), the agent rolls itself back to the old working model+client and re-raises. The callers caught that re-raise, logged a warning, and then committed the broken switch anyway: they wrote the failed model to the session DB, set _session_model_overrides to the broken model/provider/key, and (on the gateway direct path) evicted the working cached agent. The next message then rebuilt a dead agent from the broken override, leaving the conversation permanently unusable (#50163).

Fix the whole class of callers so that a failed swap aborts the commit entirely:

- gateway/slash_commands.py (picker + direct /model paths): on swap failure, early-return an error message; skip DB persist, session override, cache eviction, and config write.
- cli.py (both /model handlers): snapshot CLI-level credential/runtime fields before mutating, restore them on swap failure, and abort the note + success print.
- tui_gateway/server.py: wrap the previously-unguarded swap; on failure raise a clean error and skip worker restart, runtime persist, switch marker, session model_override, and config persist.

The no-cached-agent path (apply-on-next-session) is unaffected. Adds a gateway regression test that fails on the pre-fix behavior.
2026-06-23 10:42:00 +00:00 · 2026-06-21 13:33:23 -07:00 · 2026-06-21 13:33:23 -07:00 · 99f3072aa0
commit 99f3072aa0
parent ed3d12a762
4 changed files with 150 additions and 10 deletions
--- a/cli.py
+++ b/cli.py
@ -7054,6 +7054,21 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
                logger.debug("preflight-compression switch warning failed: %s", exc)

        old_model = self.model
+        # Snapshot the CLI-level credential/runtime fields BEFORE mutating them
+        # so a failed in-place agent swap can roll the whole CLI back to the old
+        # working model.  Otherwise the broken credentials staged below leak into
+        # the next turn's resolution even though the agent itself rolled back
+        # (#50163).
+        _cli_snapshot = {
+            "model": self.model,
+            "provider": self.provider,
+            "requested_provider": self.requested_provider,
+            "_explicit_api_key": getattr(self, "_explicit_api_key", None),
+            "_explicit_base_url": getattr(self, "_explicit_base_url", None),
+            "api_key": self.api_key,
+            "base_url": self.base_url,
+            "api_mode": self.api_mode,
+        }
        self.model = result.new_model
        self.provider = result.target_provider
        self.requested_provider = result.target_provider
@ -7079,7 +7094,17 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
                    api_mode=result.api_mode,
                )
            except Exception as exc:
-                _cprint(f"  ⚠ Agent swap failed ({exc}); change applied to next session.")
+                # The agent rolled itself back to the old working model/client.
+                # Roll the CLI's own staged fields back too and abort the rest
+                # of the commit (note + success print) so a failed switch is a
+                # no-op rather than a dead session (#50163).
+                for _k, _v in _cli_snapshot.items():
+                    setattr(self, _k, _v)
+                _cprint(
+                    f"  ⚠ Model switch to {result.new_model} failed ({exc}); "
+                    f"staying on {old_model}."
+                )
+                return

        self._pending_model_switch_note = (
            f"[Note: model was just switched from {old_model} to {result.new_model} "
@ -7340,6 +7365,18 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
        # Update requested_provider so _ensure_runtime_credentials() doesn't
        # overwrite the switch on the next turn (it re-resolves from this).
        old_model = self.model
+        # Snapshot CLI-level fields before mutation so a failed in-place swap
+        # rolls the whole CLI back to the old working model (#50163).
+        _cli_snapshot = {
+            "model": self.model,
+            "provider": self.provider,
+            "requested_provider": self.requested_provider,
+            "_explicit_api_key": getattr(self, "_explicit_api_key", None),
+            "_explicit_base_url": getattr(self, "_explicit_base_url", None),
+            "api_key": self.api_key,
+            "base_url": self.base_url,
+            "api_mode": self.api_mode,
+        }
        self.model = result.new_model
        self.provider = result.target_provider
        self.requested_provider = result.target_provider
@ -7366,7 +7403,15 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
                    api_mode=result.api_mode,
                )
            except Exception as exc:
-                _cprint(f"  ⚠ Agent swap failed ({exc}); change applied to next session.")
+                # Agent rolled itself back; roll the CLI back too and abort so a
+                # failed switch is a no-op rather than a dead session (#50163).
+                for _k, _v in _cli_snapshot.items():
+                    setattr(self, _k, _v)
+                _cprint(
+                    f"  ⚠ Model switch to {result.new_model} failed ({exc}); "
+                    f"staying on {old_model}."
+                )
+                return

        # Store a note to prepend to the next user message so the model
        # knows a switch occurred (avoids injecting system messages mid-history
--- a/gateway/slash_commands.py
+++ b/gateway/slash_commands.py
@ -1193,7 +1193,25 @@ class GatewaySlashCommandsMixin:
                                    api_mode=result.api_mode,
                                )
                            except Exception as exc:
-                                logger.warning("Picker model switch failed for cached agent: %s", exc)
+                                # The in-place swap rolled the agent back to the
+                                # OLD working model/client and re-raised.  Abort
+                                # the rest of the commit: do NOT persist the
+                                # failed model to the DB, do NOT set a session
+                                # override pointing at the broken model, and do
+                                # NOT evict the working cached agent.  Otherwise
+                                # the next message rebuilds a dead agent from the
+                                # broken override and the conversation is lost
+                                # (#50163).  A failed switch must be a no-op.
+                                logger.warning(
+                                    "Picker model switch failed for cached agent: %s", exc
+                                )
+                                return t(
+                                    "gateway.model.error_prefix",
+                                    error=(
+                                        f"Model switch to {result.new_model} failed ({exc}); "
+                                        f"staying on {_cur_model}."
+                                    ),
+                                )

                        # Persist the new model to the session DB so the
                        # dashboard shows the updated model (#34850).
@ -1399,7 +1417,20 @@ class GatewaySlashCommandsMixin:
                        api_mode=result.api_mode,
                    )
                except Exception as exc:
+                    # In-place swap rolled the agent back to the OLD working
+                    # model/client and re-raised.  Abort the commit: skip DB
+                    # persist, session override, cache eviction, and config
+                    # write so a failed switch is a no-op rather than a dead
+                    # conversation (#50163).  Without this early return the
+                    # next message rebuilds a broken agent from the override.
                    logger.warning("In-place model switch failed for cached agent: %s", exc)
+                    return t(
+                        "gateway.model.error_prefix",
+                        error=(
+                            f"Model switch to {result.new_model} failed ({exc}); "
+                            f"staying on {current_model}."
+                        ),
+                    )

            # Persist the new model to the session DB so the dashboard
            # shows the updated model (#34850).
--- a/tests/gateway/test_model_command_expensive_confirm.py
+++ b/tests/gateway/test_model_command_expensive_confirm.py
@ -184,3 +184,53 @@ async def test_typed_model_cheap_switches_without_prompt(tmp_path, monkeypatch):
    assert "gpt-5.5-pro" in result
    overrides = list(runner._session_model_overrides.values())
    assert len(overrides) == 1
+
+
+@pytest.mark.asyncio
+async def test_failed_inplace_swap_aborts_commit(tmp_path, monkeypatch):
+    """A failed in-place agent swap must be a no-op, not a dead session.
+
+    Regression for #50163: the resolution pipeline succeeds (valid model name)
+    but the cached agent's ``switch_model()`` raises mid-conversation (bad key /
+    unreachable URL). The agent rolls itself back to the old working model; the
+    gateway must NOT then commit the broken model as a session override or evict
+    the working cached agent — otherwise the next message rebuilds a dead agent
+    and the conversation is lost.
+    """
+    _setup_isolated_home(tmp_path, monkeypatch, warn=False)
+    runner = _make_runner()
+
+    # Stub cached agent whose in-place swap always raises and never mutates state.
+    class _FailingAgent:
+        def __init__(self):
+            self.model = "old-model"
+            self.provider = "openrouter"
+
+        def switch_model(self, **kwargs):
+            # Stands in for agent_runtime_helpers.switch_model, which is said to
+            # restore old state then re-raise; this stub never touches self.model.
+            raise RuntimeError("connection refused: bad base_url")
+
+    import threading
+
+    agent = _FailingAgent()
+    runner._agent_cache = {}
+    runner._agent_cache_lock = threading.Lock()
+    session_key = runner._session_key_for_source(_make_event("/model x").source)
+    runner._agent_cache[session_key] = [agent, None]
+    runner._session_db = None  # presumably skips DB persistence here — confirm
+
+    evicted = []
+    runner._evict_cached_agent = lambda sk: evicted.append(sk)  # record, don't evict
+
+    result = await runner._handle_model_command(_make_event("/model openai/gpt-5.5-pro"))
+
+    # Error surfaced to the user, not a success confirmation.
+    assert result is not None
+    assert "failed" in result.lower()
+    # The broken switch must NOT have been recorded as a session override.
+    assert runner._session_model_overrides == {}
+    # The working cached agent must NOT have been evicted.
+    assert evicted == []
+    # The agent still reports its old model (the stub never changed it).
+    assert agent.model == "old-model"
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -2344,13 +2344,27 @@ def _apply_model_switch(
            }

    if agent:
-        agent.switch_model(
-            new_model=result.new_model,
-            new_provider=result.target_provider,
-            api_key=result.api_key,
-            base_url=result.base_url,
-            api_mode=result.api_mode,
-        )
+        try:
+            agent.switch_model(
+                new_model=result.new_model,
+                new_provider=result.target_provider,
+                api_key=result.api_key,
+                base_url=result.base_url,
+                api_mode=result.api_mode,
+            )
+        except Exception as exc:
+            # The in-place swap rolled the agent back to the old working
+            # model/client and re-raised.  Abort the commit: do NOT restart the
+            # slash worker, persist runtime, append the switch marker, set a
+            # session model_override, or persist to config — all of which would
+            # otherwise leave the session pinned to a broken model and kill the
+            # conversation on the next turn (#50163).  A failed switch is a
+            # no-op; surface a clean error to the client.
+            logger.warning("In-place model switch failed for TUI agent: %s", exc)
+            raise ValueError(
+                f"Model switch to {result.new_model} failed ({exc}); "
+                f"staying on {getattr(agent, 'model', current_model)}."
+            ) from exc
        _restart_slash_worker(sid, session)
        _persist_live_session_runtime(session)
        _persist_live_session_system_prompt(session)