From 99f3072aa06ac9a858ea5f1a753a801c15d76d5e Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sun, 21 Jun 2026 13:33:23 -0700
Subject: [PATCH] fix(model-switch): a failed in-place swap must be a no-op,
 not a dead session (#50375)

When a /model switch resolves a valid model but the in-place agent swap
fails mid-conversation (expired key, unreachable base_url), the agent
rolls itself back to the old working model+client and re-raises. The
callers caught that re-raise, logged a warning, then committed the broken
switch anyway: wrote the failed model to the session DB, set
_session_model_overrides to the broken model/provider/key, and (gateway
direct path) evicted the working cached agent. The next message then
rebuilt a dead agent from the broken override -> permanently unusable
conversation (#50163).

Fix the whole caller class so a failed swap aborts the commit entirely:

- gateway/slash_commands.py (picker + direct /model paths): on swap
  failure, early-return an error message; skip DB persist, session
  override, cache eviction, and config write.
- cli.py (both /model handlers): snapshot CLI-level credential/runtime
  fields before mutating, restore them on swap failure, and abort the
  note + success print.
- tui_gateway/server.py: wrap the previously-unguarded swap; on failure
  raise a clean error and skip worker restart, runtime persist, switch
  marker, session model_override, and config persist.

The no-cached-agent path (apply-on-next-session) is unaffected.

Adds a gateway regression test that fails on the pre-fix behavior.
---
 cli.py                                        | 49 +++++++++++++++++-
 gateway/slash_commands.py                     | 33 +++++++++++-
 .../test_model_command_expensive_confirm.py   | 50 +++++++++++++++++++
 tui_gateway/server.py                         | 28 ++++++++---
 4 files changed, 150 insertions(+), 10 deletions(-)

diff --git a/cli.py b/cli.py
index e15b54b6815..10846775fc2 100644
--- a/cli.py
+++ b/cli.py
@@ -7054,6 +7054,21 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
                 logger.debug("preflight-compression switch warning failed: %s", exc)
 
         old_model = self.model
+        # Snapshot the CLI-level credential/runtime fields BEFORE mutating them
+        # so a failed in-place agent swap can roll the whole CLI back to the old
+        # working model.  Otherwise the broken credentials staged below leak into
+        # the next turn's resolution even though the agent itself rolled back
+        # (#50163).
+        _cli_snapshot = {
+            "model": self.model,
+            "provider": self.provider,
+            "requested_provider": self.requested_provider,
+            "_explicit_api_key": getattr(self, "_explicit_api_key", None),
+            "_explicit_base_url": getattr(self, "_explicit_base_url", None),
+            "api_key": self.api_key,
+            "base_url": self.base_url,
+            "api_mode": self.api_mode,
+        }
         self.model = result.new_model
         self.provider = result.target_provider
         self.requested_provider = result.target_provider
@@ -7079,7 +7094,17 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
                     api_mode=result.api_mode,
                 )
             except Exception as exc:
-                _cprint(f"  ⚠ Agent swap failed ({exc}); change applied to next session.")
+                # The agent rolled itself back to the old working model/client.
+                # Roll the CLI's own staged fields back too and abort the rest
+                # of the commit (note + success print) so a failed switch is a
+                # no-op rather than a dead session (#50163).
+                for _k, _v in _cli_snapshot.items():
+                    setattr(self, _k, _v)
+                _cprint(
+                    f"  ⚠ Model switch to {result.new_model} failed ({exc}); "
+                    f"staying on {old_model}."
+                )
+                return
 
         self._pending_model_switch_note = (
             f"[Note: model was just switched from {old_model} to {result.new_model} "
@@ -7340,6 +7365,18 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
         # Update requested_provider so _ensure_runtime_credentials() doesn't
         # overwrite the switch on the next turn (it re-resolves from this).
         old_model = self.model
+        # Snapshot CLI-level fields before mutation so a failed in-place swap
+        # rolls the whole CLI back to the old working model (#50163).
+        _cli_snapshot = {
+            "model": self.model,
+            "provider": self.provider,
+            "requested_provider": self.requested_provider,
+            "_explicit_api_key": getattr(self, "_explicit_api_key", None),
+            "_explicit_base_url": getattr(self, "_explicit_base_url", None),
+            "api_key": self.api_key,
+            "base_url": self.base_url,
+            "api_mode": self.api_mode,
+        }
         self.model = result.new_model
         self.provider = result.target_provider
         self.requested_provider = result.target_provider
@@ -7366,7 +7403,15 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
                     api_mode=result.api_mode,
                 )
             except Exception as exc:
-                _cprint(f"  ⚠ Agent swap failed ({exc}); change applied to next session.")
+                # Agent rolled itself back; roll the CLI back too and abort so a
+                # failed switch is a no-op rather than a dead session (#50163).
+                for _k, _v in _cli_snapshot.items():
+                    setattr(self, _k, _v)
+                _cprint(
+                    f"  ⚠ Model switch to {result.new_model} failed ({exc}); "
+                    f"staying on {old_model}."
+                )
+                return
 
         # Store a note to prepend to the next user message so the model
         # knows a switch occurred (avoids injecting system messages mid-history
diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py
index e5baf8693b2..ca519413a07 100644
--- a/gateway/slash_commands.py
+++ b/gateway/slash_commands.py
@@ -1193,7 +1193,25 @@ class GatewaySlashCommandsMixin:
                                     api_mode=result.api_mode,
                                 )
                             except Exception as exc:
-                                logger.warning("Picker model switch failed for cached agent: %s", exc)
+                                # The in-place swap rolled the agent back to the
+                                # OLD working model/client and re-raised.  Abort
+                                # the rest of the commit: do NOT persist the
+                                # failed model to the DB, do NOT set a session
+                                # override pointing at the broken model, and do
+                                # NOT evict the working cached agent.  Otherwise
+                                # the next message rebuilds a dead agent from the
+                                # broken override and the conversation is lost
+                                # (#50163).  A failed switch must be a no-op.
+                                logger.warning(
+                                    "Picker model switch failed for cached agent: %s", exc
+                                )
+                                return t(
+                                    "gateway.model.error_prefix",
+                                    error=(
+                                        f"Model switch to {result.new_model} failed ({exc}); "
+                                        f"staying on {_cur_model}."
+                                    ),
+                                )
 
                         # Persist the new model to the session DB so the
                         # dashboard shows the updated model (#34850).
@@ -1399,7 +1417,20 @@ class GatewaySlashCommandsMixin:
                         api_mode=result.api_mode,
                     )
                 except Exception as exc:
+                    # In-place swap rolled the agent back to the OLD working
+                    # model/client and re-raised.  Abort the commit: skip DB
+                    # persist, session override, cache eviction, and config
+                    # write so a failed switch is a no-op rather than a dead
+                    # conversation (#50163).  Without this early return the
+                    # next message rebuilds a broken agent from the override.
                     logger.warning("In-place model switch failed for cached agent: %s", exc)
+                    return t(
+                        "gateway.model.error_prefix",
+                        error=(
+                            f"Model switch to {result.new_model} failed ({exc}); "
+                            f"staying on {current_model}."
+                        ),
+                    )
 
             # Persist the new model to the session DB so the dashboard
             # shows the updated model (#34850).
diff --git a/tests/gateway/test_model_command_expensive_confirm.py b/tests/gateway/test_model_command_expensive_confirm.py
index c78ae3818af..e2ecc72678b 100644
--- a/tests/gateway/test_model_command_expensive_confirm.py
+++ b/tests/gateway/test_model_command_expensive_confirm.py
@@ -184,3 +184,53 @@ async def test_typed_model_cheap_switches_without_prompt(tmp_path, monkeypatch):
     assert "gpt-5.5-pro" in result
     overrides = list(runner._session_model_overrides.values())
     assert len(overrides) == 1
+
+
+@pytest.mark.asyncio
+async def test_failed_inplace_swap_aborts_commit(tmp_path, monkeypatch):
+    """A failed in-place agent swap must be a no-op, not a dead session.
+
+    Regression for #50163: the resolution pipeline succeeds (valid model name)
+    but the cached agent's ``switch_model()`` raises mid-conversation (bad key /
+    unreachable URL). The agent rolls itself back to the old working model; the
+    gateway must NOT then commit the broken model as a session override or evict
+    the working cached agent — otherwise the next message rebuilds a dead agent
+    and the conversation is lost.
+    """
+    _setup_isolated_home(tmp_path, monkeypatch, warn=False)
+    runner = _make_runner()
+
+    # Working cached agent whose in-place swap fails (and rolls itself back).
+    class _FailingAgent:
+        def __init__(self):
+            self.model = "old-model"
+            self.provider = "openrouter"
+
+        def switch_model(self, **kwargs):
+            # Mirrors agent_runtime_helpers.switch_model: the real method
+            # restores old state then re-raises. We keep model unchanged.
+            raise RuntimeError("connection refused: bad base_url")
+
+    import threading
+
+    agent = _FailingAgent()
+    runner._agent_cache = {}
+    runner._agent_cache_lock = threading.Lock()
+    session_key = runner._session_key_for_source(_make_event("/model x").source)
+    runner._agent_cache[session_key] = [agent, None]
+    runner._session_db = None
+
+    evicted = []
+    runner._evict_cached_agent = lambda sk: evicted.append(sk)
+
+    result = await runner._handle_model_command(_make_event("/model openai/gpt-5.5-pro"))
+
+    # Error surfaced to the user, not a success confirmation.
+    assert result is not None
+    assert "failed" in result.lower()
+    # The broken switch must NOT have been committed anywhere.
+    assert runner._session_model_overrides == {}
+    # The working cached agent must NOT have been evicted.
+    assert evicted == []
+    # The agent stayed on its old model (rolled back).
+    assert agent.model == "old-model"
diff --git a/tui_gateway/server.py b/tui_gateway/server.py
index e822855db37..861e60bc743 100644
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@@ -2344,13 +2344,27 @@ def _apply_model_switch(
             }
 
     if agent:
-        agent.switch_model(
-            new_model=result.new_model,
-            new_provider=result.target_provider,
-            api_key=result.api_key,
-            base_url=result.base_url,
-            api_mode=result.api_mode,
-        )
+        try:
+            agent.switch_model(
+                new_model=result.new_model,
+                new_provider=result.target_provider,
+                api_key=result.api_key,
+                base_url=result.base_url,
+                api_mode=result.api_mode,
+            )
+        except Exception as exc:
+            # The in-place swap rolled the agent back to the old working
+            # model/client and re-raised.  Abort the commit: do NOT restart the
+            # slash worker, persist runtime, append the switch marker, set a
+            # session model_override, or persist to config — all of which would
+            # otherwise leave the session pinned to a broken model and kill the
+            # conversation on the next turn (#50163).  A failed switch is a
+            # no-op; surface a clean error to the client.
+            logger.warning("In-place model switch failed for TUI agent: %s", exc)
+            raise ValueError(
+                f"Model switch to {result.new_model} failed ({exc}); "
+                f"staying on {getattr(agent, 'model', current_model)}."
+            ) from exc
         _restart_slash_worker(sid, session)
         _persist_live_session_runtime(session)
         _persist_live_session_system_prompt(session)