fix(model-switch): a failed in-place swap must be a no-op, not a dead session (#50375)

When a /model switch resolves a valid model but the in-place agent swap
fails mid-conversation (expired key, unreachable base_url), the agent
rolls itself back to the old working model+client and re-raises. The
callers caught that re-raise, logged a warning, then committed the broken
switch anyway: wrote the failed model to the session DB, set
_session_model_overrides to the broken model/provider/key, and (gateway
direct path) evicted the working cached agent. The next message then
rebuilt a dead agent from the broken override -> permanently unusable
conversation (#50163).

Fix the whole caller class so a failed swap aborts the commit entirely:

- gateway/slash_commands.py (picker + direct /model paths): on swap
  failure, early-return an error message; skip DB persist, session
  override, cache eviction, and config write.
- cli.py (both /model handlers): snapshot CLI-level credential/runtime
  fields before mutating, restore them on swap failure, and abort the
  note + success print.
- tui_gateway/server.py: wrap the previously-unguarded swap; on failure
  raise a clean error and skip worker restart, runtime persist, switch
  marker, session model_override, and config persist.

The no-cached-agent path (apply-on-next-session) is unaffected.

Adds a gateway regression test that fails on the pre-fix behavior.
This commit is contained in:
Teknium 2026-06-21 13:33:23 -07:00 committed by GitHub
parent ed3d12a762
commit 99f3072aa0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 150 additions and 10 deletions

49
cli.py
View file

@ -7054,6 +7054,21 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
logger.debug("preflight-compression switch warning failed: %s", exc)
old_model = self.model
# Snapshot the CLI-level credential/runtime fields BEFORE mutating them
# so a failed in-place agent swap can roll the whole CLI back to the old
# working model. Otherwise the broken credentials staged below leak into
# the next turn's resolution even though the agent itself rolled back
# (#50163).
_cli_snapshot = {
"model": self.model,
"provider": self.provider,
"requested_provider": self.requested_provider,
"_explicit_api_key": getattr(self, "_explicit_api_key", None),
"_explicit_base_url": getattr(self, "_explicit_base_url", None),
"api_key": self.api_key,
"base_url": self.base_url,
"api_mode": self.api_mode,
}
self.model = result.new_model
self.provider = result.target_provider
self.requested_provider = result.target_provider
@ -7079,7 +7094,17 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
api_mode=result.api_mode,
)
except Exception as exc:
_cprint(f" ⚠ Agent swap failed ({exc}); change applied to next session.")
# The agent rolled itself back to the old working model/client.
# Roll the CLI's own staged fields back too and abort the rest
# of the commit (note + success print) so a failed switch is a
# no-op rather than a dead session (#50163).
for _k, _v in _cli_snapshot.items():
setattr(self, _k, _v)
_cprint(
f" ⚠ Model switch to {result.new_model} failed ({exc}); "
f"staying on {old_model}."
)
return
self._pending_model_switch_note = (
f"[Note: model was just switched from {old_model} to {result.new_model} "
@ -7340,6 +7365,18 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
# Update requested_provider so _ensure_runtime_credentials() doesn't
# overwrite the switch on the next turn (it re-resolves from this).
old_model = self.model
# Snapshot CLI-level fields before mutation so a failed in-place swap
# rolls the whole CLI back to the old working model (#50163).
_cli_snapshot = {
"model": self.model,
"provider": self.provider,
"requested_provider": self.requested_provider,
"_explicit_api_key": getattr(self, "_explicit_api_key", None),
"_explicit_base_url": getattr(self, "_explicit_base_url", None),
"api_key": self.api_key,
"base_url": self.base_url,
"api_mode": self.api_mode,
}
self.model = result.new_model
self.provider = result.target_provider
self.requested_provider = result.target_provider
@ -7366,7 +7403,15 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
api_mode=result.api_mode,
)
except Exception as exc:
_cprint(f" ⚠ Agent swap failed ({exc}); change applied to next session.")
# Agent rolled itself back; roll the CLI back too and abort so a
# failed switch is a no-op rather than a dead session (#50163).
for _k, _v in _cli_snapshot.items():
setattr(self, _k, _v)
_cprint(
f" ⚠ Model switch to {result.new_model} failed ({exc}); "
f"staying on {old_model}."
)
return
# Store a note to prepend to the next user message so the model
# knows a switch occurred (avoids injecting system messages mid-history

View file

@ -1193,7 +1193,25 @@ class GatewaySlashCommandsMixin:
api_mode=result.api_mode,
)
except Exception as exc:
logger.warning("Picker model switch failed for cached agent: %s", exc)
# The in-place swap rolled the agent back to the
# OLD working model/client and re-raised. Abort
# the rest of the commit: do NOT persist the
# failed model to the DB, do NOT set a session
# override pointing at the broken model, and do
# NOT evict the working cached agent. Otherwise
# the next message rebuilds a dead agent from the
# broken override and the conversation is lost
# (#50163). A failed switch must be a no-op.
logger.warning(
"Picker model switch failed for cached agent: %s", exc
)
return t(
"gateway.model.error_prefix",
error=(
f"Model switch to {result.new_model} failed ({exc}); "
f"staying on {_cur_model}."
),
)
# Persist the new model to the session DB so the
# dashboard shows the updated model (#34850).
@ -1399,7 +1417,20 @@ class GatewaySlashCommandsMixin:
api_mode=result.api_mode,
)
except Exception as exc:
# In-place swap rolled the agent back to the OLD working
# model/client and re-raised. Abort the commit: skip DB
# persist, session override, cache eviction, and config
# write so a failed switch is a no-op rather than a dead
# conversation (#50163). Without this early return the
# next message rebuilds a broken agent from the override.
logger.warning("In-place model switch failed for cached agent: %s", exc)
return t(
"gateway.model.error_prefix",
error=(
f"Model switch to {result.new_model} failed ({exc}); "
f"staying on {current_model}."
),
)
# Persist the new model to the session DB so the dashboard
# shows the updated model (#34850).

View file

@ -184,3 +184,53 @@ async def test_typed_model_cheap_switches_without_prompt(tmp_path, monkeypatch):
assert "gpt-5.5-pro" in result
overrides = list(runner._session_model_overrides.values())
assert len(overrides) == 1
@pytest.mark.asyncio
async def test_failed_inplace_swap_aborts_commit(tmp_path, monkeypatch):
"""A failed in-place agent swap must be a no-op, not a dead session.
Regression for #50163: the resolution pipeline succeeds (valid model name)
but the cached agent's ``switch_model()`` raises mid-conversation (bad key /
unreachable URL). The agent rolls itself back to the old working model; the
gateway must NOT then commit the broken model as a session override or evict
the working cached agent otherwise the next message rebuilds a dead agent
and the conversation is lost.
"""
_setup_isolated_home(tmp_path, monkeypatch, warn=False)
runner = _make_runner()
# Working cached agent whose in-place swap fails (and rolls itself back).
class _FailingAgent:
def __init__(self):
self.model = "old-model"
self.provider = "openrouter"
def switch_model(self, **kwargs):
# Mirrors agent_runtime_helpers.switch_model: the real method
# restores old state then re-raises. We keep model unchanged.
raise RuntimeError("connection refused: bad base_url")
import threading
agent = _FailingAgent()
runner._agent_cache = {}
runner._agent_cache_lock = threading.Lock()
session_key = runner._session_key_for_source(_make_event("/model x").source)
runner._agent_cache[session_key] = [agent, None]
runner._session_db = None
evicted = []
runner._evict_cached_agent = lambda sk: evicted.append(sk)
result = await runner._handle_model_command(_make_event("/model openai/gpt-5.5-pro"))
# Error surfaced to the user, not a success confirmation.
assert result is not None
assert "failed" in result.lower()
# The broken switch must NOT have been committed anywhere.
assert runner._session_model_overrides == {}
# The working cached agent must NOT have been evicted.
assert evicted == []
# The agent stayed on its old model (rolled back).
assert agent.model == "old-model"

View file

@ -2344,13 +2344,27 @@ def _apply_model_switch(
}
if agent:
agent.switch_model(
new_model=result.new_model,
new_provider=result.target_provider,
api_key=result.api_key,
base_url=result.base_url,
api_mode=result.api_mode,
)
try:
agent.switch_model(
new_model=result.new_model,
new_provider=result.target_provider,
api_key=result.api_key,
base_url=result.base_url,
api_mode=result.api_mode,
)
except Exception as exc:
# The in-place swap rolled the agent back to the old working
# model/client and re-raised. Abort the commit: do NOT restart the
# slash worker, persist runtime, append the switch marker, set a
# session model_override, or persist to config — all of which would
# otherwise leave the session pinned to a broken model and kill the
# conversation on the next turn (#50163). A failed switch is a
# no-op; surface a clean error to the client.
logger.warning("In-place model switch failed for TUI agent: %s", exc)
raise ValueError(
f"Model switch to {result.new_model} failed ({exc}); "
f"staying on {getattr(agent, 'model', current_model)}."
) from exc
_restart_slash_worker(sid, session)
_persist_live_session_runtime(session)
_persist_live_session_system_prompt(session)