mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(model-switch): a failed in-place swap must be a no-op, not a dead session (#50375)
When a /model switch resolves a valid model but the in-place agent swap fails mid-conversation (expired key, unreachable base_url), the agent rolls itself back to the old working model+client and re-raises. The callers caught that re-raise, logged a warning, then committed the broken switch anyway: wrote the failed model to the session DB, set _session_model_overrides to the broken model/provider/key, and (gateway direct path) evicted the working cached agent. The next message then rebuilt a dead agent from the broken override -> permanently unusable conversation (#50163). Fix the whole caller class so a failed swap aborts the commit entirely: - gateway/slash_commands.py (picker + direct /model paths): on swap failure, early-return an error message; skip DB persist, session override, cache eviction, and config write. - cli.py (both /model handlers): snapshot CLI-level credential/runtime fields before mutating, restore them on swap failure, and abort the note + success print. - tui_gateway/server.py: wrap the previously-unguarded swap; on failure raise a clean error and skip worker restart, runtime persist, switch marker, session model_override, and config persist. The no-cached-agent path (apply-on-next-session) is unaffected. Adds a gateway regression test that fails on the pre-fix behavior.
This commit is contained in:
parent
ed3d12a762
commit
99f3072aa0
4 changed files with 150 additions and 10 deletions
49
cli.py
49
cli.py
|
|
@ -7054,6 +7054,21 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
|
|||
logger.debug("preflight-compression switch warning failed: %s", exc)
|
||||
|
||||
old_model = self.model
|
||||
# Snapshot the CLI-level credential/runtime fields BEFORE mutating them
|
||||
# so a failed in-place agent swap can roll the whole CLI back to the old
|
||||
# working model. Otherwise the broken credentials staged below leak into
|
||||
# the next turn's resolution even though the agent itself rolled back
|
||||
# (#50163).
|
||||
_cli_snapshot = {
|
||||
"model": self.model,
|
||||
"provider": self.provider,
|
||||
"requested_provider": self.requested_provider,
|
||||
"_explicit_api_key": getattr(self, "_explicit_api_key", None),
|
||||
"_explicit_base_url": getattr(self, "_explicit_base_url", None),
|
||||
"api_key": self.api_key,
|
||||
"base_url": self.base_url,
|
||||
"api_mode": self.api_mode,
|
||||
}
|
||||
self.model = result.new_model
|
||||
self.provider = result.target_provider
|
||||
self.requested_provider = result.target_provider
|
||||
|
|
@ -7079,7 +7094,17 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
|
|||
api_mode=result.api_mode,
|
||||
)
|
||||
except Exception as exc:
|
||||
_cprint(f" ⚠ Agent swap failed ({exc}); change applied to next session.")
|
||||
# The agent rolled itself back to the old working model/client.
|
||||
# Roll the CLI's own staged fields back too and abort the rest
|
||||
# of the commit (note + success print) so a failed switch is a
|
||||
# no-op rather than a dead session (#50163).
|
||||
for _k, _v in _cli_snapshot.items():
|
||||
setattr(self, _k, _v)
|
||||
_cprint(
|
||||
f" ⚠ Model switch to {result.new_model} failed ({exc}); "
|
||||
f"staying on {old_model}."
|
||||
)
|
||||
return
|
||||
|
||||
self._pending_model_switch_note = (
|
||||
f"[Note: model was just switched from {old_model} to {result.new_model} "
|
||||
|
|
@ -7340,6 +7365,18 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
|
|||
# Update requested_provider so _ensure_runtime_credentials() doesn't
|
||||
# overwrite the switch on the next turn (it re-resolves from this).
|
||||
old_model = self.model
|
||||
# Snapshot CLI-level fields before mutation so a failed in-place swap
|
||||
# rolls the whole CLI back to the old working model (#50163).
|
||||
_cli_snapshot = {
|
||||
"model": self.model,
|
||||
"provider": self.provider,
|
||||
"requested_provider": self.requested_provider,
|
||||
"_explicit_api_key": getattr(self, "_explicit_api_key", None),
|
||||
"_explicit_base_url": getattr(self, "_explicit_base_url", None),
|
||||
"api_key": self.api_key,
|
||||
"base_url": self.base_url,
|
||||
"api_mode": self.api_mode,
|
||||
}
|
||||
self.model = result.new_model
|
||||
self.provider = result.target_provider
|
||||
self.requested_provider = result.target_provider
|
||||
|
|
@ -7366,7 +7403,15 @@ class HermesCLI(CLIAgentSetupMixin, CLICommandsMixin):
|
|||
api_mode=result.api_mode,
|
||||
)
|
||||
except Exception as exc:
|
||||
_cprint(f" ⚠ Agent swap failed ({exc}); change applied to next session.")
|
||||
# Agent rolled itself back; roll the CLI back too and abort so a
|
||||
# failed switch is a no-op rather than a dead session (#50163).
|
||||
for _k, _v in _cli_snapshot.items():
|
||||
setattr(self, _k, _v)
|
||||
_cprint(
|
||||
f" ⚠ Model switch to {result.new_model} failed ({exc}); "
|
||||
f"staying on {old_model}."
|
||||
)
|
||||
return
|
||||
|
||||
# Store a note to prepend to the next user message so the model
|
||||
# knows a switch occurred (avoids injecting system messages mid-history
|
||||
|
|
|
|||
|
|
@ -1193,7 +1193,25 @@ class GatewaySlashCommandsMixin:
|
|||
api_mode=result.api_mode,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning("Picker model switch failed for cached agent: %s", exc)
|
||||
# The in-place swap rolled the agent back to the
|
||||
# OLD working model/client and re-raised. Abort
|
||||
# the rest of the commit: do NOT persist the
|
||||
# failed model to the DB, do NOT set a session
|
||||
# override pointing at the broken model, and do
|
||||
# NOT evict the working cached agent. Otherwise
|
||||
# the next message rebuilds a dead agent from the
|
||||
# broken override and the conversation is lost
|
||||
# (#50163). A failed switch must be a no-op.
|
||||
logger.warning(
|
||||
"Picker model switch failed for cached agent: %s", exc
|
||||
)
|
||||
return t(
|
||||
"gateway.model.error_prefix",
|
||||
error=(
|
||||
f"Model switch to {result.new_model} failed ({exc}); "
|
||||
f"staying on {_cur_model}."
|
||||
),
|
||||
)
|
||||
|
||||
# Persist the new model to the session DB so the
|
||||
# dashboard shows the updated model (#34850).
|
||||
|
|
@ -1399,7 +1417,20 @@ class GatewaySlashCommandsMixin:
|
|||
api_mode=result.api_mode,
|
||||
)
|
||||
except Exception as exc:
|
||||
# In-place swap rolled the agent back to the OLD working
|
||||
# model/client and re-raised. Abort the commit: skip DB
|
||||
# persist, session override, cache eviction, and config
|
||||
# write so a failed switch is a no-op rather than a dead
|
||||
# conversation (#50163). Without this early return the
|
||||
# next message rebuilds a broken agent from the override.
|
||||
logger.warning("In-place model switch failed for cached agent: %s", exc)
|
||||
return t(
|
||||
"gateway.model.error_prefix",
|
||||
error=(
|
||||
f"Model switch to {result.new_model} failed ({exc}); "
|
||||
f"staying on {current_model}."
|
||||
),
|
||||
)
|
||||
|
||||
# Persist the new model to the session DB so the dashboard
|
||||
# shows the updated model (#34850).
|
||||
|
|
|
|||
|
|
@ -184,3 +184,53 @@ async def test_typed_model_cheap_switches_without_prompt(tmp_path, monkeypatch):
|
|||
assert "gpt-5.5-pro" in result
|
||||
overrides = list(runner._session_model_overrides.values())
|
||||
assert len(overrides) == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_failed_inplace_swap_aborts_commit(tmp_path, monkeypatch):
|
||||
"""A failed in-place agent swap must be a no-op, not a dead session.
|
||||
|
||||
Regression for #50163: the resolution pipeline succeeds (valid model name)
|
||||
but the cached agent's ``switch_model()`` raises mid-conversation (bad key /
|
||||
unreachable URL). The agent rolls itself back to the old working model; the
|
||||
gateway must NOT then commit the broken model as a session override or evict
|
||||
the working cached agent — otherwise the next message rebuilds a dead agent
|
||||
and the conversation is lost.
|
||||
"""
|
||||
_setup_isolated_home(tmp_path, monkeypatch, warn=False)
|
||||
runner = _make_runner()
|
||||
|
||||
# Working cached agent whose in-place swap fails (and rolls itself back).
|
||||
class _FailingAgent:
|
||||
def __init__(self):
|
||||
self.model = "old-model"
|
||||
self.provider = "openrouter"
|
||||
|
||||
def switch_model(self, **kwargs):
|
||||
# Mirrors agent_runtime_helpers.switch_model: the real method
|
||||
# restores old state then re-raises. We keep model unchanged.
|
||||
raise RuntimeError("connection refused: bad base_url")
|
||||
|
||||
import threading
|
||||
|
||||
agent = _FailingAgent()
|
||||
runner._agent_cache = {}
|
||||
runner._agent_cache_lock = threading.Lock()
|
||||
session_key = runner._session_key_for_source(_make_event("/model x").source)
|
||||
runner._agent_cache[session_key] = [agent, None]
|
||||
runner._session_db = None
|
||||
|
||||
evicted = []
|
||||
runner._evict_cached_agent = lambda sk: evicted.append(sk)
|
||||
|
||||
result = await runner._handle_model_command(_make_event("/model openai/gpt-5.5-pro"))
|
||||
|
||||
# Error surfaced to the user, not a success confirmation.
|
||||
assert result is not None
|
||||
assert "failed" in result.lower()
|
||||
# The broken switch must NOT have been committed anywhere.
|
||||
assert runner._session_model_overrides == {}
|
||||
# The working cached agent must NOT have been evicted.
|
||||
assert evicted == []
|
||||
# The agent stayed on its old model (rolled back).
|
||||
assert agent.model == "old-model"
|
||||
|
|
|
|||
|
|
@ -2344,13 +2344,27 @@ def _apply_model_switch(
|
|||
}
|
||||
|
||||
if agent:
|
||||
agent.switch_model(
|
||||
new_model=result.new_model,
|
||||
new_provider=result.target_provider,
|
||||
api_key=result.api_key,
|
||||
base_url=result.base_url,
|
||||
api_mode=result.api_mode,
|
||||
)
|
||||
try:
|
||||
agent.switch_model(
|
||||
new_model=result.new_model,
|
||||
new_provider=result.target_provider,
|
||||
api_key=result.api_key,
|
||||
base_url=result.base_url,
|
||||
api_mode=result.api_mode,
|
||||
)
|
||||
except Exception as exc:
|
||||
# The in-place swap rolled the agent back to the old working
|
||||
# model/client and re-raised. Abort the commit: do NOT restart the
|
||||
# slash worker, persist runtime, append the switch marker, set a
|
||||
# session model_override, or persist to config — all of which would
|
||||
# otherwise leave the session pinned to a broken model and kill the
|
||||
# conversation on the next turn (#50163). A failed switch is a
|
||||
# no-op; surface a clean error to the client.
|
||||
logger.warning("In-place model switch failed for TUI agent: %s", exc)
|
||||
raise ValueError(
|
||||
f"Model switch to {result.new_model} failed ({exc}); "
|
||||
f"staying on {getattr(agent, 'model', current_model)}."
|
||||
) from exc
|
||||
_restart_slash_worker(sid, session)
|
||||
_persist_live_session_runtime(session)
|
||||
_persist_live_session_system_prompt(session)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue