From d4c2217e87400d73b65d015b83ff7db435b29a4e Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 04:36:22 -0700 Subject: [PATCH] fix(gateway): offload /model switch off the event loop (#53603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Telegram/Discord /model command's actual switch calls switch_model() directly on the asyncio event loop. switch_model() can fall through to a synchronous models.dev HTTP fetch (requests.get, 15s timeout) on a cold or expired cache, freezing the gateway for up to 15s and dropping the Telegram connection while a user switches models. The picker provider-list and fallback text-list sites were already offloaded (#41289), but the two _switch_model() calls — the picker callback and the direct /model path — were not. Wrap both in asyncio.to_thread. Closes #20525. --- gateway/slash_commands.py | 14 +++++- .../test_model_command_custom_providers.py | 45 +++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py index 0bcf5457455..7764f114fff 100644 --- a/gateway/slash_commands.py +++ b/gateway/slash_commands.py @@ -1228,7 +1228,12 @@ class GatewaySlashCommandsMixin: skew_error = _model_switch_skew_guard() if skew_error: return skew_error - result = _switch_model( + # Offload the switch off the event loop — switch_model() + # can fall through to a synchronous models.dev HTTP fetch + # (requests.get, 15s timeout) on a cold/expired cache, + # which freezes the gateway otherwise. See #20525, #41289. 
+ result = await asyncio.to_thread( + _switch_model, raw_input=model_id, current_provider=_cur_provider, current_model=_cur_model, @@ -1452,7 +1457,12 @@ class GatewaySlashCommandsMixin: skew_error = _model_switch_skew_guard() if skew_error: return skew_error - result = _switch_model( + # Offload the switch off the event loop — switch_model() can fall + # through to a synchronous models.dev HTTP fetch (requests.get, 15s + # timeout) on a cold/expired cache, which freezes the gateway + # otherwise. See #20525, #41289. + result = await asyncio.to_thread( + _switch_model, raw_input=model_input, current_provider=current_provider, current_model=current_model, diff --git a/tests/gateway/test_model_command_custom_providers.py b/tests/gateway/test_model_command_custom_providers.py index ed97e527b05..9c4aeafc753 100644 --- a/tests/gateway/test_model_command_custom_providers.py +++ b/tests/gateway/test_model_command_custom_providers.py @@ -61,3 +61,48 @@ async def test_handle_model_command_lists_saved_custom_provider(tmp_path, monkey assert "Local (127.0.0.1:4141)" in result assert "custom:local-(127.0.0.1:4141)" in result assert "rotator-openrouter-coding" in result + + +@pytest.mark.asyncio +async def test_direct_model_switch_offloads_to_thread(tmp_path, monkeypatch): + """A direct `/model <name>` switch must route switch_model() through + asyncio.to_thread so the blocking models.dev HTTP fetch can't freeze the + gateway event loop (#20525).""" + import asyncio + + from hermes_cli.model_switch import ModelSwitchResult + + hermes_home = tmp_path / ".hermes" + hermes_home.mkdir() + (hermes_home / "config.yaml").write_text( + yaml.safe_dump( + {"model": {"default": "gpt-5.4", "provider": "openrouter"}} + ), + encoding="utf-8", + ) + + import gateway.run as gateway_run + + monkeypatch.setattr(gateway_run, "_hermes_home", hermes_home) + + # Fail the switch so the handler returns before _finish_switch (which needs + # full runner state) — we only care that the offload happened. 
+ def _fake_switch(**kwargs): + return ModelSwitchResult(success=False, error_message="nope") + + monkeypatch.setattr("hermes_cli.model_switch.switch_model", _fake_switch) + + offloaded = [] + real_to_thread = asyncio.to_thread + + async def _spy_to_thread(func, /, *args, **kwargs): + offloaded.append(getattr(func, "__name__", repr(func))) + return await real_to_thread(func, *args, **kwargs) + + monkeypatch.setattr(asyncio, "to_thread", _spy_to_thread) + + result = await _make_runner()._handle_model_command(_make_event("/model gpt-5.4")) + + # switch_model was offloaded to a worker thread, not run on the event loop. + assert "_fake_switch" in offloaded + assert result is not None and "nope" in result