From d4c2217e87400d73b65d015b83ff7db435b29a4e Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 27 Jun 2026 04:36:22 -0700
Subject: [PATCH] fix(gateway): offload /model switch off the event loop
 (#53603)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Telegram/Discord /model command performs the actual switch by calling
switch_model() directly on the asyncio event loop. On a cold or expired
cache, switch_model() can fall through to a synchronous models.dev HTTP
fetch (requests.get, 15s timeout), freezing the gateway for up to 15s and
dropping the Telegram connection while a user switches models.

The picker provider-list and fallback text-list sites were already offloaded
(#41289), but the two _switch_model() calls — the picker callback and the
direct /model <name> path — were not. Wrap both in asyncio.to_thread.

Closes #20525.
---
 gateway/slash_commands.py                     | 14 +++++-
 .../test_model_command_custom_providers.py    | 45 +++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/gateway/slash_commands.py b/gateway/slash_commands.py
index 0bcf5457455..7764f114fff 100644
--- a/gateway/slash_commands.py
+++ b/gateway/slash_commands.py
@@ -1228,7 +1228,12 @@ class GatewaySlashCommandsMixin:
                         skew_error = _model_switch_skew_guard()
                         if skew_error:
                             return skew_error
-                        result = _switch_model(
+                        # Offload the switch off the event loop — switch_model()
+                        # can fall through to a synchronous models.dev HTTP fetch
+                        # (requests.get, 15s timeout) on a cold/expired cache,
+                        # which freezes the gateway otherwise. See #20525, #41289.
+                        result = await asyncio.to_thread(
+                            _switch_model,
                             raw_input=model_id,
                             current_provider=_cur_provider,
                             current_model=_cur_model,
@@ -1452,7 +1457,12 @@ class GatewaySlashCommandsMixin:
         skew_error = _model_switch_skew_guard()
         if skew_error:
             return skew_error
-        result = _switch_model(
+        # Offload the switch off the event loop — switch_model() can fall
+        # through to a synchronous models.dev HTTP fetch (requests.get, 15s
+        # timeout) on a cold/expired cache, which freezes the gateway
+        # otherwise. See #20525, #41289.
+        result = await asyncio.to_thread(
+            _switch_model,
             raw_input=model_input,
             current_provider=current_provider,
             current_model=current_model,
diff --git a/tests/gateway/test_model_command_custom_providers.py b/tests/gateway/test_model_command_custom_providers.py
index ed97e527b05..9c4aeafc753 100644
--- a/tests/gateway/test_model_command_custom_providers.py
+++ b/tests/gateway/test_model_command_custom_providers.py
@@ -61,3 +61,48 @@ async def test_handle_model_command_lists_saved_custom_provider(tmp_path, monkey
     assert "Local (127.0.0.1:4141)" in result
     assert "custom:local-(127.0.0.1:4141)" in result
     assert "rotator-openrouter-coding" in result
+
+
+@pytest.mark.asyncio
+async def test_direct_model_switch_offloads_to_thread(tmp_path, monkeypatch):
+    """A direct `/model <name>` switch must route switch_model() through
+    asyncio.to_thread so the blocking models.dev HTTP fetch can't freeze the
+    gateway event loop (#20525)."""
+    import asyncio
+
+    from hermes_cli.model_switch import ModelSwitchResult
+
+    hermes_home = tmp_path / ".hermes"
+    hermes_home.mkdir()
+    (hermes_home / "config.yaml").write_text(
+        yaml.safe_dump(
+            {"model": {"default": "gpt-5.4", "provider": "openrouter"}}
+        ),
+        encoding="utf-8",
+    )
+
+    import gateway.run as gateway_run
+    # Point the gateway at the throwaway home directory created above.
+    monkeypatch.setattr(gateway_run, "_hermes_home", hermes_home)
+
+    # Fail the switch so the handler returns before _finish_switch (which needs
+    # full runner state) — we only care that the offload happened.
+    def _fake_switch(**kwargs):
+        return ModelSwitchResult(success=False, error_message="nope")
+
+    monkeypatch.setattr("hermes_cli.model_switch.switch_model", _fake_switch)
+
+    offloaded = []
+    real_to_thread = asyncio.to_thread
+    # Spy records the callable's name, then delegates to the real to_thread.
+    async def _spy_to_thread(func, /, *args, **kwargs):
+        offloaded.append(getattr(func, "__name__", repr(func)))
+        return await real_to_thread(func, *args, **kwargs)
+
+    monkeypatch.setattr(asyncio, "to_thread", _spy_to_thread)
+
+    result = await _make_runner()._handle_model_command(_make_event("/model gpt-5.4"))
+
+    # The fake switch was passed to asyncio.to_thread, i.e. not called inline.
+    assert "_fake_switch" in offloaded
+    assert result is not None and "nope" in result