diff --git a/agent/smart_model_routing.py b/agent/smart_model_routing.py deleted file mode 100644 index 6d482be27..000000000 --- a/agent/smart_model_routing.py +++ /dev/null @@ -1,195 +0,0 @@ -"""Helpers for optional cheap-vs-strong model routing.""" - -from __future__ import annotations - -import os -import re -from typing import Any, Dict, Optional - -from utils import is_truthy_value - -_COMPLEX_KEYWORDS = { - "debug", - "debugging", - "implement", - "implementation", - "refactor", - "patch", - "traceback", - "stacktrace", - "exception", - "error", - "analyze", - "analysis", - "investigate", - "architecture", - "design", - "compare", - "benchmark", - "optimize", - "optimise", - "review", - "terminal", - "shell", - "tool", - "tools", - "pytest", - "test", - "tests", - "plan", - "planning", - "delegate", - "subagent", - "cron", - "docker", - "kubernetes", -} - -_URL_RE = re.compile(r"https?://|www\.", re.IGNORECASE) - - -def _coerce_bool(value: Any, default: bool = False) -> bool: - return is_truthy_value(value, default=default) - - -def _coerce_int(value: Any, default: int) -> int: - try: - return int(value) - except (TypeError, ValueError): - return default - - -def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: - """Return the configured cheap-model route when a message looks simple. - - Conservative by design: if the message has signs of code/tool/debugging/ - long-form work, keep the primary model. - """ - cfg = routing_config or {} - if not _coerce_bool(cfg.get("enabled"), False): - return None - - cheap_model = cfg.get("cheap_model") or {} - if not isinstance(cheap_model, dict): - return None - provider = str(cheap_model.get("provider") or "").strip().lower() - model = str(cheap_model.get("model") or "").strip() - if not provider or not model: - return None - - text = (user_message or "").strip() - if not text: - return None - - max_chars = _coerce_int(cfg.get("max_simple_chars"), 160) - max_words = _coerce_int(cfg.get("max_simple_words"), 28) - - if len(text) > max_chars: - return None - if len(text.split()) > max_words: - return None - if text.count("\n") > 1: - return None - if "```" in text or "`" in text: - return None - if _URL_RE.search(text): - return None - - lowered = text.lower() - words = {token.strip(".,:;!?()[]{}\"'`") for token in lowered.split()} - if words & _COMPLEX_KEYWORDS: - return None - - route = dict(cheap_model) - route["provider"] = provider - route["model"] = model - route["routing_reason"] = "simple_turn" - return route - - -def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]: - """Resolve the effective model/runtime for one turn. - - Returns a dict with model/runtime/signature/label fields. - """ - route = choose_cheap_model_route(user_message, routing_config) - if not route: - return { - "model": primary.get("model"), - "runtime": { - "api_key": primary.get("api_key"), - "base_url": primary.get("base_url"), - "provider": primary.get("provider"), - "api_mode": primary.get("api_mode"), - "command": primary.get("command"), - "args": list(primary.get("args") or []), - "credential_pool": primary.get("credential_pool"), - }, - "label": None, - "signature": ( - primary.get("model"), - primary.get("provider"), - primary.get("base_url"), - primary.get("api_mode"), - primary.get("command"), - tuple(primary.get("args") or ()), - ), - } - - from hermes_cli.runtime_provider import resolve_runtime_provider - - explicit_api_key = None - api_key_env = str(route.get("api_key_env") or "").strip() - if api_key_env: - explicit_api_key = os.getenv(api_key_env) or None - - try: - runtime = resolve_runtime_provider( - requested=route.get("provider"), - explicit_api_key=explicit_api_key, - explicit_base_url=route.get("base_url"), - ) - except Exception: - return { - "model": primary.get("model"), - "runtime": { - "api_key": primary.get("api_key"), - "base_url": primary.get("base_url"), - "provider": primary.get("provider"), - "api_mode": primary.get("api_mode"), - "command": primary.get("command"), - "args": list(primary.get("args") or []), - "credential_pool": primary.get("credential_pool"), - }, - "label": None, - "signature": ( - primary.get("model"), - primary.get("provider"), - primary.get("base_url"), - primary.get("api_mode"), - primary.get("command"), - tuple(primary.get("args") or ()), - ), - } - - return { - "model": route.get("model"), - "runtime": { - "api_key": runtime.get("api_key"), - "base_url": runtime.get("base_url"), - "provider": runtime.get("provider"), - "api_mode": runtime.get("api_mode"), - "command": runtime.get("command"), - "args": list(runtime.get("args") or []), - "credential_pool": runtime.get("credential_pool"), - }, - "label": f"smart route → {route.get('model')} ({runtime.get('provider')})", - "signature": ( - route.get("model"), - runtime.get("provider"), - runtime.get("base_url"), - runtime.get("api_mode"), - runtime.get("command"), - tuple(runtime.get("args") or ()), - ), - } diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 6bb422ae0..8e4ef3426 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -114,20 +114,6 @@ model: # # Data policy: "allow" (default) or "deny" to exclude providers that may store data # # data_collection: "deny" -# ============================================================================= -# Smart Model Routing (optional) -# ============================================================================= -# Use a cheaper model for short/simple turns while keeping your main model for -# more complex requests. Disabled by default. -# -# smart_model_routing: -# enabled: true -# max_simple_chars: 160 -# max_simple_words: 28 -# cheap_model: -# provider: openrouter -# model: google/gemini-2.5-flash - # ============================================================================= # Git Worktree Isolation # ============================================================================= diff --git a/cli.py b/cli.py index 0e5e9ff66..3b1ecd8ae 100644 --- a/cli.py +++ b/cli.py @@ -310,12 +310,6 @@ def load_cli_config() -> Dict[str, Any]: "enabled": True, # Auto-compress when approaching context limit "threshold": 0.50, # Compress at 50% of model's context limit }, - "smart_model_routing": { - "enabled": False, - "max_simple_chars": 160, - "max_simple_words": 28, - "cheap_model": {}, - }, "agent": { "max_turns": 90, # Default max tool-calling iterations (shared with subagents) "verbose": False, @@ -1857,8 +1851,9 @@ class HermesCLI: fb = [fb] if fb.get("provider") and fb.get("model") else [] self._fallback_model = fb - # Optional cheap-vs-strong routing for simple turns - self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {} + # Signature of the currently-initialised agent's runtime. Used to + # rebuild the agent when provider / model / base_url changes across + # turns (e.g. after /model or credential rotation). self._active_agent_route_signature = None # Agent will be initialized on first use @@ -2883,24 +2878,36 @@ class HermesCLI: return True def _resolve_turn_agent_config(self, user_message: str) -> dict: - """Resolve model/runtime overrides for a single user turn.""" - from agent.smart_model_routing import resolve_turn_route + """Build the effective model/runtime config for a single user turn. + + Always uses the session's primary model/provider. If the user has + toggled `/fast` on and the current model supports Priority + Processing / Anthropic fast mode, attach `request_overrides` so the + API call is marked accordingly. + """ from hermes_cli.models import resolve_fast_mode_overrides - route = resolve_turn_route( - user_message, - self._smart_model_routing, - { - "model": self.model, - "api_key": self.api_key, - "base_url": self.base_url, - "provider": self.provider, - "api_mode": self.api_mode, - "command": self.acp_command, - "args": list(self.acp_args or []), - "credential_pool": getattr(self, "_credential_pool", None), - }, - ) + runtime = { + "api_key": self.api_key, + "base_url": self.base_url, + "provider": self.provider, + "api_mode": self.api_mode, + "command": self.acp_command, + "args": list(self.acp_args or []), + "credential_pool": getattr(self, "_credential_pool", None), + } + route = { + "model": self.model, + "runtime": runtime, + "signature": ( + self.model, + runtime["provider"], + runtime["base_url"], + runtime["api_mode"], + runtime["command"], + tuple(runtime["args"]), + ), + } service_tier = getattr(self, "service_tier", None) if not service_tier: @@ -2908,13 +2915,13 @@ class HermesCLI: return route try: - overrides = resolve_fast_mode_overrides(route.get("model")) + overrides = resolve_fast_mode_overrides(route["model"]) except Exception: overrides = None route["request_overrides"] = overrides return route - def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None, request_overrides: dict | None = None) -> bool: + def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, request_overrides: dict | None = None) -> bool: """ Initialize the agent on first use. When resuming a session, restores conversation history from SQLite. @@ -7911,7 +7918,6 @@ class HermesCLI: if not self._init_agent( model_override=turn_route["model"], runtime_override=turn_route["runtime"], - route_label=turn_route["label"], request_overrides=turn_route.get("request_overrides"), ): return None @@ -10535,7 +10541,6 @@ def main( if cli._init_agent( model_override=turn_route["model"], runtime_override=turn_route["runtime"], - route_label=turn_route["label"], request_overrides=turn_route.get("request_overrides"), ): cli.agent.quiet_mode = True diff --git a/cron/scheduler.py b/cron/scheduler.py index 6e93fc02f..ebeb29dd4 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -826,7 +826,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: # Provider routing pr = _cfg.get("provider_routing", {}) - smart_routing = _cfg.get("smart_model_routing", {}) or {} from hermes_cli.runtime_provider import ( resolve_runtime_provider, @@ -843,24 +842,9 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: message = format_runtime_provider_error(exc) raise RuntimeError(message) from exc - from agent.smart_model_routing import resolve_turn_route - turn_route = resolve_turn_route( - prompt, - smart_routing, - { - "model": model, - "api_key": runtime.get("api_key"), - "base_url": runtime.get("base_url"), - "provider": runtime.get("provider"), - "api_mode": runtime.get("api_mode"), - "command": runtime.get("command"), - "args": list(runtime.get("args") or []), - }, - ) - fallback_model = _cfg.get("fallback_providers") or _cfg.get("fallback_model") or None credential_pool = None - runtime_provider = str(turn_route["runtime"].get("provider") or "").strip().lower() + runtime_provider = str(runtime.get("provider") or "").strip().lower() if runtime_provider: try: from agent.credential_pool import load_pool @@ -877,13 +861,13 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: logger.debug("Job '%s': failed to load credential pool for %s: %s", job_id, runtime_provider, e) agent = AIAgent( - model=turn_route["model"], - api_key=turn_route["runtime"].get("api_key"), - base_url=turn_route["runtime"].get("base_url"), - provider=turn_route["runtime"].get("provider"), - api_mode=turn_route["runtime"].get("api_mode"), - acp_command=turn_route["runtime"].get("command"), - acp_args=turn_route["runtime"].get("args"), + model=model, + api_key=runtime.get("api_key"), + base_url=runtime.get("base_url"), + provider=runtime.get("provider"), + api_mode=runtime.get("api_mode"), + acp_command=runtime.get("command"), + acp_args=runtime.get("args"), max_iterations=max_iterations, reasoning_config=reasoning_config, prefill_messages=prefill_messages, diff --git a/gateway/run.py b/gateway/run.py index 60c57495b..3b3ee38fe 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -629,7 +629,6 @@ class GatewayRunner: self._restart_drain_timeout = self._load_restart_drain_timeout() self._provider_routing = self._load_provider_routing() self._fallback_model = self._load_fallback_model() - self._smart_model_routing = self._load_smart_model_routing() # Wire process registry into session store for reset protection from tools.process_registry import process_registry @@ -1082,11 +1081,16 @@ class GatewayRunner: return model, runtime_kwargs def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict: - from agent.smart_model_routing import resolve_turn_route + """Build the effective model/runtime config for a single turn. + + Always uses the session's primary model/provider. If `/fast` is + enabled and the model supports Priority Processing / Anthropic fast + mode, attach `request_overrides` so the API call is marked + accordingly. + """ from hermes_cli.models import resolve_fast_mode_overrides - primary = { - "model": model, + runtime = { "api_key": runtime_kwargs.get("api_key"), "base_url": runtime_kwargs.get("base_url"), "provider": runtime_kwargs.get("provider"), @@ -1095,7 +1099,18 @@ class GatewayRunner: "args": list(runtime_kwargs.get("args") or []), "credential_pool": runtime_kwargs.get("credential_pool"), } - route = resolve_turn_route(user_message, getattr(self, "_smart_model_routing", {}), primary) + route = { + "model": model, + "runtime": runtime, + "signature": ( + model, + runtime["provider"], + runtime["base_url"], + runtime["api_mode"], + runtime["command"], + tuple(runtime["args"]), + ), + } service_tier = getattr(self, "_service_tier", None) if not service_tier: @@ -1103,7 +1118,7 @@ class GatewayRunner: return route try: - overrides = resolve_fast_mode_overrides(route.get("model")) + overrides = resolve_fast_mode_overrides(route["model"]) except Exception: overrides = None route["request_overrides"] = overrides @@ -1461,20 +1476,6 @@ class GatewayRunner: pass return None - @staticmethod - def _load_smart_model_routing() -> dict: - """Load optional smart cheap-vs-strong model routing config.""" - try: - import yaml as _y - cfg_path = _hermes_home / "config.yaml" - if cfg_path.exists(): - with open(cfg_path, encoding="utf-8") as _f: - cfg = _y.safe_load(_f) or {} - return cfg.get("smart_model_routing", {}) or {} - except Exception: - pass - return {} - def _snapshot_running_agents(self) -> Dict[str, Any]: return { session_key: agent diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 9040eac0b..147194b62 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -474,13 +474,6 @@ DEFAULT_CONFIG = { }, }, - "smart_model_routing": { - "enabled": False, - "max_simple_chars": 160, - "max_simple_words": 28, - "cheap_model": {}, - }, - # Auxiliary model config — provider:model for each side task. # Format: provider is the provider name, model is the model slug. # "auto" for provider = auto-detect best available provider. @@ -2878,19 +2871,6 @@ _FALLBACK_COMMENT = """ # fallback_model: # provider: openrouter # model: anthropic/claude-sonnet-4 -# -# ── Smart Model Routing ──────────────────────────────────────────────── -# Optional cheap-vs-strong routing for simple turns. -# Keeps the primary model for complex work, but can route short/simple -# messages to a cheaper model across providers. -# -# smart_model_routing: -# enabled: true -# max_simple_chars: 160 -# max_simple_words: 28 -# cheap_model: -# provider: openrouter -# model: google/gemini-2.5-flash """ @@ -2922,19 +2902,6 @@ _COMMENTED_SECTIONS = """ # fallback_model: # provider: openrouter # model: anthropic/claude-sonnet-4 -# -# ── Smart Model Routing ──────────────────────────────────────────────── -# Optional cheap-vs-strong routing for simple turns. -# Keeps the primary model for complex work, but can route short/simple -# messages to a cheaper model across providers. -# -# smart_model_routing: -# enabled: true -# max_simple_chars: 160 -# max_simple_words: 28 -# cheap_model: -# provider: openrouter -# model: google/gemini-2.5-flash """ diff --git a/hermes_cli/dump.py b/hermes_cli/dump.py index f3a174e71..90364a261 100644 --- a/hermes_cli/dump.py +++ b/hermes_cli/dump.py @@ -160,7 +160,6 @@ def _config_overrides(config: dict) -> dict[str, str]: ("display", "streaming"), ("display", "skin"), ("display", "show_reasoning"), - ("smart_model_routing", "enabled"), ("privacy", "redact_pii"), ("tts", "provider"), ] diff --git a/hermes_cli/tips.py b/hermes_cli/tips.py index aa6cb9729..77c2b2405 100644 --- a/hermes_cli/tips.py +++ b/hermes_cli/tips.py @@ -323,7 +323,6 @@ TIPS = [ "GPT-5 and Codex use 'developer' role instead of 'system' in the message format.", "Per-task auxiliary overrides: auxiliary.vision.provider, auxiliary.compression.model, etc. in config.yaml.", "The auxiliary client treats 'main' as a provider alias — resolves to your actual primary provider + model.", - "Smart routing can auto-route simple queries to a cheaper model — set smart_model_routing.enabled: true.", "hermes claw migrate --dry-run previews OpenClaw migration without writing anything.", "File paths pasted with quotes or escaped spaces are handled automatically — no manual cleanup needed.", "Slash commands never trigger the large-paste collapse — /command with big arguments works correctly.", diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 2c42bfd9c..50f6ff672 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -232,7 +232,6 @@ _CATEGORY_MERGE: Dict[str, str] = { "checkpoints": "agent", "approvals": "security", "human_delay": "display", - "smart_model_routing": "agent", "dashboard": "display", "code_execution": "agent", } diff --git a/skills/autonomous-ai-agents/hermes-agent/SKILL.md b/skills/autonomous-ai-agents/hermes-agent/SKILL.md index 362841f39..d19471c80 100644 --- a/skills/autonomous-ai-agents/hermes-agent/SKILL.md +++ b/skills/autonomous-ai-agents/hermes-agent/SKILL.md @@ -338,7 +338,6 @@ Edit with `hermes config edit` or `hermes config set section.key value`. | `memory` | `memory_enabled`, `user_profile_enabled`, `provider` | | `security` | `tirith_enabled`, `website_blocklist` | | `delegation` | `model`, `provider`, `base_url`, `api_key`, `max_iterations` (50), `reasoning_effort` | -| `smart_model_routing` | `enabled`, `cheap_model` | | `checkpoints` | `enabled`, `max_snapshots` (50) | Full config reference: https://hermes-agent.nousresearch.com/docs/user-guide/configuration diff --git a/tests/agent/test_credential_pool_routing.py b/tests/agent/test_credential_pool_routing.py index 38f5c6dfd..8477fdb64 100644 --- a/tests/agent/test_credential_pool_routing.py +++ b/tests/agent/test_credential_pool_routing.py @@ -1,129 +1,25 @@ -"""Tests for credential pool preservation through smart routing and 429 recovery. +"""Tests for credential pool preservation through turn config and 429 recovery. Covers: -1. credential_pool flows through resolve_turn_route (no-route and fallback paths) -2. CLI _resolve_turn_agent_config passes credential_pool to primary dict -3. Gateway _resolve_turn_agent_config passes credential_pool to primary dict -4. Eager fallback deferred when credential pool has credentials -5. Eager fallback fires when no credential pool exists -6. Full 429 rotation cycle: retry-same → rotate → exhaust → fallback +1. CLI _resolve_turn_agent_config passes credential_pool to runtime dict +2. Gateway _resolve_turn_agent_config passes credential_pool to runtime dict +3. Eager fallback deferred when credential pool has credentials +4. Eager fallback fires when no credential pool exists +5. Full 429 rotation cycle: retry-same → rotate → exhaust → fallback """ -import os -import time from types import SimpleNamespace -from unittest.mock import MagicMock, patch, PropertyMock - -import pytest +from unittest.mock import MagicMock, patch # --------------------------------------------------------------------------- -# 1. smart_model_routing: credential_pool preserved in no-route path -# --------------------------------------------------------------------------- - -class TestSmartRoutingPoolPreservation: - def test_no_route_preserves_credential_pool(self): - from agent.smart_model_routing import resolve_turn_route - - fake_pool = MagicMock(name="CredentialPool") - primary = { - "model": "gpt-5.4", - "api_key": "sk-test", - "base_url": None, - "provider": "openai-codex", - "api_mode": "codex_responses", - "command": None, - "args": [], - "credential_pool": fake_pool, - } - # routing disabled - result = resolve_turn_route("hello", None, primary) - assert result["runtime"]["credential_pool"] is fake_pool - - def test_no_route_none_pool(self): - from agent.smart_model_routing import resolve_turn_route - - primary = { - "model": "gpt-5.4", - "api_key": "sk-test", - "base_url": None, - "provider": "openai-codex", - "api_mode": "codex_responses", - "command": None, - "args": [], - } - result = resolve_turn_route("hello", None, primary) - assert result["runtime"]["credential_pool"] is None - - def test_routing_disabled_preserves_pool(self): - from agent.smart_model_routing import resolve_turn_route - - fake_pool = MagicMock(name="CredentialPool") - primary = { - "model": "gpt-5.4", - "api_key": "sk-test", - "base_url": None, - "provider": "openai-codex", - "api_mode": "codex_responses", - "command": None, - "args": [], - "credential_pool": fake_pool, - } - # routing explicitly disabled - result = resolve_turn_route("hello", {"enabled": False}, primary) - assert result["runtime"]["credential_pool"] is fake_pool - - def test_route_fallback_on_resolve_error_preserves_pool(self, monkeypatch): - """When smart routing picks a cheap model but resolve_runtime_provider - fails, the fallback to primary must still include credential_pool.""" - from agent.smart_model_routing import resolve_turn_route - - fake_pool = MagicMock(name="CredentialPool") - primary = { - "model": "gpt-5.4", - "api_key": "sk-test", - "base_url": None, - "provider": "openai-codex", - "api_mode": "codex_responses", - "command": None, - "args": [], - "credential_pool": fake_pool, - } - routing_config = { - "enabled": True, - "cheap_model": "openai/gpt-4.1-mini", - "cheap_provider": "openrouter", - "max_tokens": 200, - "patterns": ["^(hi|hello|hey)"], - } - # Force resolve_runtime_provider to fail so it falls back to primary - monkeypatch.setattr( - "hermes_cli.runtime_provider.resolve_runtime_provider", - MagicMock(side_effect=RuntimeError("no credentials")), - ) - result = resolve_turn_route("hi", routing_config, primary) - assert result["runtime"]["credential_pool"] is fake_pool - - -# --------------------------------------------------------------------------- -# 2 & 3. CLI and Gateway _resolve_turn_agent_config include credential_pool +# 1. CLI _resolve_turn_agent_config includes credential_pool # --------------------------------------------------------------------------- class TestCliTurnRoutePool: - def test_resolve_turn_includes_pool(self, monkeypatch, tmp_path): - """CLI's _resolve_turn_agent_config must pass credential_pool to primary.""" - from agent.smart_model_routing import resolve_turn_route - captured = {} - - def spy_resolve(user_message, routing_config, primary): - captured["primary"] = primary - return resolve_turn_route(user_message, routing_config, primary) - - monkeypatch.setattr( - "agent.smart_model_routing.resolve_turn_route", spy_resolve - ) - - # Build a minimal HermesCLI-like object with the method + def test_resolve_turn_includes_pool(self): + """CLI's _resolve_turn_agent_config must pass credential_pool in runtime.""" + fake_pool = MagicMock(name="FakePool") shell = SimpleNamespace( model="gpt-5.4", api_key="sk-test", @@ -132,58 +28,46 @@ class TestCliTurnRoutePool: api_mode="codex_responses", acp_command=None, acp_args=[], - _credential_pool=MagicMock(name="FakePool"), - _smart_model_routing={"enabled": False}, + _credential_pool=fake_pool, + service_tier=None, ) - # Import and bind the real method from cli import HermesCLI bound = HermesCLI._resolve_turn_agent_config.__get__(shell) - bound("test message") + route = bound("test message") - assert "credential_pool" in captured["primary"] - assert captured["primary"]["credential_pool"] is shell._credential_pool + assert route["runtime"]["credential_pool"] is fake_pool +# --------------------------------------------------------------------------- +# 2. Gateway _resolve_turn_agent_config includes credential_pool +# --------------------------------------------------------------------------- + class TestGatewayTurnRoutePool: - def test_resolve_turn_includes_pool(self, monkeypatch): + def test_resolve_turn_includes_pool(self): """Gateway's _resolve_turn_agent_config must pass credential_pool.""" - from agent.smart_model_routing import resolve_turn_route - captured = {} - - def spy_resolve(user_message, routing_config, primary): - captured["primary"] = primary - return resolve_turn_route(user_message, routing_config, primary) - - monkeypatch.setattr( - "agent.smart_model_routing.resolve_turn_route", spy_resolve - ) - from gateway.run import GatewayRunner - runner = SimpleNamespace( - _smart_model_routing={"enabled": False}, - ) - + fake_pool = MagicMock(name="FakePool") + runner = SimpleNamespace(_service_tier=None) runtime_kwargs = { - "api_key": "sk-test", + "api_key": "***", "base_url": None, "provider": "openai-codex", "api_mode": "codex_responses", "command": None, "args": [], - "credential_pool": MagicMock(name="FakePool"), + "credential_pool": fake_pool, } bound = GatewayRunner._resolve_turn_agent_config.__get__(runner) - bound("test message", "gpt-5.4", runtime_kwargs) + route = bound("test message", "gpt-5.4", runtime_kwargs) - assert "credential_pool" in captured["primary"] - assert captured["primary"]["credential_pool"] is runtime_kwargs["credential_pool"] + assert route["runtime"]["credential_pool"] is fake_pool # --------------------------------------------------------------------------- -# 4 & 5. Eager fallback deferred/fires based on credential pool +# 3 & 4. Eager fallback deferred/fires based on credential pool # --------------------------------------------------------------------------- class TestEagerFallbackWithPool: @@ -251,7 +135,7 @@ class TestEagerFallbackWithPool: # --------------------------------------------------------------------------- -# 6. Full 429 rotation cycle via _recover_with_credential_pool +# 5. Full 429 rotation cycle via _recover_with_credential_pool # --------------------------------------------------------------------------- class TestPoolRotationCycle: diff --git a/tests/agent/test_smart_model_routing.py b/tests/agent/test_smart_model_routing.py deleted file mode 100644 index 7e9025609..000000000 --- a/tests/agent/test_smart_model_routing.py +++ /dev/null @@ -1,61 +0,0 @@ -from agent.smart_model_routing import choose_cheap_model_route - - -_BASE_CONFIG = { - "enabled": True, - "cheap_model": { - "provider": "openrouter", - "model": "google/gemini-2.5-flash", - }, -} - - -def test_returns_none_when_disabled(): - cfg = {**_BASE_CONFIG, "enabled": False} - assert choose_cheap_model_route("what time is it in tokyo?", cfg) is None - - -def test_routes_short_simple_prompt(): - result = choose_cheap_model_route("what time is it in tokyo?", _BASE_CONFIG) - assert result is not None - assert result["provider"] == "openrouter" - assert result["model"] == "google/gemini-2.5-flash" - assert result["routing_reason"] == "simple_turn" - - -def test_skips_long_prompt(): - prompt = "please summarize this carefully " * 20 - assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None - - -def test_skips_code_like_prompt(): - prompt = "debug this traceback: ```python\nraise ValueError('bad')\n```" - assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None - - -def test_skips_tool_heavy_prompt_keywords(): - prompt = "implement a patch for this docker error" - assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None - - -def test_resolve_turn_route_falls_back_to_primary_when_route_runtime_cannot_be_resolved(monkeypatch): - from agent.smart_model_routing import resolve_turn_route - - monkeypatch.setattr( - "hermes_cli.runtime_provider.resolve_runtime_provider", - lambda **kwargs: (_ for _ in ()).throw(RuntimeError("bad route")), - ) - result = resolve_turn_route( - "what time is it in tokyo?", - _BASE_CONFIG, - { - "model": "anthropic/claude-sonnet-4", - "provider": "openrouter", - "base_url": "https://openrouter.ai/api/v1", - "api_mode": "chat_completions", - "api_key": "sk-primary", - }, - ) - assert result["model"] == "anthropic/claude-sonnet-4" - assert result["runtime"]["provider"] == "openrouter" - assert result["label"] is None diff --git a/tests/cli/test_cli_provider_resolution.py b/tests/cli/test_cli_provider_resolution.py index fe4153c80..0c9aab82a 100644 --- a/tests/cli/test_cli_provider_resolution.py +++ b/tests/cli/test_cli_provider_resolution.py @@ -207,48 +207,11 @@ def test_cli_turn_routing_uses_primary_when_disabled(monkeypatch): shell.api_mode = "chat_completions" shell.base_url = "https://openrouter.ai/api/v1" shell.api_key = "sk-primary" - shell._smart_model_routing = {"enabled": False} result = shell._resolve_turn_agent_config("what time is it in tokyo?") assert result["model"] == "gpt-5" assert result["runtime"]["provider"] == "openrouter" - assert result["label"] is None - - -def test_cli_turn_routing_uses_cheap_model_when_simple(monkeypatch): - cli = _import_cli() - - def _runtime_resolve(**kwargs): - assert kwargs["requested"] == "zai" - return { - "provider": "zai", - "api_mode": "chat_completions", - "base_url": "https://open.z.ai/api/v1", - "api_key": "cheap-key", - "source": "env/config", - } - - monkeypatch.setattr("hermes_cli.runtime_provider.resolve_runtime_provider", _runtime_resolve) - - shell = cli.HermesCLI(model="anthropic/claude-sonnet-4", compact=True, max_turns=1) - shell.provider = "openrouter" - shell.api_mode = "chat_completions" - shell.base_url = "https://openrouter.ai/api/v1" - shell.api_key = "primary-key" - shell._smart_model_routing = { - "enabled": True, - "cheap_model": {"provider": "zai", "model": "glm-5-air"}, - "max_simple_chars": 160, - "max_simple_words": 28, - } - - result = shell._resolve_turn_agent_config("what time is it in tokyo?") - - assert result["model"] == "glm-5-air" - assert result["runtime"]["provider"] == "zai" - assert result["runtime"]["api_key"] == "cheap-key" - assert result["label"] is not None def test_cli_prefers_config_provider_over_stale_env_override(monkeypatch): diff --git a/tests/cli/test_fast_command.py b/tests/cli/test_fast_command.py index bc6c8e5fb..23a1a4aa9 100644 --- a/tests/cli/test_fast_command.py +++ b/tests/cli/test_fast_command.py @@ -183,27 +183,10 @@ class TestFastModeRouting(unittest.TestCase): acp_command=None, acp_args=[], _credential_pool=None, - _smart_model_routing={}, service_tier="priority", ) - original_runtime = { - "api_key": "***", - "base_url": "https://openrouter.ai/api/v1", - "provider": "openrouter", - "api_mode": "chat_completions", - "command": None, - "args": [], - "credential_pool": None, - } - - with patch("agent.smart_model_routing.resolve_turn_route", return_value={ - "model": "gpt-5.4", - "runtime": dict(original_runtime), - "label": None, - "signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()), - }): - route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") + route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") # Provider should NOT have changed assert route["runtime"]["provider"] == "openrouter" @@ -222,26 +205,10 @@ class TestFastModeRouting(unittest.TestCase): acp_command=None, acp_args=[], _credential_pool=None, - _smart_model_routing={}, service_tier="priority", ) - primary_route = { - "model": "gpt-5.3-codex", - "runtime": { - "api_key": "***", - "base_url": "https://openrouter.ai/api/v1", - "provider": "openrouter", - "api_mode": "chat_completions", - "command": None, - "args": [], - "credential_pool": None, - }, - "label": None, - "signature": ("gpt-5.3-codex", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()), - } - with patch("agent.smart_model_routing.resolve_turn_route", return_value=primary_route): - route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") + route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") assert route["runtime"]["provider"] == "openrouter" assert route.get("request_overrides") is None @@ -329,27 +296,10 @@ class TestAnthropicFastMode(unittest.TestCase): acp_command=None, acp_args=[], _credential_pool=None, - _smart_model_routing={}, service_tier="priority", ) - original_runtime = { - "api_key": "***", - "base_url": "https://api.anthropic.com", - "provider": "anthropic", - "api_mode": "anthropic_messages", - "command": None, - "args": [], - "credential_pool": None, - } - - with patch("agent.smart_model_routing.resolve_turn_route", return_value={ - "model": "claude-opus-4-6", - "runtime": dict(original_runtime), - "label": None, - "signature": ("claude-opus-4-6", "anthropic", "https://api.anthropic.com", "anthropic_messages", None, ()), - }): - route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") + route = cli_mod.HermesCLI._resolve_turn_agent_config(stub, "hi") assert route["runtime"]["provider"] == "anthropic" assert route["request_overrides"] == {"speed": "fast"} diff --git a/tests/cron/test_codex_execution_paths.py b/tests/cron/test_codex_execution_paths.py index 354c95dde..65526f4a8 100644 --- a/tests/cron/test_codex_execution_paths.py +++ b/tests/cron/test_codex_execution_paths.py @@ -152,7 +152,6 @@ def test_gateway_run_agent_codex_path_handles_internal_401_refresh(monkeypatch): runner._provider_routing = {} runner._fallback_model = None runner._running_agents = {} - runner._smart_model_routing = {} from unittest.mock import MagicMock, AsyncMock runner.hooks = MagicMock() runner.hooks.emit = AsyncMock() diff --git a/tests/gateway/test_discord_channel_prompts.py b/tests/gateway/test_discord_channel_prompts.py index 9c475bded..e1efd734d 100644 --- a/tests/gateway/test_discord_channel_prompts.py +++ b/tests/gateway/test_discord_channel_prompts.py @@ -75,7 +75,6 @@ def _make_runner(): runner._service_tier = None runner._provider_routing = {} runner._fallback_model = None - runner._smart_model_routing = {} runner._running_agents = {} runner._pending_model_notes = {} runner._session_db = None diff --git a/tests/gateway/test_fast_command.py b/tests/gateway/test_fast_command.py index dc869ea17..82cc4fc64 100644 --- a/tests/gateway/test_fast_command.py +++ b/tests/gateway/test_fast_command.py @@ -4,7 +4,7 @@ import sys import threading import types from types import SimpleNamespace -from unittest.mock import AsyncMock, patch +from unittest.mock import AsyncMock import pytest import yaml @@ -53,7 +53,6 @@ def _make_runner(): runner._service_tier = None runner._provider_routing = {} runner._fallback_model = None - runner._smart_model_routing = {} runner._running_agents = {} runner._pending_model_notes = {} runner._session_db = None @@ -97,13 +96,7 @@ def test_turn_route_injects_priority_processing_without_changing_runtime(): "credential_pool": None, } - with patch("agent.smart_model_routing.resolve_turn_route", return_value={ - "model": "gpt-5.4", - "runtime": dict(runtime_kwargs), - "label": None, - "signature": ("gpt-5.4", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()), - }): - route = gateway_run.GatewayRunner._resolve_turn_agent_config(runner, "hi", "gpt-5.4", runtime_kwargs) + route = gateway_run.GatewayRunner._resolve_turn_agent_config(runner, "hi", "gpt-5.4", runtime_kwargs) assert route["runtime"]["provider"] == "openrouter" assert route["runtime"]["api_mode"] == "chat_completions" @@ -123,13 +116,7 @@ def test_turn_route_skips_priority_processing_for_unsupported_models(): "credential_pool": None, } - with patch("agent.smart_model_routing.resolve_turn_route", return_value={ - "model": "gpt-5.3-codex", - "runtime": dict(runtime_kwargs), - "label": None, - "signature": ("gpt-5.3-codex", "openrouter", "https://openrouter.ai/api/v1", "chat_completions", None, ()), - }): - route = gateway_run.GatewayRunner._resolve_turn_agent_config(runner, "hi", "gpt-5.3-codex", runtime_kwargs) + route = gateway_run.GatewayRunner._resolve_turn_agent_config(runner, "hi", "gpt-5.3-codex", runtime_kwargs) assert route["request_overrides"] is None diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index 9d32fc21e..013c6a3e3 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -1165,39 +1165,6 @@ Supported providers: `openrouter`, `nous`, `openai-codex`, `copilot`, `copilot-a Fallback is configured exclusively through `config.yaml` — there are no environment variables for it. For full details on when it triggers, supported providers, and how it interacts with auxiliary tasks and delegation, see [Fallback Providers](/docs/user-guide/features/fallback-providers). ::: -## Smart Model Routing - -Optional cheap-vs-strong routing lets Hermes keep your main model for complex work while sending very short/simple turns to a cheaper model. - -```yaml -smart_model_routing: - enabled: true - max_simple_chars: 160 - max_simple_words: 28 - cheap_model: - provider: openrouter - model: google/gemini-2.5-flash - # base_url: http://localhost:8000/v1 # optional custom endpoint - # key_env: MY_CUSTOM_KEY # optional env var name for that endpoint's API key -``` - -How it works: -- If a turn is short, single-line, and does not look code/tool/debug heavy, Hermes may route it to `cheap_model` -- If the turn looks complex, Hermes stays on your primary model/provider -- If the cheap route cannot be resolved cleanly, Hermes falls back to the primary model automatically - -This is intentionally conservative. It is meant for quick, low-stakes turns like: -- short factual questions -- quick rewrites -- lightweight summaries - -It will avoid routing prompts that look like: -- coding/debugging work -- tool-heavy requests -- long or multi-line analysis asks - -Use this when you want lower latency or cost without fully changing your default model. - --- ## See Also