mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(/model): show provider-enforced context length, not raw models.dev (#15438)
/model gpt-5.5 on openai-codex showed 'Context: 1,050,000 tokens' because the display block used ModelInfo.context_window directly from models.dev. Codex OAuth actually enforces 272K for the same slug, and the agent's compressor already runs at 272K via get_model_context_length() — so the banner + real context budget said 272K while /model lied with 1M. Route the display context through a new resolve_display_context_length() helper that always prefers agent.model_metadata.get_model_context_length (which knows about Codex OAuth, Copilot, Nous caps) and only falls back to models.dev when that returns nothing. Fix applied to all 3 /model display sites: cli.py _handle_model_switch gateway/run.py picker on_model_selected callback gateway/run.py text-fallback confirmation Reported by @emilstridell (Telegram, April 2026).
This commit is contained in:
parent
13038dc747
commit
05d8f11085
4 changed files with 161 additions and 33 deletions
29
cli.py
29
cli.py
|
|
@ -5374,29 +5374,26 @@ class HermesCLI:
|
||||||
_cprint(f" ✓ Model switched: {result.new_model}")
|
_cprint(f" ✓ Model switched: {result.new_model}")
|
||||||
_cprint(f" Provider: {provider_label}")
|
_cprint(f" Provider: {provider_label}")
|
||||||
|
|
||||||
# Rich metadata from models.dev
|
# Context: always resolve via the provider-aware chain so Codex OAuth,
|
||||||
|
# Copilot, and Nous-enforced caps win over the raw models.dev entry
|
||||||
|
# (e.g. gpt-5.5 is 1.05M on openai but 272K on Codex OAuth).
|
||||||
mi = result.model_info
|
mi = result.model_info
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
ctx = resolve_display_context_length(
|
||||||
|
result.new_model,
|
||||||
|
result.target_provider,
|
||||||
|
base_url=result.base_url or self.base_url or "",
|
||||||
|
api_key=result.api_key or self.api_key or "",
|
||||||
|
model_info=mi,
|
||||||
|
)
|
||||||
|
if ctx:
|
||||||
|
_cprint(f" Context: {ctx:,} tokens")
|
||||||
if mi:
|
if mi:
|
||||||
if mi.context_window:
|
|
||||||
_cprint(f" Context: {mi.context_window:,} tokens")
|
|
||||||
if mi.max_output:
|
if mi.max_output:
|
||||||
_cprint(f" Max output: {mi.max_output:,} tokens")
|
_cprint(f" Max output: {mi.max_output:,} tokens")
|
||||||
if mi.has_cost_data():
|
if mi.has_cost_data():
|
||||||
_cprint(f" Cost: {mi.format_cost()}")
|
_cprint(f" Cost: {mi.format_cost()}")
|
||||||
_cprint(f" Capabilities: {mi.format_capabilities()}")
|
_cprint(f" Capabilities: {mi.format_capabilities()}")
|
||||||
else:
|
|
||||||
# Fallback to old context length lookup
|
|
||||||
try:
|
|
||||||
from agent.model_metadata import get_model_context_length
|
|
||||||
ctx = get_model_context_length(
|
|
||||||
result.new_model,
|
|
||||||
base_url=result.base_url or self.base_url,
|
|
||||||
api_key=result.api_key or self.api_key,
|
|
||||||
provider=result.target_provider,
|
|
||||||
)
|
|
||||||
_cprint(f" Context: {ctx:,} tokens")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Cache notice
|
# Cache notice
|
||||||
cache_enabled = (
|
cache_enabled = (
|
||||||
|
|
|
||||||
|
|
@ -5659,9 +5659,17 @@ class GatewayRunner:
|
||||||
lines = [f"Model switched to `{result.new_model}`"]
|
lines = [f"Model switched to `{result.new_model}`"]
|
||||||
lines.append(f"Provider: {plabel}")
|
lines.append(f"Provider: {plabel}")
|
||||||
mi = result.model_info
|
mi = result.model_info
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
ctx = resolve_display_context_length(
|
||||||
|
result.new_model,
|
||||||
|
result.target_provider,
|
||||||
|
base_url=result.base_url or current_base_url or "",
|
||||||
|
api_key=result.api_key or current_api_key or "",
|
||||||
|
model_info=mi,
|
||||||
|
)
|
||||||
|
if ctx:
|
||||||
|
lines.append(f"Context: {ctx:,} tokens")
|
||||||
if mi:
|
if mi:
|
||||||
if mi.context_window:
|
|
||||||
lines.append(f"Context: {mi.context_window:,} tokens")
|
|
||||||
if mi.max_output:
|
if mi.max_output:
|
||||||
lines.append(f"Max output: {mi.max_output:,} tokens")
|
lines.append(f"Max output: {mi.max_output:,} tokens")
|
||||||
if mi.has_cost_data():
|
if mi.has_cost_data():
|
||||||
|
|
@ -5795,28 +5803,25 @@ class GatewayRunner:
|
||||||
lines = [f"Model switched to `{result.new_model}`"]
|
lines = [f"Model switched to `{result.new_model}`"]
|
||||||
lines.append(f"Provider: {provider_label}")
|
lines.append(f"Provider: {provider_label}")
|
||||||
|
|
||||||
# Rich metadata from models.dev
|
# Context: always resolve via the provider-aware chain so Codex OAuth,
|
||||||
|
# Copilot, and Nous-enforced caps win over the raw models.dev entry.
|
||||||
mi = result.model_info
|
mi = result.model_info
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
ctx = resolve_display_context_length(
|
||||||
|
result.new_model,
|
||||||
|
result.target_provider,
|
||||||
|
base_url=result.base_url or current_base_url or "",
|
||||||
|
api_key=result.api_key or current_api_key or "",
|
||||||
|
model_info=mi,
|
||||||
|
)
|
||||||
|
if ctx:
|
||||||
|
lines.append(f"Context: {ctx:,} tokens")
|
||||||
if mi:
|
if mi:
|
||||||
if mi.context_window:
|
|
||||||
lines.append(f"Context: {mi.context_window:,} tokens")
|
|
||||||
if mi.max_output:
|
if mi.max_output:
|
||||||
lines.append(f"Max output: {mi.max_output:,} tokens")
|
lines.append(f"Max output: {mi.max_output:,} tokens")
|
||||||
if mi.has_cost_data():
|
if mi.has_cost_data():
|
||||||
lines.append(f"Cost: {mi.format_cost()}")
|
lines.append(f"Cost: {mi.format_cost()}")
|
||||||
lines.append(f"Capabilities: {mi.format_capabilities()}")
|
lines.append(f"Capabilities: {mi.format_capabilities()}")
|
||||||
else:
|
|
||||||
try:
|
|
||||||
from agent.model_metadata import get_model_context_length
|
|
||||||
ctx = get_model_context_length(
|
|
||||||
result.new_model,
|
|
||||||
base_url=result.base_url or current_base_url,
|
|
||||||
api_key=result.api_key or current_api_key,
|
|
||||||
provider=result.target_provider,
|
|
||||||
)
|
|
||||||
lines.append(f"Context: {ctx:,} tokens")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Cache notice
|
# Cache notice
|
||||||
cache_enabled = (
|
cache_enabled = (
|
||||||
|
|
|
||||||
|
|
@ -527,6 +527,42 @@ def _resolve_alias_fallback(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_display_context_length(
    model: str,
    provider: str,
    base_url: str = "",
    api_key: str = "",
    model_info: Optional[ModelInfo] = None,
) -> Optional[int]:
    """Resolve the context length to show in /model output.

    models.dev reports per-vendor context windows (e.g. gpt-5.5 = 1.05M on
    openai), but provider-enforced limits can be lower (e.g. Codex OAuth caps
    the same slug at 272K). ``agent.model_metadata.get_model_context_length``
    is the authoritative source — it already knows about Codex OAuth, Copilot,
    and Nous caps and falls back to models.dev for the rest — so its answer is
    preferred; ``model_info.context_window`` is consulted only when the
    resolver yields nothing.

    Args:
        model: Model slug to resolve.
        provider: Provider identifier; an empty string is passed to the
            resolver as ``None``.
        base_url: Endpoint base URL, forwarded so the resolver can detect
            provider-specific caps.
        api_key: API key, forwarded to the resolver.
        model_info: Raw models.dev entry, used only as a fallback.

    Returns:
        The context length in tokens, or ``None`` when neither source has
        a usable value.
    """
    # Provider-aware chain first. The import is deliberately lazy and any
    # failure (missing agent package, resolver error) is swallowed so this
    # display-only path degrades to the models.dev fallback instead of
    # crashing the /model command.
    try:
        from agent.model_metadata import get_model_context_length

        enforced = get_model_context_length(
            model,
            base_url=base_url or "",
            api_key=api_key or "",
            provider=provider or None,
        )
        if enforced:
            return int(enforced)
    except Exception:
        pass

    # Fallback: the raw models.dev context window, when present and non-zero.
    if model_info is not None and model_info.context_window:
        return int(model_info.context_window)
    return None
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Core model-switching pipeline
|
# Core model-switching pipeline
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
90
tests/hermes_cli/test_model_switch_context_display.py
Normal file
90
tests/hermes_cli/test_model_switch_context_display.py
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
"""Regression test for /model context-length display on provider-capped models.
|
||||||
|
|
||||||
|
Bug (April 2026): `/model gpt-5.5` on openai-codex (ChatGPT OAuth) showed
|
||||||
|
"Context: 1,050,000 tokens" because the display code used the raw models.dev
|
||||||
|
``ModelInfo.context_window`` (which reports the direct-OpenAI API value) instead
|
||||||
|
of the provider-aware resolver. The agent was actually running at 272K — Codex
|
||||||
|
OAuth's enforced cap — so the display was lying to the user.
|
||||||
|
|
||||||
|
Fix: ``resolve_display_context_length()`` prefers
|
||||||
|
``agent.model_metadata.get_model_context_length`` (which knows about Codex OAuth,
|
||||||
|
Copilot, Nous, etc.) and falls back to models.dev only if that returns nothing.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from hermes_cli.model_switch import resolve_display_context_length
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeModelInfo:
|
||||||
|
def __init__(self, ctx):
|
||||||
|
self.context_window = ctx
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolveDisplayContextLength:
    """Behavioral contract for ``resolve_display_context_length``.

    Patching ``agent.model_metadata.get_model_context_length`` works even
    though production imports it function-locally: the import runs at call
    time, after the patch is already in place.
    """

    def test_codex_oauth_overrides_models_dev(self):
        """gpt-5.5 on openai-codex must show Codex's 272K cap, not models.dev's 1.05M."""
        models_dev_entry = _FakeModelInfo(1_050_000)  # what models.dev reports
        with patch(
            "agent.model_metadata.get_model_context_length",
            return_value=272_000,  # what Codex OAuth actually enforces
        ):
            ctx = resolve_display_context_length(
                "gpt-5.5",
                "openai-codex",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="",
                model_info=models_dev_entry,
            )
        assert ctx == 272_000, (
            "Codex OAuth's 272K cap must win over models.dev's 1.05M for gpt-5.5"
        )

    def test_falls_back_to_model_info_when_resolver_returns_none(self):
        models_dev_entry = _FakeModelInfo(1_048_576)
        with patch(
            "agent.model_metadata.get_model_context_length", return_value=None
        ):
            ctx = resolve_display_context_length(
                "some-model",
                "some-provider",
                model_info=models_dev_entry,
            )
        assert ctx == 1_048_576

    def test_returns_none_when_both_sources_empty(self):
        with patch(
            "agent.model_metadata.get_model_context_length", return_value=None
        ):
            ctx = resolve_display_context_length(
                "unknown-model",
                "unknown-provider",
                model_info=None,
            )
        assert ctx is None

    def test_resolver_exception_falls_back_to_model_info(self):
        models_dev_entry = _FakeModelInfo(200_000)
        with patch(
            "agent.model_metadata.get_model_context_length",
            side_effect=RuntimeError("network down"),
        ):
            ctx = resolve_display_context_length("x", "y", model_info=models_dev_entry)
        assert ctx == 200_000

    def test_prefers_resolver_even_when_model_info_has_larger_value(self):
        """Invariant: the provider-aware resolver is authoritative, even if
        models.dev reports a bigger window."""
        models_dev_entry = _FakeModelInfo(2_000_000)
        with patch(
            "agent.model_metadata.get_model_context_length", return_value=128_000
        ):
            ctx = resolve_display_context_length(
                "capped-model",
                "capped-provider",
                model_info=models_dev_entry,
            )
        assert ctx == 128_000
|
||||||
Loading…
Add table
Add a link
Reference in a new issue