From 05d8f11085fec55106a0d2e0ed2051baeb4b108c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 24 Apr 2026 17:21:38 -0700 Subject: [PATCH] fix(/model): show provider-enforced context length, not raw models.dev (#15438) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /model gpt-5.5 on openai-codex showed 'Context: 1,050,000 tokens' because the display block used ModelInfo.context_window directly from models.dev. Codex OAuth actually enforces 272K for the same slug, and the agent's compressor already runs at 272K via get_model_context_length() — so the banner + real context budget said 272K while /model lied with 1M. Route the display context through a new resolve_display_context_length() helper that always prefers agent.model_metadata.get_model_context_length (which knows about Codex OAuth, Copilot, Nous caps) and only falls back to models.dev when that returns nothing. Fix applied to all 3 /model display sites: cli.py _handle_model_switch gateway/run.py picker on_model_selected callback gateway/run.py text-fallback confirmation Reported by @emilstridell (Telegram, April 2026). --- cli.py | 29 +++--- gateway/run.py | 39 ++++---- hermes_cli/model_switch.py | 36 ++++++++ .../test_model_switch_context_display.py | 90 +++++++++++++++++++ 4 files changed, 161 insertions(+), 33 deletions(-) create mode 100644 tests/hermes_cli/test_model_switch_context_display.py diff --git a/cli.py b/cli.py index 00937e9f9..abd4d2391 100644 --- a/cli.py +++ b/cli.py @@ -5374,29 +5374,26 @@ class HermesCLI: _cprint(f" ✓ Model switched: {result.new_model}") _cprint(f" Provider: {provider_label}") - # Rich metadata from models.dev + # Context: always resolve via the provider-aware chain so Codex OAuth, + # Copilot, and Nous-enforced caps win over the raw models.dev entry + # (e.g. gpt-5.5 is 1.05M on openai but 272K on Codex OAuth). mi = result.model_info + from hermes_cli.model_switch import resolve_display_context_length + ctx = resolve_display_context_length( + result.new_model, + result.target_provider, + base_url=result.base_url or self.base_url or "", + api_key=result.api_key or self.api_key or "", + model_info=mi, + ) + if ctx: + _cprint(f" Context: {ctx:,} tokens") if mi: - if mi.context_window: - _cprint(f" Context: {mi.context_window:,} tokens") if mi.max_output: _cprint(f" Max output: {mi.max_output:,} tokens") if mi.has_cost_data(): _cprint(f" Cost: {mi.format_cost()}") _cprint(f" Capabilities: {mi.format_capabilities()}") - else: - # Fallback to old context length lookup - try: - from agent.model_metadata import get_model_context_length - ctx = get_model_context_length( - result.new_model, - base_url=result.base_url or self.base_url, - api_key=result.api_key or self.api_key, - provider=result.target_provider, - ) - _cprint(f" Context: {ctx:,} tokens") - except Exception: - pass # Cache notice cache_enabled = ( diff --git a/gateway/run.py b/gateway/run.py index 3d9648bf7..f5c1858db 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -5659,9 +5659,17 @@ class GatewayRunner: lines = [f"Model switched to `{result.new_model}`"] lines.append(f"Provider: {plabel}") mi = result.model_info + from hermes_cli.model_switch import resolve_display_context_length + ctx = resolve_display_context_length( + result.new_model, + result.target_provider, + base_url=result.base_url or current_base_url or "", + api_key=result.api_key or current_api_key or "", + model_info=mi, + ) + if ctx: + lines.append(f"Context: {ctx:,} tokens") if mi: - if mi.context_window: - lines.append(f"Context: {mi.context_window:,} tokens") if mi.max_output: lines.append(f"Max output: {mi.max_output:,} tokens") if mi.has_cost_data(): @@ -5795,28 +5803,25 @@ class GatewayRunner: lines = [f"Model switched to `{result.new_model}`"] lines.append(f"Provider: {provider_label}") - # Rich metadata from models.dev + # Context: always resolve via the provider-aware chain so Codex OAuth, + # Copilot, and Nous-enforced caps win over the raw models.dev entry. mi = result.model_info + from hermes_cli.model_switch import resolve_display_context_length + ctx = resolve_display_context_length( + result.new_model, + result.target_provider, + base_url=result.base_url or current_base_url or "", + api_key=result.api_key or current_api_key or "", + model_info=mi, + ) + if ctx: + lines.append(f"Context: {ctx:,} tokens") if mi: - if mi.context_window: - lines.append(f"Context: {mi.context_window:,} tokens") if mi.max_output: lines.append(f"Max output: {mi.max_output:,} tokens") if mi.has_cost_data(): lines.append(f"Cost: {mi.format_cost()}") lines.append(f"Capabilities: {mi.format_capabilities()}") - else: - try: - from agent.model_metadata import get_model_context_length - ctx = get_model_context_length( - result.new_model, - base_url=result.base_url or current_base_url, - api_key=result.api_key or current_api_key, - provider=result.target_provider, - ) - lines.append(f"Context: {ctx:,} tokens") - except Exception: - pass # Cache notice cache_enabled = ( diff --git a/hermes_cli/model_switch.py b/hermes_cli/model_switch.py index 6402fa469..cc4ec055f 100644 --- a/hermes_cli/model_switch.py +++ b/hermes_cli/model_switch.py @@ -527,6 +527,42 @@ def _resolve_alias_fallback( return None +def resolve_display_context_length( + model: str, + provider: str, + base_url: str = "", + api_key: str = "", + model_info: Optional[ModelInfo] = None, +) -> Optional[int]: + """Resolve the context length to show in /model output. + + models.dev reports per-vendor context (e.g. gpt-5.5 = 1.05M on openai) + but provider-enforced limits can be lower (e.g. Codex OAuth caps the + same slug at 272k). The authoritative source is + ``agent.model_metadata.get_model_context_length`` which already knows + about Codex OAuth, Copilot, Nous, and falls back to models.dev for the + rest. + + Prefer the provider-aware value; fall back to ``model_info.context_window`` + only if the resolver returns nothing. + """ + try: + from agent.model_metadata import get_model_context_length + ctx = get_model_context_length( + model, + base_url=base_url or "", + api_key=api_key or "", + provider=provider or None, + ) + if ctx: + return int(ctx) + except Exception: + pass + if model_info is not None and model_info.context_window: + return int(model_info.context_window) + return None + + # --------------------------------------------------------------------------- # Core model-switching pipeline # --------------------------------------------------------------------------- diff --git a/tests/hermes_cli/test_model_switch_context_display.py b/tests/hermes_cli/test_model_switch_context_display.py new file mode 100644 index 000000000..e30c5a3c6 --- /dev/null +++ b/tests/hermes_cli/test_model_switch_context_display.py @@ -0,0 +1,90 @@ +"""Regression test for /model context-length display on provider-capped models. + +Bug (April 2026): `/model gpt-5.5` on openai-codex (ChatGPT OAuth) showed +"Context: 1,050,000 tokens" because the display code used the raw models.dev +``ModelInfo.context_window`` (which reports the direct-OpenAI API value) instead +of the provider-aware resolver. The agent was actually running at 272K — Codex +OAuth's enforced cap — so the display was lying to the user. + +Fix: ``resolve_display_context_length()`` prefers +``agent.model_metadata.get_model_context_length`` (which knows about Codex OAuth, +Copilot, Nous, etc.) and falls back to models.dev only if that returns nothing. +""" +from __future__ import annotations + +from unittest.mock import patch + +from hermes_cli.model_switch import resolve_display_context_length + + +class _FakeModelInfo: + def __init__(self, ctx): + self.context_window = ctx + + +class TestResolveDisplayContextLength: + def test_codex_oauth_overrides_models_dev(self): + """gpt-5.5 on openai-codex must show Codex's 272K cap, not models.dev's 1.05M.""" + fake_mi = _FakeModelInfo(1_050_000) # what models.dev reports + with patch( + "agent.model_metadata.get_model_context_length", + return_value=272_000, # what Codex OAuth actually enforces + ): + ctx = resolve_display_context_length( + "gpt-5.5", + "openai-codex", + base_url="https://chatgpt.com/backend-api/codex", + api_key="", + model_info=fake_mi, + ) + assert ctx == 272_000, ( + "Codex OAuth's 272K cap must win over models.dev's 1.05M for gpt-5.5" + ) + + def test_falls_back_to_model_info_when_resolver_returns_none(self): + fake_mi = _FakeModelInfo(1_048_576) + with patch( + "agent.model_metadata.get_model_context_length", return_value=None + ): + ctx = resolve_display_context_length( + "some-model", + "some-provider", + model_info=fake_mi, + ) + assert ctx == 1_048_576 + + def test_returns_none_when_both_sources_empty(self): + with patch( + "agent.model_metadata.get_model_context_length", return_value=None + ): + ctx = resolve_display_context_length( + "unknown-model", + "unknown-provider", + model_info=None, + ) + assert ctx is None + + def test_resolver_exception_falls_back_to_model_info(self): + fake_mi = _FakeModelInfo(200_000) + with patch( + "agent.model_metadata.get_model_context_length", + side_effect=RuntimeError("network down"), + ): + ctx = resolve_display_context_length( + "x", "y", model_info=fake_mi + ) + assert ctx == 200_000 + + def test_prefers_resolver_even_when_model_info_has_larger_value(self): + """Invariant: provider-aware resolver is authoritative, even if models.dev + reports a bigger window.""" + fake_mi = _FakeModelInfo(2_000_000) + with patch( + "agent.model_metadata.get_model_context_length", return_value=128_000 + ): + ctx = resolve_display_context_length( + "capped-model", + "capped-provider", + model_info=fake_mi, + ) + assert ctx == 128_000