From 05d8f11085fec55106a0d2e0ed2051baeb4b108c Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Fri, 24 Apr 2026 17:21:38 -0700
Subject: [PATCH] fix(/model): show provider-enforced context length, not raw
 models.dev (#15438)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

/model gpt-5.5 on openai-codex showed 'Context: 1,050,000 tokens' because
the display block used ModelInfo.context_window directly from models.dev.
Codex OAuth actually enforces 272K for the same slug, and the agent's
compressor already runs at 272K via get_model_context_length() — so the
banner and the real context budget said 272K while /model reported 1.05M.

Route the display context through a new resolve_display_context_length()
helper that always prefers agent.model_metadata.get_model_context_length()
(which knows about the Codex OAuth, Copilot, and Nous caps) and only falls
back to models.dev when that resolver returns nothing or raises.

Fix applied to all 3 /model display sites:
  cli.py _handle_model_switch
  gateway/run.py picker on_model_selected callback
  gateway/run.py text-fallback confirmation

Reported by @emilstridell (Telegram, April 2026).
---
 cli.py                                        | 29 +++---
 gateway/run.py                                | 39 ++++----
 hermes_cli/model_switch.py                    | 36 ++++++++
 .../test_model_switch_context_display.py      | 90 +++++++++++++++++++
 4 files changed, 161 insertions(+), 33 deletions(-)
 create mode 100644 tests/hermes_cli/test_model_switch_context_display.py

diff --git a/cli.py b/cli.py
index 00937e9f9..abd4d2391 100644
--- a/cli.py
+++ b/cli.py
@@ -5374,29 +5374,26 @@ class HermesCLI:
         _cprint(f"  ✓ Model switched: {result.new_model}")
         _cprint(f"    Provider: {provider_label}")
 
-        # Rich metadata from models.dev
+        # Context: always resolve via the provider-aware chain so Codex OAuth,
+        # Copilot, and Nous-enforced caps win over the raw models.dev entry
+        # (e.g. gpt-5.5 is 1.05M on openai but 272K on Codex OAuth).
         mi = result.model_info
+        from hermes_cli.model_switch import resolve_display_context_length
+        ctx = resolve_display_context_length(
+            result.new_model,
+            result.target_provider,
+            base_url=result.base_url or self.base_url or "",
+            api_key=result.api_key or self.api_key or "",
+            model_info=mi,
+        )
+        if ctx:
+            _cprint(f"    Context: {ctx:,} tokens")
         if mi:
-            if mi.context_window:
-                _cprint(f"    Context: {mi.context_window:,} tokens")
             if mi.max_output:
                 _cprint(f"    Max output: {mi.max_output:,} tokens")
             if mi.has_cost_data():
                 _cprint(f"    Cost: {mi.format_cost()}")
             _cprint(f"    Capabilities: {mi.format_capabilities()}")
-        else:
-            # Fallback to old context length lookup
-            try:
-                from agent.model_metadata import get_model_context_length
-                ctx = get_model_context_length(
-                    result.new_model,
-                    base_url=result.base_url or self.base_url,
-                    api_key=result.api_key or self.api_key,
-                    provider=result.target_provider,
-                )
-                _cprint(f"    Context: {ctx:,} tokens")
-            except Exception:
-                pass
 
         # Cache notice
         cache_enabled = (
diff --git a/gateway/run.py b/gateway/run.py
index 3d9648bf7..f5c1858db 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -5659,9 +5659,17 @@ class GatewayRunner:
                         lines = [f"Model switched to `{result.new_model}`"]
                         lines.append(f"Provider: {plabel}")
                         mi = result.model_info
+                        from hermes_cli.model_switch import resolve_display_context_length
+                        ctx = resolve_display_context_length(
+                            result.new_model,
+                            result.target_provider,
+                            base_url=result.base_url or current_base_url or "",
+                            api_key=result.api_key or current_api_key or "",
+                            model_info=mi,
+                        )
+                        if ctx:
+                            lines.append(f"Context: {ctx:,} tokens")
                         if mi:
-                            if mi.context_window:
-                                lines.append(f"Context: {mi.context_window:,} tokens")
                             if mi.max_output:
                                 lines.append(f"Max output: {mi.max_output:,} tokens")
                             if mi.has_cost_data():
@@ -5795,28 +5803,25 @@ class GatewayRunner:
         lines = [f"Model switched to `{result.new_model}`"]
         lines.append(f"Provider: {provider_label}")
 
-        # Rich metadata from models.dev
+        # Context: always resolve via the provider-aware chain so Codex OAuth,
+        # Copilot, and Nous-enforced caps win over the raw models.dev entry.
         mi = result.model_info
+        from hermes_cli.model_switch import resolve_display_context_length
+        ctx = resolve_display_context_length(
+            result.new_model,
+            result.target_provider,
+            base_url=result.base_url or current_base_url or "",
+            api_key=result.api_key or current_api_key or "",
+            model_info=mi,
+        )
+        if ctx:
+            lines.append(f"Context: {ctx:,} tokens")
         if mi:
-            if mi.context_window:
-                lines.append(f"Context: {mi.context_window:,} tokens")
             if mi.max_output:
                 lines.append(f"Max output: {mi.max_output:,} tokens")
             if mi.has_cost_data():
                 lines.append(f"Cost: {mi.format_cost()}")
             lines.append(f"Capabilities: {mi.format_capabilities()}")
-        else:
-            try:
-                from agent.model_metadata import get_model_context_length
-                ctx = get_model_context_length(
-                    result.new_model,
-                    base_url=result.base_url or current_base_url,
-                    api_key=result.api_key or current_api_key,
-                    provider=result.target_provider,
-                )
-                lines.append(f"Context: {ctx:,} tokens")
-            except Exception:
-                pass
 
         # Cache notice
         cache_enabled = (
diff --git a/hermes_cli/model_switch.py b/hermes_cli/model_switch.py
index 6402fa469..cc4ec055f 100644
--- a/hermes_cli/model_switch.py
+++ b/hermes_cli/model_switch.py
@@ -527,6 +527,42 @@ def _resolve_alias_fallback(
     return None
 
 
+def resolve_display_context_length(
+    model: str,
+    provider: str,
+    base_url: str = "",
+    api_key: str = "",
+    model_info: Optional[ModelInfo] = None,
+) -> Optional[int]:
+    """Resolve the context length to show in /model output.
+
+    models.dev reports per-vendor context (e.g. gpt-5.5 = 1.05M on openai)
+    but provider-enforced limits can be lower (e.g. Codex OAuth caps the
+    same slug at 272K). The authoritative source is
+    ``agent.model_metadata.get_model_context_length``, which already knows
+    about the Codex OAuth, Copilot, and Nous caps and falls back to
+    models.dev for the rest.
+
+    Prefer the provider-aware value; fall back to ``model_info.context_window``
+    only if the resolver returns nothing or raises.
+    """
+    try:
+        from agent.model_metadata import get_model_context_length
+        ctx = get_model_context_length(
+            model,
+            base_url=base_url or "",
+            api_key=api_key or "",
+            provider=provider or None,
+        )
+        if ctx:
+            return int(ctx)
+    except Exception:
+        pass
+    if model_info is not None and model_info.context_window:
+        return int(model_info.context_window)
+    return None
+
+
 # ---------------------------------------------------------------------------
 # Core model-switching pipeline
 # ---------------------------------------------------------------------------
diff --git a/tests/hermes_cli/test_model_switch_context_display.py b/tests/hermes_cli/test_model_switch_context_display.py
new file mode 100644
index 000000000..e30c5a3c6
--- /dev/null
+++ b/tests/hermes_cli/test_model_switch_context_display.py
@@ -0,0 +1,90 @@
+"""Regression test for /model context-length display on provider-capped models.
+
+Bug (April 2026): `/model gpt-5.5` on openai-codex (ChatGPT OAuth) showed
+"Context: 1,050,000 tokens" because the display code used the raw models.dev
+``ModelInfo.context_window`` (which reports the direct-OpenAI API value) instead
+of the provider-aware resolver. The agent was actually running at 272K — Codex
+OAuth's enforced cap — so the display was lying to the user.
+
+Fix: ``resolve_display_context_length()`` prefers
+``agent.model_metadata.get_model_context_length`` (which knows about Codex OAuth,
+Copilot, Nous, etc.); it uses models.dev only if that returns nothing or raises.
+"""
+from __future__ import annotations
+
+from unittest.mock import patch
+
+from hermes_cli.model_switch import resolve_display_context_length
+
+
+class _FakeModelInfo:
+    def __init__(self, ctx):
+        self.context_window = ctx
+
+
+class TestResolveDisplayContextLength:
+    def test_codex_oauth_overrides_models_dev(self):
+        """gpt-5.5 on openai-codex must show Codex's 272K cap, not models.dev's 1.05M."""
+        fake_mi = _FakeModelInfo(1_050_000)  # what models.dev reports
+        with patch(
+            "agent.model_metadata.get_model_context_length",
+            return_value=272_000,  # what Codex OAuth actually enforces
+        ):
+            ctx = resolve_display_context_length(
+                "gpt-5.5",
+                "openai-codex",
+                base_url="https://chatgpt.com/backend-api/codex",
+                api_key="",
+                model_info=fake_mi,
+            )
+        assert ctx == 272_000, (
+            "Codex OAuth's 272K cap must win over models.dev's 1.05M for gpt-5.5"
+        )
+
+    def test_falls_back_to_model_info_when_resolver_returns_none(self):
+        fake_mi = _FakeModelInfo(1_048_576)
+        with patch(
+            "agent.model_metadata.get_model_context_length", return_value=None
+        ):
+            ctx = resolve_display_context_length(
+                "some-model",
+                "some-provider",
+                model_info=fake_mi,
+            )
+        assert ctx == 1_048_576
+
+    def test_returns_none_when_both_sources_empty(self):
+        with patch(
+            "agent.model_metadata.get_model_context_length", return_value=None
+        ):
+            ctx = resolve_display_context_length(
+                "unknown-model",
+                "unknown-provider",
+                model_info=None,
+            )
+        assert ctx is None
+
+    def test_resolver_exception_falls_back_to_model_info(self):
+        fake_mi = _FakeModelInfo(200_000)
+        with patch(
+            "agent.model_metadata.get_model_context_length",
+            side_effect=RuntimeError("network down"),
+        ):
+            ctx = resolve_display_context_length(
+                "x", "y", model_info=fake_mi
+            )
+        assert ctx == 200_000
+
+    def test_prefers_resolver_even_when_model_info_has_larger_value(self):
+        """Invariant: provider-aware resolver is authoritative, even if models.dev
+        reports a bigger window."""
+        fake_mi = _FakeModelInfo(2_000_000)
+        with patch(
+            "agent.model_metadata.get_model_context_length", return_value=128_000
+        ):
+            ctx = resolve_display_context_length(
+                "capped-model",
+                "capped-provider",
+                model_info=fake_mi,
+            )
+        assert ctx == 128_000