fix(context): resolve real Codex OAuth context windows (272k, not 1M) (#14935)

On ChatGPT Codex OAuth every gpt-5.x slug actually caps at 272,000 tokens,
but Hermes was resolving gpt-5.5 / gpt-5.4 to 1,050,000 (from models.dev)
because openai-codex aliases to the openai entry there. At 1.05M the
compressor never fires and requests hard-fail with 'context window
exceeded' around the real 272k boundary.

Verified live against chatgpt.com/backend-api/codex/models:
  gpt-5.5, gpt-5.4, gpt-5.4-mini, gpt-5.3-codex, gpt-5.2-codex,
  gpt-5.2, gpt-5.1-codex-max → context_window = 272000

Changes:
- agent/model_metadata.py:
  * _fetch_codex_oauth_context_lengths() — probe the Codex /models
    endpoint with the OAuth bearer token and read context_window per
    slug (1h in-memory TTL).
  * _resolve_codex_oauth_context_length() — prefer the live probe,
    fall back to hardcoded _CODEX_OAUTH_CONTEXT_FALLBACK (all 272k).
  * Wire into get_model_context_length() when provider=='openai-codex',
    running BEFORE the models.dev lookup (which returns 1.05M). Result
    persists via save_context_length() so subsequent lookups skip the
    probe entirely.
  * Fixed the now-wrong comment on the DEFAULT_CONTEXT_LENGTHS gpt-5.5
    entry (400k was never right for Codex; it's the catch-all for
    providers we can't probe live).

Tests (4 new in TestCodexOAuthContextLength):
- fallback table used when no token is available (no models.dev leakage)
- live probe overrides the fallback
- probe failure (non-200) falls back to hardcoded 272k
- non-codex providers (openrouter, direct openai) unaffected

Non-codex context resolution is unchanged — the Codex branch only fires
when provider=='openai-codex'.
This commit is contained in:
Teknium 2026-04-23 22:39:47 -07:00 committed by GitHub
parent 2e78a2b6b2
commit 51f4c9827f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 241 additions and 2 deletions

View file

@ -200,6 +200,126 @@ class TestDefaultContextLengths:
assert len(DEFAULT_CONTEXT_LENGTHS) >= 10
# =========================================================================
# Codex OAuth context-window resolution (provider="openai-codex")
# =========================================================================
class TestCodexOAuthContextLength:
    """ChatGPT Codex OAuth imposes lower context limits than the direct
    OpenAI API for the same slugs. Verified Apr 2026 via live probe of
    chatgpt.com/backend-api/codex/models: every model returns 272k, while
    models.dev reports 1.05M for gpt-5.5/gpt-5.4 and 400k for the rest.
    """

    def setup_method(self):
        # Reset the module-level probe cache before every test; the probe
        # result has a 1h in-memory TTL, so without this a hit from one
        # test would leak into the next.
        import agent.model_metadata as mm

        mm._codex_oauth_context_cache = {}
        mm._codex_oauth_context_cache_time = 0.0

    def test_fallback_table_used_without_token(self):
        """With no access token, the hardcoded Codex fallback table wins
        over models.dev (which reports 1.05M for gpt-5.5 but Codex is 272k).
        """
        from agent.model_metadata import get_model_context_length

        # Disk cache is mocked empty so resolution order is exercised,
        # and save_context_length is stubbed to avoid writing state.
        with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.save_context_length"):
            for model in (
                "gpt-5.5",
                "gpt-5.4",
                "gpt-5.4-mini",
                "gpt-5.3-codex",
                "gpt-5.2-codex",
                "gpt-5.1-codex-max",
                "gpt-5.1-codex-mini",
            ):
                ctx = get_model_context_length(
                    model=model,
                    base_url="https://chatgpt.com/backend-api/codex",
                    api_key="",  # no token -> live probe cannot run
                    provider="openai-codex",
                )
                assert ctx == 272_000, (
                    f"Codex {model}: expected 272000 fallback, got {ctx} "
                    "(models.dev leakage?)"
                )

    def test_live_probe_overrides_fallback(self):
        """When a token is provided, the live /models probe is preferred
        and its context_window drives the result."""
        from agent.model_metadata import get_model_context_length

        # Fake a successful Codex /models response whose values differ
        # from the 272k fallback, so we can tell which source won.
        fake_response = MagicMock()
        fake_response.status_code = 200
        fake_response.json.return_value = {
            "models": [
                {"slug": "gpt-5.5", "context_window": 300_000},
                {"slug": "gpt-5.4", "context_window": 400_000},
            ]
        }
        with patch("agent.model_metadata.requests.get", return_value=fake_response), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.save_context_length"):
            ctx_55 = get_model_context_length(
                model="gpt-5.5",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="fake-token",
                provider="openai-codex",
            )
            ctx_54 = get_model_context_length(
                model="gpt-5.4",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="fake-token",
                provider="openai-codex",
            )
        assert ctx_55 == 300_000
        assert ctx_54 == 400_000

    def test_probe_failure_falls_back_to_hardcoded(self):
        """If the probe fails (non-200 status), we still return the
        hardcoded 272k rather than leaking through to models.dev 1.05M.

        NOTE(review): only the non-200 path is exercised here; a raised
        network error (e.g. requests.ConnectionError) is not covered and
        would be a worthwhile additional case.
        """
        from agent.model_metadata import get_model_context_length

        fake_response = MagicMock()
        fake_response.status_code = 401  # e.g. expired OAuth token
        fake_response.json.return_value = {}
        with patch("agent.model_metadata.requests.get", return_value=fake_response), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.model_metadata.save_context_length"):
            ctx = get_model_context_length(
                model="gpt-5.5",
                base_url="https://chatgpt.com/backend-api/codex",
                api_key="expired-token",
                provider="openai-codex",
            )
        assert ctx == 272_000

    def test_non_codex_providers_unaffected(self):
        """Resolving gpt-5.5 on non-Codex providers must NOT use the Codex
        272k override — OpenRouter and the direct OpenAI API have different
        limits.
        """
        from agent.model_metadata import get_model_context_length

        # OpenRouter — should hit its own catalog path first; when mocked
        # empty, falls through to hardcoded DEFAULT_CONTEXT_LENGTHS (400k).
        with patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
             patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
             patch("agent.model_metadata.get_cached_context_length", return_value=None), \
             patch("agent.models_dev.lookup_models_dev_context", return_value=None):
            ctx = get_model_context_length(
                model="openai/gpt-5.5",
                base_url="https://openrouter.ai/api/v1",
                api_key="",
                provider="openrouter",
            )
        assert ctx == 400_000, (
            f"Non-Codex gpt-5.5 resolved to {ctx}; Codex 272k override "
            "leaked outside openai-codex provider"
        )
# =========================================================================
# get_model_context_length — resolution order
# =========================================================================