fix(context): resolve real Codex OAuth context windows (272k, not 1M) (#14935)

On ChatGPT Codex OAuth every gpt-5.x slug actually caps at 272,000 tokens,
but Hermes was resolving gpt-5.5 / gpt-5.4 to 1,050,000 (from models.dev)
because openai-codex aliases to the openai entry there. At 1.05M the
compressor never fires and requests hard-fail with 'context window
exceeded' around the real 272k boundary.
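
A minimal sketch of the failure mode (illustrative only; the trigger
fraction and function name are hypothetical, not Hermes's actual
compressor internals):

    # Hypothetical: compression triggers at a fixed fraction of the
    # *resolved* context window.
    COMPRESS_AT_FRACTION = 0.8  # assumed trigger point, for illustration

    def should_compress(prompt_tokens: int, resolved_window: int) -> bool:
        return prompt_tokens >= resolved_window * COMPRESS_AT_FRACTION

    # Resolved at 1.05M, the trigger sits near 840k, well past Codex's
    # real 272k cap, so requests are rejected before compression fires:
    assert not should_compress(272_001, 1_050_000)
    # Resolved at the true 272k, compression fires with headroom:
    assert should_compress(230_000, 272_000)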

Verified live against chatgpt.com/backend-api/codex/models:
  gpt-5.5, gpt-5.4, gpt-5.4-mini, gpt-5.3-codex, gpt-5.2-codex,
  gpt-5.2, gpt-5.1-codex-max → context_window = 272000
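
For reference, a minimal probe sketch (assumes a valid OAuth bearer token
in a hypothetical CODEX_ACCESS_TOKEN env var):

    import os
    import requests

    resp = requests.get(
        "https://chatgpt.com/backend-api/codex/models?client_version=1.0.0",
        headers={"Authorization": f"Bearer {os.environ['CODEX_ACCESS_TOKEN']}"},
        timeout=10,
    )
    resp.raise_for_status()
    for entry in resp.json().get("models", []):
        print(entry.get("slug"), entry.get("context_window"))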

Changes:
- agent/model_metadata.py:
  * _fetch_codex_oauth_context_lengths() — probe the Codex /models
    endpoint with the OAuth bearer token and read context_window per
    slug (1h in-memory TTL).
  * _resolve_codex_oauth_context_length() — prefer the live probe,
    fall back to hardcoded _CODEX_OAUTH_CONTEXT_FALLBACK (all 272k).
  * Wire into get_model_context_length() when provider=='openai-codex',
    running BEFORE the models.dev lookup (which returns 1.05M). Result
    persists via save_context_length() so subsequent lookups skip the
    probe entirely.
  * Fixed the now-wrong comment on the DEFAULT_CONTEXT_LENGTHS gpt-5.5
    entry (400k was never right for Codex; it's the catch-all for
    providers we can't probe live).
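
In short, the resolution precedence for Codex slugs (a condensed sketch of
the flow in the diff below, not the actual function bodies):

    from typing import Dict, Optional

    def resolve_codex_ctx(model: str, token: str, cached: Optional[int],
                          live: Dict[str, int],
                          fallback: Dict[str, int]) -> Optional[int]:
        if cached:                       # 1. persisted via save_context_length()
            return cached
        if token and model in live:      # 2. live probe (1h TTL) is authoritative
            return live[model]
        for slug in sorted(fallback, key=len, reverse=True):
            if slug in model.lower():    # 3. longest-slug-first substring match
                return fallback[slug]
        return None                      # 4. only then fall through to models.dev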

Tests (4 new in TestCodexOAuthContextLength):
- fallback table used when no token is available (no models.dev leakage)
- live probe overrides the fallback
- probe failure (non-200) falls back to hardcoded 272k
- non-codex providers (openrouter, direct openai) unaffected

Non-codex context resolution is unchanged — the Codex branch only fires
when provider=='openai-codex'.
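
Caller-visible behavior after the change, mirroring the new tests (sketch):

    from agent.model_metadata import get_model_context_length

    # Codex OAuth: resolves to the real 272k cap (live probe, else fallback).
    ctx = get_model_context_length(
        model="gpt-5.5",
        base_url="https://chatgpt.com/backend-api/codex",
        api_key="",  # no token -> hardcoded fallback table
        provider="openai-codex",
    )
    assert ctx == 272_000

    # Any other provider bypasses the Codex branch entirely.
    get_model_context_length(
        model="openai/gpt-5.5",
        base_url="https://openrouter.ai/api/v1",
        api_key="",
        provider="openrouter",
    )  # resolved via the usual catalog / models.dev path
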
Teknium 2026-04-23 22:39:47 -07:00 committed by GitHub
parent 2e78a2b6b2
commit 51f4c9827f
2 changed files with 241 additions and 2 deletions

agent/model_metadata.py

@@ -123,8 +123,9 @@ DEFAULT_CONTEXT_LENGTHS = {
"claude": 200000,
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
# Source: https://developers.openai.com/api/docs/models
- # GPT-5.5 (launched Apr 23 2026). Verified via live ChatGPT codex/models
- # endpoint: bare slug `gpt-5.5`, no -pro/-mini variants. 400k context on Codex.
+ # GPT-5.5 (launched Apr 23 2026). 400k is the fallback for providers we
+ # can't probe live. ChatGPT Codex OAuth actually caps lower (272k as of
+ # Apr 2026) and is resolved via _resolve_codex_oauth_context_length().
"gpt-5.5": 400000,
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
@@ -1005,6 +1006,115 @@ def _query_anthropic_context_length(model: str, base_url: str, api_key: str) ->
return None
# Known ChatGPT Codex OAuth context windows (observed via live
# chatgpt.com/backend-api/codex/models probe, Apr 2026). These are the
# `context_window` values, which are what Codex actually enforces — the
# direct OpenAI API has larger limits for the same slugs, but Codex OAuth
# caps lower (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex).
#
# Used as a fallback when the live probe fails (no token, network error).
# Matching is substring-based; the resolver tries longer slugs first so
# the most specific entry wins (see _resolve_codex_oauth_context_length).
_CODEX_OAUTH_CONTEXT_FALLBACK: Dict[str, int] = {
"gpt-5.1-codex-max": 272_000,
"gpt-5.1-codex-mini": 272_000,
"gpt-5.3-codex": 272_000,
"gpt-5.2-codex": 272_000,
"gpt-5.4-mini": 272_000,
"gpt-5.5": 272_000,
"gpt-5.4": 272_000,
"gpt-5.2": 272_000,
"gpt-5": 272_000,
}
_codex_oauth_context_cache: Dict[str, int] = {}
_codex_oauth_context_cache_time: float = 0.0
_CODEX_OAUTH_CONTEXT_CACHE_TTL = 3600 # 1 hour
def _fetch_codex_oauth_context_lengths(access_token: str) -> Dict[str, int]:
"""Probe the ChatGPT Codex /models endpoint for per-slug context windows.
Codex OAuth imposes its own context limits that differ from the direct
OpenAI API (e.g. gpt-5.5 is 1.05M on the API, 272K on Codex). The
`context_window` field in each model entry is the authoritative source.
Returns a ``{slug: context_window}`` dict. Empty on failure.
"""
global _codex_oauth_context_cache, _codex_oauth_context_cache_time
now = time.time()
if (
_codex_oauth_context_cache
and now - _codex_oauth_context_cache_time < _CODEX_OAUTH_CONTEXT_CACHE_TTL
):
return _codex_oauth_context_cache
try:
resp = requests.get(
"https://chatgpt.com/backend-api/codex/models?client_version=1.0.0",
headers={"Authorization": f"Bearer {access_token}"},
timeout=10,
)
if resp.status_code != 200:
logger.debug(
"Codex /models probe returned HTTP %s; falling back to hardcoded defaults",
resp.status_code,
)
return {}
data = resp.json()
except Exception as exc:
logger.debug("Codex /models probe failed: %s", exc)
return {}
entries = data.get("models", []) if isinstance(data, dict) else []
result: Dict[str, int] = {}
for item in entries:
if not isinstance(item, dict):
continue
slug = item.get("slug")
ctx = item.get("context_window")
if isinstance(slug, str) and isinstance(ctx, int) and ctx > 0:
result[slug.strip()] = ctx
if result:
_codex_oauth_context_cache = result
_codex_oauth_context_cache_time = now
return result
def _resolve_codex_oauth_context_length(
model: str, access_token: str = ""
) -> Optional[int]:
"""Resolve a Codex OAuth model's real context window.
Prefers a live probe of chatgpt.com/backend-api/codex/models (when we
have a bearer token), then falls back to ``_CODEX_OAUTH_CONTEXT_FALLBACK``.
"""
model_bare = _strip_provider_prefix(model).strip()
if not model_bare:
return None
if access_token:
live = _fetch_codex_oauth_context_lengths(access_token)
if model_bare in live:
return live[model_bare]
# Case-insensitive match in case casing drifts
model_lower = model_bare.lower()
for slug, ctx in live.items():
if slug.lower() == model_lower:
return ctx
# Fallback: longest-key-first substring match over hardcoded defaults.
model_lower = model_bare.lower()
for slug, ctx in sorted(
_CODEX_OAUTH_CONTEXT_FALLBACK.items(), key=lambda x: len(x[0]), reverse=True
):
if slug in model_lower:
return ctx
return None
def _resolve_nous_context_length(model: str) -> Optional[int]:
"""Resolve Nous Portal model context length via OpenRouter metadata.
@@ -1149,6 +1259,15 @@ def get_model_context_length(
ctx = _resolve_nous_context_length(model)
if ctx:
return ctx
if effective_provider == "openai-codex":
# Codex OAuth enforces lower context limits than the direct OpenAI
# API for the same slug (e.g. gpt-5.5 is 1.05M on the API but 272K
# on Codex). Authoritative source is Codex's own /models endpoint.
codex_ctx = _resolve_codex_oauth_context_length(model, access_token=api_key or "")
if codex_ctx:
if base_url:
save_context_length(model, base_url, codex_ctx)
return codex_ctx
if effective_provider:
from agent.models_dev import lookup_models_dev_context
ctx = lookup_models_dev_context(effective_provider, model)


@@ -200,6 +200,126 @@ class TestDefaultContextLengths:
assert len(DEFAULT_CONTEXT_LENGTHS) >= 10
# =========================================================================
# Codex OAuth context-window resolution (provider="openai-codex")
# =========================================================================
class TestCodexOAuthContextLength:
"""ChatGPT Codex OAuth imposes lower context limits than the direct
OpenAI API for the same slugs. Verified Apr 2026 via live probe of
chatgpt.com/backend-api/codex/models: every model returns 272k, while
models.dev reports 1.05M for gpt-5.5/gpt-5.4 and 400k for the rest.
"""
def setup_method(self):
import agent.model_metadata as mm
mm._codex_oauth_context_cache = {}
mm._codex_oauth_context_cache_time = 0.0
def test_fallback_table_used_without_token(self):
"""With no access token, the hardcoded Codex fallback table wins
over models.dev (which reports 1.05M for gpt-5.5 but Codex is 272k).
"""
from agent.model_metadata import get_model_context_length
with patch("agent.model_metadata.get_cached_context_length", return_value=None), \
patch("agent.model_metadata.save_context_length"):
for model in (
"gpt-5.5",
"gpt-5.4",
"gpt-5.4-mini",
"gpt-5.3-codex",
"gpt-5.2-codex",
"gpt-5.1-codex-max",
"gpt-5.1-codex-mini",
):
ctx = get_model_context_length(
model=model,
base_url="https://chatgpt.com/backend-api/codex",
api_key="",
provider="openai-codex",
)
assert ctx == 272_000, (
f"Codex {model}: expected 272000 fallback, got {ctx} "
"(models.dev leakage?)"
)
def test_live_probe_overrides_fallback(self):
"""When a token is provided, the live /models probe is preferred
and its context_window drives the result."""
from agent.model_metadata import get_model_context_length
fake_response = MagicMock()
fake_response.status_code = 200
fake_response.json.return_value = {
"models": [
{"slug": "gpt-5.5", "context_window": 300_000},
{"slug": "gpt-5.4", "context_window": 400_000},
]
}
with patch("agent.model_metadata.requests.get", return_value=fake_response), \
patch("agent.model_metadata.get_cached_context_length", return_value=None), \
patch("agent.model_metadata.save_context_length"):
ctx_55 = get_model_context_length(
model="gpt-5.5",
base_url="https://chatgpt.com/backend-api/codex",
api_key="fake-token",
provider="openai-codex",
)
ctx_54 = get_model_context_length(
model="gpt-5.4",
base_url="https://chatgpt.com/backend-api/codex",
api_key="fake-token",
provider="openai-codex",
)
assert ctx_55 == 300_000
assert ctx_54 == 400_000
def test_probe_failure_falls_back_to_hardcoded(self):
"""If the probe fails (non-200 / network error), we still return
the hardcoded 272k rather than leaking through to models.dev 1.05M."""
from agent.model_metadata import get_model_context_length
fake_response = MagicMock()
fake_response.status_code = 401
fake_response.json.return_value = {}
with patch("agent.model_metadata.requests.get", return_value=fake_response), \
patch("agent.model_metadata.get_cached_context_length", return_value=None), \
patch("agent.model_metadata.save_context_length"):
ctx = get_model_context_length(
model="gpt-5.5",
base_url="https://chatgpt.com/backend-api/codex",
api_key="expired-token",
provider="openai-codex",
)
assert ctx == 272_000
def test_non_codex_providers_unaffected(self):
"""Resolving gpt-5.5 on non-Codex providers must NOT use the Codex
272k override; OpenRouter / direct OpenAI API have different limits.
"""
from agent.model_metadata import get_model_context_length
# OpenRouter — should hit its own catalog path first; when mocked
# empty, falls through to hardcoded DEFAULT_CONTEXT_LENGTHS (400k).
with patch("agent.model_metadata.fetch_model_metadata", return_value={}), \
patch("agent.model_metadata.fetch_endpoint_model_metadata", return_value={}), \
patch("agent.model_metadata.get_cached_context_length", return_value=None), \
patch("agent.models_dev.lookup_models_dev_context", return_value=None):
ctx = get_model_context_length(
model="openai/gpt-5.5",
base_url="https://openrouter.ai/api/v1",
api_key="",
provider="openrouter",
)
assert ctx == 400_000, (
f"Non-Codex gpt-5.5 resolved to {ctx}; Codex 272k override "
"leaked outside openai-codex provider"
)
# =========================================================================
# get_model_context_length — resolution order
# =========================================================================