fix(context): invalidate stale Codex OAuth cache entries >= 400k (#15078)

PR #14935 added a Codex-aware context resolver, but only new lookups hit the live /models probe. Users who had run Hermes on gpt-5.5 / 5.4 BEFORE that PR already had the wrong value (e.g. 1,050,000 from models.dev) persisted in ~/.hermes/context_length_cache.yaml, and the cache-first lookup in get_model_context_length() returns it forever.

Symptom (reported in the wild by Ludwig, min heo, Gaoge on current main at 6051fba9d, which is AFTER #14935):
  * Startup banner shows context usage against 1M
  * Compression fires late and then OpenAI hard-rejects with 'context length will be reduced from 1,050,000 to 128,000' around the real 272k boundary.

Fix: when the step-1 cache returns a value for an openai-codex lookup, check whether it's >= 400k. Codex OAuth caps every slug at 272k (live probe values) so anything at or above 400k is definitionally a pre-#14935 leftover. Drop that entry from the on-disk cache and fall through to step 5, which runs the live /models probe and re-persists the correct value (or 272k from the hardcoded fallback if the probe fails). Non-Codex providers and legitimately-cached Codex entries at 272k are untouched.

Changes:
- agent/model_metadata.py:
  * _invalidate_cached_context_length() — drop a single entry from context_length_cache.yaml and rewrite the file.
  * Step-1 cache check in get_model_context_length() now gates provider=='openai-codex' entries >= 400k through invalidation instead of returning them.

Tests (3 new in TestCodexOAuthContextLength):
- stale 1.05M Codex entry is dropped from disk AND re-resolved through the live probe to 272k; unrelated cache entries survive.
- fresh 272k Codex entry is respected (no probe call, no invalidation).
- non-Codex 1M entries (e.g. anthropic/claude-opus-4.6 on OpenRouter) are unaffected — the guard is strictly scoped to openai-codex.

Full tests/agent/test_model_metadata.py: 88 passed.
2026-06-09 08:21:50 +00:00 · 2026-04-24 04:46:07 -07:00 · 2026-04-24 04:46:07 -07:00 · 346601ca8d
commit 346601ca8d
parent 18f3fc8a6f
2 changed files with 123 additions and 1 deletions
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -737,6 +737,22 @@ def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
    return cache.get(key)


+def _invalidate_cached_context_length(model: str, base_url: str) -> None:
+    """Drop a stale cache entry so it gets re-resolved on the next lookup."""
+    key = f"{model}@{base_url}"
+    cache = _load_context_cache()
+    if key not in cache:
+        return
+    del cache[key]
+    path = _get_context_cache_path()
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "w") as f:
+            yaml.dump({"context_lengths": cache}, f, default_flow_style=False)
+    except Exception as e:
+        logger.debug("Failed to invalidate context length cache entry %s: %s", key, e)
+
+
 def get_next_probe_tier(current_length: int) -> Optional[int]:
    """Return the next lower probe tier, or None if already at minimum."""
    for tier in CONTEXT_PROBE_TIERS:
@ -1205,7 +1221,21 @@ def get_model_context_length(
    if base_url:
        cached = get_cached_context_length(model, base_url)
        if cached is not None:
-            return cached
+            # Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
+            # resolved gpt-5.x to the direct-API value (e.g. 1.05M) via
+            # models.dev and persisted it. Codex OAuth caps at 272K for every
+            # slug, so any cached Codex entry at or above 400K is a leftover
+            # from the old resolution path. Drop it and fall through to the
+            # live /models probe in step 5 below.
+            if provider == "openai-codex" and cached >= 400_000:
+                logger.info(
+                    "Dropping stale Codex cache entry %s@%s -> %s (pre-fix value); "
+                    "re-resolving via live /models probe",
+                    model, base_url, f"{cached:,}",
+                )
+                _invalidate_cached_context_length(model, base_url)
+            else:
+                return cached

    # 2. Active endpoint metadata for truly custom/unknown endpoints.
    # Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their
--- a/tests/agent/test_model_metadata.py
+++ b/tests/agent/test_model_metadata.py
@ -319,6 +319,98 @@ class TestCodexOAuthContextLength:
            "leaked outside openai-codex provider"
        )

+    def test_stale_codex_cache_over_400k_is_invalidated(self, tmp_path, monkeypatch):
+        """Pre-PR #14935 builds cached gpt-5.5 at 1.05M (from models.dev)
+        before the Codex-aware branch existed. Upgrading users keep that
+        stale entry on disk and the cache-first lookup returns it forever.
+        Codex OAuth caps at 272k for every slug, so any cached Codex
+        entry >= 400k must be dropped and re-resolved via the live probe.
+        """
+        from agent import model_metadata as mm
+
+        # Isolate the cache file to tmp_path
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        base_url = "https://chatgpt.com/backend-api/codex/"
+        stale_key = f"gpt-5.5@{base_url}"
+        other_key = "other-model@https://api.openai.com/v1/"
+        import yaml as _yaml
+        cache_file.write_text(_yaml.dump({"context_lengths": {
+            stale_key: 1_050_000,   # stale pre-fix value
+            other_key: 128_000,     # unrelated, must survive
+        }}))
+
+        fake_response = MagicMock()
+        fake_response.status_code = 200
+        fake_response.json.return_value = {
+            "models": [{"slug": "gpt-5.5", "context_window": 272_000}]
+        }
+
+        with patch("agent.model_metadata.requests.get", return_value=fake_response), \
+             patch("agent.model_metadata.save_context_length") as mock_save:
+            ctx = mm.get_model_context_length(
+                model="gpt-5.5",
+                base_url=base_url,
+                api_key="fake-token",
+                provider="openai-codex",
+            )
+
+        assert ctx == 272_000, f"Stale entry should have been re-resolved to 272k, got {ctx}"
+        # Live save was called with the fresh value
+        mock_save.assert_called_with("gpt-5.5", base_url, 272_000)
+        # The stale entry was removed from disk; unrelated entries survived
+        remaining = _yaml.safe_load(cache_file.read_text()).get("context_lengths", {})
+        assert stale_key not in remaining, "Stale entry was not invalidated from the cache file"
+        assert remaining.get(other_key) == 128_000, "Unrelated cache entries must not be touched"
+
+    def test_fresh_codex_cache_under_400k_is_respected(self, tmp_path, monkeypatch):
+        """Codex entries at the correct 272k must NOT be invalidated —
+        only stale pre-fix values (>= 400k) get dropped."""
+        from agent import model_metadata as mm
+
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        base_url = "https://chatgpt.com/backend-api/codex/"
+        import yaml as _yaml
+        cache_file.write_text(_yaml.dump({"context_lengths": {
+            f"gpt-5.5@{base_url}": 272_000,
+        }}))
+
+        # If the invalidation incorrectly fired, this would be called; assert it isn't.
+        with patch("agent.model_metadata.requests.get") as mock_get:
+            ctx = mm.get_model_context_length(
+                model="gpt-5.5",
+                base_url=base_url,
+                api_key="fake-token",
+                provider="openai-codex",
+            )
+        assert ctx == 272_000
+        mock_get.assert_not_called()
+
+    def test_stale_invalidation_scoped_to_codex_provider(self, tmp_path, monkeypatch):
+        """A cached 1M entry for a non-Codex provider (e.g. Anthropic opus on
+        OpenRouter, legitimately 1M) must NOT be invalidated by this guard."""
+        from agent import model_metadata as mm
+
+        cache_file = tmp_path / "context_length_cache.yaml"
+        monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)
+
+        base_url = "https://openrouter.ai/api/v1"
+        import yaml as _yaml
+        cache_file.write_text(_yaml.dump({"context_lengths": {
+            f"anthropic/claude-opus-4.6@{base_url}": 1_000_000,
+        }}))
+
+        ctx = mm.get_model_context_length(
+            model="anthropic/claude-opus-4.6",
+            base_url=base_url,
+            api_key="fake",
+            provider="openrouter",
+        )
+        assert ctx == 1_000_000, "Non-codex 1M cache entries must be respected"
+

 # =========================================================================
 # get_model_context_length — resolution order