fix(context): invalidate stale Codex OAuth cache entries >= 400k (#15078)

PR #14935 added a Codex-aware context resolver but only new lookups
hit the live /models probe. Users who had run Hermes on gpt-5.5 / 5.4
BEFORE that PR already had the wrong value (e.g. 1,050,000 from
models.dev) persisted in ~/.hermes/context_length_cache.yaml, and the
cache-first lookup in get_model_context_length() returns it forever.

Symptom (reported in the wild by Ludwig, min heo, Gaoge on current
main at 6051fba9d, which is AFTER #14935):
  * Startup banner shows context usage against 1M
  * Compression fires late and then OpenAI hard-rejects with
    'context length will be reduced from 1,050,000 to 128,000'
    around the real 272k boundary.

Fix: when the step-1 cache returns a value for an openai-codex lookup,
check whether it's >= 400k. Codex OAuth caps every slug at 272k (live
probe values) so anything at or above 400k is definitionally a
pre-#14935 leftover. Drop that entry from the on-disk cache and fall
through to step 5, which runs the live /models probe and re-persists
the correct value (or 272k from the hardcoded fallback if the probe
fails). Non-Codex providers and legitimately-cached Codex entries at
272k are untouched.

Changes:
- agent/model_metadata.py:
  * _invalidate_cached_context_length() — drop a single entry from
    context_length_cache.yaml and rewrite the file.
  * Step-1 cache check in get_model_context_length() now gates
    provider=='openai-codex' entries >= 400k through invalidation
    instead of returning them.

Tests (3 new in TestCodexOAuthContextLength):
- stale 1.05M Codex entry is dropped from disk AND re-resolved
  through the live probe to 272k; unrelated cache entries survive.
- fresh 272k Codex entry is respected (no probe call, no invalidation).
- non-Codex 1M entries (e.g. anthropic/claude-opus-4.6 on OpenRouter)
  are unaffected — the guard is strictly scoped to openai-codex.

Full tests/agent/test_model_metadata.py: 88 passed.
This commit is contained in:
Teknium 2026-04-24 04:46:07 -07:00 committed by GitHub
parent 18f3fc8a6f
commit 346601ca8d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 123 additions and 1 deletions

View file

@ -737,6 +737,22 @@ def get_cached_context_length(model: str, base_url: str) -> Optional[int]:
return cache.get(key)
def _invalidate_cached_context_length(model: str, base_url: str) -> None:
    """Remove one model/endpoint entry from the on-disk context-length cache.

    The entry is keyed as ``"{model}@{base_url}"``. If that key is not in
    the loaded cache, nothing is written. Otherwise the remaining entries
    are dumped back to the cache file under the ``context_lengths`` mapping.

    Failures while creating the directory or writing the file are logged at
    debug level and swallowed, so invalidation is best-effort.
    """
    entry_key = f"{model}@{base_url}"
    entries = _load_context_cache()
    try:
        del entries[entry_key]
    except KeyError:
        # Nothing cached for this model/endpoint pair; leave the file alone.
        return

    cache_path = _get_context_cache_path()
    payload = {"context_lengths": entries}
    try:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, "w") as handle:
            yaml.dump(payload, handle, default_flow_style=False)
    except Exception as exc:
        logger.debug("Failed to invalidate context length cache entry %s: %s", entry_key, exc)
def get_next_probe_tier(current_length: int) -> Optional[int]:
"""Return the next lower probe tier, or None if already at minimum."""
for tier in CONTEXT_PROBE_TIERS:
@ -1205,7 +1221,21 @@ def get_model_context_length(
if base_url:
cached = get_cached_context_length(model, base_url)
if cached is not None:
return cached
# Invalidate stale Codex OAuth cache entries: pre-PR #14935 builds
# resolved gpt-5.x to the direct-API value (e.g. 1.05M) via
# models.dev and persisted it. Codex OAuth caps at 272K for every
# slug, so any cached Codex entry at or above 400K is a leftover
# from the old resolution path. Drop it and fall through to the
# live /models probe in step 5 below.
if provider == "openai-codex" and cached >= 400_000:
logger.info(
"Dropping stale Codex cache entry %s@%s -> %s (pre-fix value); "
"re-resolving via live /models probe",
model, base_url, f"{cached:,}",
)
_invalidate_cached_context_length(model, base_url)
else:
return cached
# 2. Active endpoint metadata for truly custom/unknown endpoints.
# Known providers (Copilot, OpenAI, Anthropic, etc.) skip this — their

View file

@ -319,6 +319,98 @@ class TestCodexOAuthContextLength:
"leaked outside openai-codex provider"
)
def test_stale_codex_cache_over_400k_is_invalidated(self, tmp_path, monkeypatch):
    """A cached Codex entry at or above 400k is dropped and re-resolved.

    Seeds the cache file with a 1.05M entry for gpt-5.5 on the Codex
    endpoint (the value pre-PR #14935 builds persisted from models.dev)
    plus an unrelated entry, then resolves with provider "openai-codex".
    The lookup must return the 272k reported by the mocked live probe,
    save that fresh value, remove the stale entry from disk, and leave
    the unrelated entry in place.
    """
    from agent import model_metadata as mm
    import yaml as _yaml

    # Point the module at a throwaway cache file under tmp_path.
    cache_file = tmp_path / "context_length_cache.yaml"
    monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)

    base_url = "https://chatgpt.com/backend-api/codex/"
    stale_key = f"gpt-5.5@{base_url}"
    other_key = "other-model@https://api.openai.com/v1/"
    seeded_entries = {
        stale_key: 1_050_000,  # stale pre-fix value
        other_key: 128_000,  # unrelated, must survive
    }
    cache_file.write_text(_yaml.dump({"context_lengths": seeded_entries}))

    # Mocked /models response carrying the real Codex context window.
    probe_response = MagicMock()
    probe_response.status_code = 200
    probe_response.json.return_value = {
        "models": [{"slug": "gpt-5.5", "context_window": 272_000}]
    }

    with patch("agent.model_metadata.requests.get", return_value=probe_response):
        with patch("agent.model_metadata.save_context_length") as mock_save:
            ctx = mm.get_model_context_length(
                model="gpt-5.5",
                base_url=base_url,
                api_key="fake-token",
                provider="openai-codex",
            )

    assert ctx == 272_000, f"Stale entry should have been re-resolved to 272k, got {ctx}"
    # The freshly probed value was handed to the save hook.
    mock_save.assert_called_with("gpt-5.5", base_url, 272_000)

    # On disk: the stale key is gone, the unrelated key is unchanged.
    on_disk = _yaml.safe_load(cache_file.read_text())
    remaining = on_disk.get("context_lengths", {})
    assert stale_key not in remaining, "Stale entry was not invalidated from the cache file"
    assert remaining.get(other_key) == 128_000, "Unrelated cache entries must not be touched"
def test_fresh_codex_cache_under_400k_is_respected(self, tmp_path, monkeypatch):
    """A cached Codex entry at the correct 272k is returned as-is.

    Only stale pre-fix values (>= 400k) may be dropped; a 272k entry must
    be served from the cache without any HTTP request being issued.
    """
    from agent import model_metadata as mm
    import yaml as _yaml

    cache_file = tmp_path / "context_length_cache.yaml"
    monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)

    base_url = "https://chatgpt.com/backend-api/codex/"
    seeded_entries = {f"gpt-5.5@{base_url}": 272_000}
    cache_file.write_text(_yaml.dump({"context_lengths": seeded_entries}))

    # requests.get would only be reached if invalidation wrongly fired.
    with patch("agent.model_metadata.requests.get") as mock_get:
        ctx = mm.get_model_context_length(
            model="gpt-5.5",
            base_url=base_url,
            api_key="fake-token",
            provider="openai-codex",
        )

    assert ctx == 272_000
    mock_get.assert_not_called()
def test_stale_invalidation_scoped_to_codex_provider(self, tmp_path, monkeypatch):
    """The >= 400k guard must not touch non-Codex cache entries.

    A cached 1M entry for another provider (e.g. Anthropic opus on
    OpenRouter, legitimately 1M) has to be returned unchanged.
    """
    from agent import model_metadata as mm
    import yaml as _yaml

    cache_file = tmp_path / "context_length_cache.yaml"
    monkeypatch.setattr(mm, "_get_context_cache_path", lambda: cache_file)

    base_url = "https://openrouter.ai/api/v1"
    seeded_entries = {f"anthropic/claude-opus-4.6@{base_url}": 1_000_000}
    cache_file.write_text(_yaml.dump({"context_lengths": seeded_entries}))

    ctx = mm.get_model_context_length(
        model="anthropic/claude-opus-4.6",
        base_url=base_url,
        api_key="fake",
        provider="openrouter",
    )

    assert ctx == 1_000_000, "Non-codex 1M cache entries must be respected"
# =========================================================================
# get_model_context_length — resolution order