perf(models_dev): cache-first lookup, skip network when disk cache is fresh (#22808)

`fetch_models_dev()` is on the hot path of every `AIAgent.__init__`
(via `context_compressor → get_model_context_length`). The previous
policy was "always try network first, only fall back to disk if
network fails," so every fresh `hermes chat` / `hermes gateway` /
batch / cron process paid 250-500 ms re-fetching a 2 MB JSON registry
that was already on disk from earlier runs.

Add a stage 2 between in-mem and network: if
`models_dev_cache.json` exists and its mtime is younger than the
existing `_MODELS_DEV_CACHE_TTL` (1 hour, same TTL the in-mem cache
already uses), load from disk and skip the network call.

The in-mem TTL is anchored to the disk file's age, so a 50-min-old
cache stays in-memory for only 10 more minutes — no surprise
extension of staleness window.

Invariants preserved:
- `force_refresh=True` still always hits the network and only falls
  back to disk on failure (`hermes config refresh` semantics).
- Missing disk cache → fall through to network (first-ever run).
- Stale disk cache (mtime > TTL) → fall through to network.
- Negative file age (clock skew) → fall through to network.
- Network failure → existing stage-4 stale-disk fallback unchanged.

Measured impact (3-run medians, 9950X3D, fresh process per run):
  fetch_models_dev cold:  256 → 17 ms  (-93%)
  hermes chat -q wall:   4.00 → 3.73 s (-7% median)
                         3.99 → 3.60 s (-10% min)

The chat-end-to-end win is bounded below by API latency variance, but
the fetch_models_dev microbenchmark is the cleanest signal: 239 ms
shaved off every fresh-process agent construction.

Win compounds with the previous perf PRs:
  #22681 google_chat lazy-load
  #22766 doctor parallel + IMDS off
  #22790 gateway.platforms PEP 562

Tests: all 30 `tests/agent/test_models_dev.py` pass (added 4 new ones
covering the new disk-cache-first path, force_refresh override, stale
disk fallback, and missing-disk-cache fall-through). Full `tests/agent/`
suite: 2560 passed, 0 failed.
This commit is contained in:
Teknium 2026-05-09 13:32:38 -07:00 committed by GitHub
parent cd712b176a
commit 775c0e22cf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 164 additions and 5 deletions

View file

@ -197,6 +197,32 @@ def _load_disk_cache() -> Dict[str, Any]:
return {} return {}
def _disk_cache_age_seconds() -> Optional[float]:
"""Return age (in seconds) of the disk cache file, or None if missing.
Used by ``fetch_models_dev`` to short-circuit the network probe when
a recent on-disk cache exists. Errors (missing file, permission
denied, weird filesystem) all return None callers fall through
to the network fetch path.
"""
try:
cache_path = _get_cache_path()
if not cache_path.exists():
return None
mtime = cache_path.stat().st_mtime
age = time.time() - mtime
# Negative age means the file's mtime is in the future (clock skew
# or system clock reset). Treat as "unknown freshness" → fall
# through to network so we don't serve potentially-bad data
# forever.
if age < 0:
return None
return age
except Exception as e:
logger.debug("Failed to stat models.dev disk cache: %s", e)
return None
def _save_disk_cache(data: Dict[str, Any]) -> None: def _save_disk_cache(data: Dict[str, Any]) -> None:
"""Save models.dev data to disk cache atomically.""" """Save models.dev data to disk cache atomically."""
try: try:
@ -207,13 +233,29 @@ def _save_disk_cache(data: Dict[str, Any]) -> None:
def fetch_models_dev(force_refresh: bool = False) -> Dict[str, Any]: def fetch_models_dev(force_refresh: bool = False) -> Dict[str, Any]:
"""Fetch models.dev registry. In-memory cache (1hr) + disk fallback. """Fetch models.dev registry. Cache hierarchy: in-mem → disk → network.
Returns the full registry dict keyed by provider ID, or empty dict on failure. Returns the full registry dict keyed by provider ID, or empty dict on failure.
Cache hierarchy (when ``force_refresh=False``):
1. In-memory cache, populated and < TTL old return immediately.
2. **Disk cache file < TTL old by mtime load, populate in-mem, return.**
No network call. Saves ~500 ms per cold-start agent construction;
``models.dev`` only changes when providers add new models, so a
1 hour staleness window is acceptable (same TTL as in-mem cache).
3. Network fetch on success, save to disk + in-mem and return.
4. Network fails fall back to ANY available disk cache (even stale)
with a short 5 min in-mem grace period before retrying network.
When ``force_refresh=True`` (used by ``hermes config refresh``, the
\"refresh model catalog\" code path), stages 1 and 2 are skipped. The
function always hits the network and only falls back to disk if the
network call fails.
""" """
global _models_dev_cache, _models_dev_cache_time global _models_dev_cache, _models_dev_cache_time
# Check in-memory cache # Stage 1: fresh in-memory cache wins. This is the hot path on
# long-lived processes — no I/O, no system calls.
if ( if (
not force_refresh not force_refresh
and _models_dev_cache and _models_dev_cache
@ -221,7 +263,27 @@ def fetch_models_dev(force_refresh: bool = False) -> Dict[str, Any]:
): ):
return _models_dev_cache return _models_dev_cache
# Try network fetch # Stage 2: fresh-by-mtime disk cache short-circuits the network call.
# Only kicks in on cold-start processes (in-mem cache is empty or
# expired) and only when the user hasn't asked for a forced refresh.
# Skipped if the disk cache file is missing, unreadable, or older
# than _MODELS_DEV_CACHE_TTL.
if not force_refresh:
disk_age = _disk_cache_age_seconds()
if disk_age is not None and disk_age < _MODELS_DEV_CACHE_TTL:
disk_data = _load_disk_cache()
if disk_data:
_models_dev_cache = disk_data
# Anchor in-mem TTL to the disk file's age so we don't
# extend an already-aging cache by another full hour.
_models_dev_cache_time = time.time() - disk_age
logger.debug(
"Loaded models.dev from fresh disk cache "
"(%d providers, age=%.0fs)", len(disk_data), disk_age,
)
return _models_dev_cache
# Stage 3: network fetch.
try: try:
response = requests.get(MODELS_DEV_URL, timeout=15) response = requests.get(MODELS_DEV_URL, timeout=15)
response.raise_for_status() response.raise_for_status()
@ -239,8 +301,9 @@ def fetch_models_dev(force_refresh: bool = False) -> Dict[str, Any]:
except Exception as e: except Exception as e:
logger.debug("Failed to fetch models.dev: %s", e) logger.debug("Failed to fetch models.dev: %s", e)
# Fall back to disk cache — use a short TTL (5 min) so we retry # Stage 4: network failed — fall back to whatever disk cache exists,
# the network fetch soon instead of serving stale data for a full hour. # even if it's stale. Give it a short 5 min in-mem TTL so we retry
# the network soon instead of serving stale data for a full hour.
if not _models_dev_cache: if not _models_dev_cache:
_models_dev_cache = _load_disk_cache() _models_dev_cache = _load_disk_cache()
if _models_dev_cache: if _models_dev_cache:

View file

@ -201,6 +201,102 @@ class TestFetchModelsDev:
mock_get.assert_not_called() mock_get.assert_not_called()
assert result == SAMPLE_REGISTRY assert result == SAMPLE_REGISTRY
@patch("agent.models_dev.requests.get")
def test_fresh_disk_cache_skips_network(self, mock_get):
"""When in-mem cache is empty but disk cache exists and is fresh by
mtime (< TTL), fetch_models_dev returns disk data without ever
making the network call.
This is the cold-start fast path: every fresh process previously
paid ~500 ms re-fetching a registry that was already on disk
from an earlier run.
"""
import agent.models_dev as md
# Empty in-mem cache so stage 1 doesn't short-circuit.
md._models_dev_cache = {}
md._models_dev_cache_time = 0
with patch.object(md, "_disk_cache_age_seconds", return_value=60.0), \
patch.object(md, "_load_disk_cache", return_value=SAMPLE_REGISTRY):
result = fetch_models_dev()
# The whole point: no network call.
mock_get.assert_not_called()
assert "anthropic" in result
# In-mem cache populated so subsequent calls within the same
# process stay on stage 1.
assert md._models_dev_cache == SAMPLE_REGISTRY
@patch("agent.models_dev.requests.get")
def test_stale_disk_cache_falls_through_to_network(self, mock_get):
"""When the disk cache is OLDER than TTL, we must hit the network
(and only fall back to the stale disk data if network fails)."""
import agent.models_dev as md
md._models_dev_cache = {}
md._models_dev_cache_time = 0
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.json.return_value = SAMPLE_REGISTRY
mock_resp.raise_for_status = MagicMock()
mock_get.return_value = mock_resp
# Disk cache exists but is older than the TTL — must NOT short-circuit.
with patch.object(md, "_disk_cache_age_seconds",
return_value=md._MODELS_DEV_CACHE_TTL + 60), \
patch.object(md, "_load_disk_cache", return_value=SAMPLE_REGISTRY), \
patch.object(md, "_save_disk_cache"):
result = fetch_models_dev()
mock_get.assert_called_once()
assert "anthropic" in result
@patch("agent.models_dev.requests.get")
def test_force_refresh_skips_disk_cache(self, mock_get):
"""force_refresh=True bypasses BOTH the in-mem cache AND the
disk-cache fast path. Used by ``hermes config refresh`` and
anywhere else the user explicitly asked for fresh data.
"""
import agent.models_dev as md
md._models_dev_cache = {}
md._models_dev_cache_time = 0
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.json.return_value = SAMPLE_REGISTRY
mock_resp.raise_for_status = MagicMock()
mock_get.return_value = mock_resp
# Disk cache is fresh, but force_refresh must override it.
with patch.object(md, "_disk_cache_age_seconds", return_value=60.0), \
patch.object(md, "_load_disk_cache", return_value=SAMPLE_REGISTRY), \
patch.object(md, "_save_disk_cache"):
result = fetch_models_dev(force_refresh=True)
mock_get.assert_called_once()
assert "anthropic" in result
@patch("agent.models_dev.requests.get")
def test_missing_disk_cache_falls_through_to_network(self, mock_get):
"""If the disk cache file doesn't exist (first-ever run, or it
was deleted), fall through cleanly to network."""
import agent.models_dev as md
md._models_dev_cache = {}
md._models_dev_cache_time = 0
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.json.return_value = SAMPLE_REGISTRY
mock_resp.raise_for_status = MagicMock()
mock_get.return_value = mock_resp
with patch.object(md, "_disk_cache_age_seconds", return_value=None), \
patch.object(md, "_save_disk_cache"):
result = fetch_models_dev()
mock_get.assert_called_once()
assert "anthropic" in result
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# get_model_capabilities — vision via modalities.input # get_model_capabilities — vision via modalities.input