perf(tools): cache get_nous_auth_status() and load_env() to fix slow hermes tools menus (#25341)

`hermes tools` -> "All Platforms" took ~14s to render the checklist because building the toolset labels called `get_nous_auth_status()` ~31x transitively (`_toolset_has_keys` -> `_visible_providers` -> `get_nous_subscription_features` -> `managed_nous_tools_enabled`). Each call did a synchronous OAuth refresh POST to portal.nousresearch.com (~350ms even on the failure path), so one menu paint burned >13s of HTTP and 31 single-use Nous refresh tokens. Secondary hot spot: every `get_env_value()` re-read and re-sanitised the entire .env file; 116 reads with O(lines x known-keys) scanning added ~300ms of CPU per render. The fix is two process-level caches, both mtime-keyed so login/logout/edit invalidate naturally: (1) `hermes_cli/auth.py`: memoise `get_nous_auth_status()` for 15s keyed on auth.json mtime. Splits `_compute_nous_auth_status()` as the uncached impl. Adds `invalidate_nous_auth_status_cache()`. (2) `hermes_cli/config.py`: memoise `load_env()` keyed on .env (path, mtime, size). Adds `invalidate_env_cache()`, wired into `save_env_value`, `remove_env_value`, and the sanitize-on-load writer so writers don't return stale dicts on same-second writes. Before/after on Teknium's box (real HERMES_HOME, no Nous login): "All Platforms" cold path: ~13,874ms -> ~691ms label-build; warm re-open within the same process: ~122ms -> ~17ms. Side benefit: stops burning a Nous refresh token on every menu paint, which was risking the portal's reuse-detection revocation logic.
2026-05-23 05:31:23 +00:00 · 2026-05-13 18:40:14 -07:00 · 2026-05-13 18:40:14 -07:00 · 3f13d78088
commit 3f13d78088
parent dd5a9502e3
4 changed files with 449 additions and 4 deletions
--- a/tests/hermes_cli/test_nous_auth_status_cache.py
+++ b/tests/hermes_cli/test_nous_auth_status_cache.py
@ -0,0 +1,144 @@
+"""Tests for the get_nous_auth_status() process-level cache.
+
+The cache avoids re-validating Nous credentials on every menu paint —
+`hermes tools` → "All Platforms" used to fire ~31 OAuth refresh POSTs
+against portal.nousresearch.com during one render. The cache is keyed
+on auth.json mtime so login/logout flows invalidate naturally; tests
+and other writers can also call invalidate_nous_auth_status_cache().
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import patch
+
+
+def _seed_auth_file(tmp_path):
+    """Create a minimal auth.json inside the test HERMES_HOME and return its path.
+
+    The payload is irrelevant to the cache key; the file only needs to
+    exist so that its mtime can be bumped later.
+    """
+    auth_file = tmp_path.joinpath("auth.json")
+    auth_file.write_text(json.dumps({"providers": {}}), encoding="utf-8")
+    return auth_file
+
+
+def test_get_nous_auth_status_caches_consecutive_calls(tmp_path, monkeypatch):
+    """A second call within the TTL skips re-computing the snapshot.
+
+    Every lookup, including the copy-isolation probe, runs while
+    ``_compute_nous_auth_status`` is patched, so an unexpected cache miss
+    is served by the fake instead of the real implementation. The cache
+    is cleared in ``finally`` so a failing assertion cannot leave the
+    fake snapshot behind.
+    """
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store", "call": call_count["n"]}
+
+    try:
+        with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+            first = auth_mod.get_nous_auth_status()
+            second = auth_mod.get_nous_auth_status()
+            third = auth_mod.get_nous_auth_status()
+
+            assert call_count["n"] == 1, (
+                f"_compute_nous_auth_status was called {call_count['n']}× — "
+                "cache is not deduplicating within TTL."
+            )
+            # Each call returns a copy so callers can't mutate the cached dict.
+            assert first == second == third
+            first["mutated"] = True
+            # Probe while still patched: a miss here must not run the real compute.
+            assert "mutated" not in auth_mod.get_nous_auth_status()
+    finally:
+        auth_mod.invalidate_nous_auth_status_cache()
+
+
+def test_get_nous_auth_status_invalidates_on_auth_file_mtime(tmp_path, monkeypatch):
+    """Touching auth.json (login/logout) forces a re-compute.
+
+    The cache is cleared in ``finally`` so a failing assertion cannot
+    leave the fake snapshot behind in process-level state.
+    """
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    auth_path = _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store", "call": call_count["n"]}
+
+    try:
+        with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+            auth_mod.get_nous_auth_status()
+            # Bump mtime forward so coarse-resolution filesystems still record
+            # a change.
+            future = auth_path.stat().st_mtime + 5.0
+            os.utime(auth_path, (future, future))
+            auth_mod.get_nous_auth_status()
+
+        assert call_count["n"] == 2, (
+            "auth.json mtime change should invalidate the cache, but only "
+            f"{call_count['n']} compute call(s) happened."
+        )
+    finally:
+        auth_mod.invalidate_nous_auth_status_cache()
+
+
+def test_invalidate_nous_auth_status_cache_forces_recompute(tmp_path, monkeypatch):
+    """Explicit invalidate forces the next call to re-compute.
+
+    The cache is cleared in ``finally`` so a failing assertion cannot
+    leave the fake snapshot behind in process-level state.
+    """
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store"}
+
+    try:
+        with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+            auth_mod.get_nous_auth_status()
+            # Dropping the entry between the two reads must defeat the cache.
+            auth_mod.invalidate_nous_auth_status_cache()
+            auth_mod.get_nous_auth_status()
+
+        assert call_count["n"] == 2, (
+            "Explicit invalidation should force a second compute, but "
+            f"{call_count['n']} compute call(s) happened."
+        )
+    finally:
+        auth_mod.invalidate_nous_auth_status_cache()
+
+
+def test_get_nous_auth_status_caches_failure_path(tmp_path, monkeypatch):
+    """Logged-out snapshots are cached too — that's where the cost was.
+
+    Teknium's case: ~31 cache misses per `hermes tools` "All Platforms"
+    menu paint, all returning logged_in=False after a failed refresh POST.
+    The whole point of the cache is to memoise that failure path too.
+
+    The cache is cleared in ``finally`` so a failing assertion cannot
+    leave the fake snapshot behind in process-level state.
+    """
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store", "error": "refresh failed"}
+
+    try:
+        with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+            for _ in range(10):
+                auth_mod.get_nous_auth_status()
+
+        assert call_count["n"] == 1, (
+            f"Logged-out snapshots must cache; got {call_count['n']} computes for 10 calls."
+        )
+    finally:
+        auth_mod.invalidate_nous_auth_status_cache()