perf(tools): cache get_nous_auth_status() and load_env() to fix slow hermes tools menus (#25341)

`hermes tools` -> "All Platforms" took ~14s to render the checklist because building the toolset labels called `get_nous_auth_status()` ~31x transitively (`_toolset_has_keys` -> `_visible_providers` -> `get_nous_subscription_features` -> `managed_nous_tools_enabled`). Each call did a synchronous OAuth refresh POST to portal.nousresearch.com (~350ms even on the failure path), so one menu paint burned >13s of HTTP and 31 single-use Nous refresh tokens. Secondary hot spot: every `get_env_value()` re-read and re-sanitised the entire .env file. 116 reads with O(lines x known-keys) scanning added ~300ms of CPU per render. Fix is two process-level caches, both mtime-keyed so login/logout/edit invalidate naturally: * `hermes_cli/auth.py`: memoise `get_nous_auth_status()` for 15s keyed on auth.json mtime. Splits `_compute_nous_auth_status()` as the uncached impl. Adds `invalidate_nous_auth_status_cache()`. * `hermes_cli/config.py`: memoise `load_env()` keyed on .env (path, mtime, size). Adds `invalidate_env_cache()`, wired into `save_env_value`, `remove_env_value`, and the sanitize-on-load writer so writers don't return stale dicts on same-second writes. Before/after on Teknium's box (real HERMES_HOME, no Nous login): * "All Platforms" cold path: ~13,874ms -> ~691ms label-build * Warm re-open within the same process: ~122ms -> ~17ms Side benefit: stops burning a Nous refresh token on every menu paint, which was risking the portal's reuse-detection revocation logic.
2026-05-18 04:41:56 +00:00 · 2026-05-13 18:40:14 -07:00 · 2026-05-13 18:40:14 -07:00 · 3f13d78088
commit 3f13d78088
parent dd5a9502e3
4 changed files with 449 additions and 4 deletions
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@ -35,7 +35,7 @@ from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import parse_qs, urlencode, urlparse

 import httpx
@ -3870,6 +3870,39 @@ def _snapshot_nous_pool_status() -> Dict[str, Any]:
        return _empty_nous_auth_status()


+# ── Process-level memo for get_nous_auth_status() ──
+# get_nous_auth_status() validates state by calling resolve_nous_runtime_credentials(),
+# which does a synchronous OAuth refresh POST to portal.nousresearch.com. That can take
+# ~350ms even on the failure path, and read-only UI surfaces (`hermes tools`, status panels,
+# subscription-feature checks) call it many times per render — `hermes tools` → "All Platforms"
+# was firing the refresh ~31× during one menu paint, racking up >13s of HTTP and burning
+# single-use refresh tokens. Cache the snapshot for a few seconds, keyed on the auth.json
+# mtime so that `hermes auth login/logout/add/remove` invalidate naturally on the next call.
+_NOUS_AUTH_STATUS_CACHE_TTL = 15.0  # seconds
+_nous_auth_status_cache: Optional[Tuple[float, Optional[float], Dict[str, Any]]] = None
+
+
+def _auth_file_mtime() -> Optional[float]:
+    try:
+        return _auth_file_path().stat().st_mtime
+    except FileNotFoundError:
+        return None
+    except Exception:
+        return None
+
+
+def invalidate_nous_auth_status_cache() -> None:
+    """Clear the get_nous_auth_status() process-level memo.
+
+    Call this from any code path that mutates Nous auth state without going
+    through resolve_nous_runtime_credentials() (e.g. tests). Login/logout
+    flows touch auth.json, so the mtime check below invalidates them
+    automatically — explicit invalidation is the belt-and-braces option.
+    """
+    global _nous_auth_status_cache
+    _nous_auth_status_cache = None
+
+
 def get_nous_auth_status() -> Dict[str, Any]:
    """Status snapshot for Nous auth.

@ -3878,7 +3911,32 @@ def get_nous_auth_status() -> Dict[str, Any]:
    by resolving runtime credentials so revoked refresh sessions do not show up
    as a healthy login. If provider state is absent, fall back to the credential
    pool for the just-logged-in / not-yet-promoted case.
+
+    The returned snapshot is memoised for ~15s keyed on the auth.json mtime,
+    so menu/status surfaces that ask repeatedly don't trigger one refresh POST
+    per call. Login/logout flows write to auth.json and therefore invalidate
+    the cache automatically; tests can also call
+    ``invalidate_nous_auth_status_cache()`` explicitly.
    """
+    global _nous_auth_status_cache
+    now = time.monotonic()
+    mtime = _auth_file_mtime()
+    cached = _nous_auth_status_cache
+    if cached is not None:
+        cached_at, cached_mtime, cached_status = cached
+        if (
+            cached_mtime == mtime
+            and (now - cached_at) < _NOUS_AUTH_STATUS_CACHE_TTL
+        ):
+            return dict(cached_status)
+
+    status = _compute_nous_auth_status()
+    _nous_auth_status_cache = (now, mtime, dict(status))
+    return status
+
+
+def _compute_nous_auth_status() -> Dict[str, Any]:
+    """Uncached implementation of get_nous_auth_status(). See that function."""
    state = get_provider_auth_state("nous")
    if state:
        base_status = {
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -4319,10 +4319,34 @@ def load_env() -> Dict[str, str]:
    concatenated KEY=VALUE pairs on a single line) are handled
    gracefully instead of producing mangled values such as duplicated
    bot tokens.  See #8908.
+
+    The parsed dict is memoised keyed on the .env file mtime, because
+    ``get_env_value()`` is called dozens-to-hundreds of times per
+    interactive menu render (`hermes tools`, `hermes setup`, status
+    panels). Sanitisation is O(lines × known-keys), so re-parsing the
+    same file on every call was burning ~300ms of CPU per `hermes tools`
+    menu paint on top of the OAuth-refresh slowness. The mtime check
+    invalidates the cache when the user edits .env mid-process.
    """
+    global _env_cache
    env_path = get_env_path()
-    env_vars = {}
-    
+
+    try:
+        mtime = env_path.stat().st_mtime
+        size = env_path.stat().st_size
+        cache_key = (str(env_path), mtime, size)
+    except FileNotFoundError:
+        cache_key = (str(env_path), None, None)
+    except Exception:
+        cache_key = None
+
+    if cache_key is not None and _env_cache is not None:
+        cached_key, cached_vars = _env_cache
+        if cached_key == cache_key:
+            return dict(cached_vars)
+
+    env_vars: Dict[str, str] = {}
+
    if env_path.exists():
        # On Windows, open() defaults to the system locale (cp1252) which can
        # fail on UTF-8 .env files. Always use explicit UTF-8; tolerate BOM
@ -4338,10 +4362,33 @@ def load_env() -> Dict[str, str]:
            if line and not line.startswith('#') and '=' in line:
                key, _, value = line.partition('=')
                env_vars[key.strip()] = value.strip().strip('"\'')
-    
+
+    if cache_key is not None:
+        _env_cache = (cache_key, dict(env_vars))
+
    return env_vars


+# Module-level memo for load_env(), keyed on (path, mtime, size).
+# Editing .env bumps mtime → next load_env() rebuilds. invalidate_env_cache()
+# is the explicit knob for writers that update .env via this module
+# (set_env_value, save_env, etc.) without relying on filesystem mtime
+# resolution.
+_env_cache: Optional[Tuple[Tuple[str, Optional[float], Optional[int]], Dict[str, str]]] = None
+
+
+def invalidate_env_cache() -> None:
+    """Clear the load_env() process-level memo.
+
+    Writers that mutate .env (set_env_value, save_env, etc.) call this
+    to guarantee the next load_env() sees their change even on
+    filesystems with coarse mtime resolution. Reads invalidate naturally
+    via the mtime/size check.
+    """
+    global _env_cache
+    _env_cache = None
+
+
 def _sanitize_env_lines(lines: list) -> list:
    """Fix corrupted .env lines before reading or writing.

@ -4444,6 +4491,7 @@ def sanitize_env_file() -> int:
            pass
        raise
    _secure_file(env_path)
+    invalidate_env_cache()
    return fixes


@ -4555,6 +4603,7 @@ def save_env_value(key: str, value: str):
    _secure_file(env_path)

    os.environ[key] = value
+    invalidate_env_cache()


 def remove_env_value(key: str) -> bool:
@ -4610,6 +4659,7 @@ def remove_env_value(key: str) -> bool:
        _secure_file(env_path)

    os.environ.pop(key, None)
+    invalidate_env_cache()
    return found


--- a/tests/hermes_cli/test_env_load_cache.py
+++ b/tests/hermes_cli/test_env_load_cache.py
@ -0,0 +1,193 @@
+"""Tests for the load_env() process-level cache.
+
+The cache exists to keep `hermes tools` → "All Platforms" fast: every
+`get_env_value()` lookup used to re-read and re-sanitise the entire
+.env file, racking up hundreds of ms across one menu render. The
+cache is keyed on (path, mtime, size); writers (save_env_value /
+remove_env_value / sanitise_env_file) call invalidate_env_cache().
+"""
+
+from __future__ import annotations
+
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import patch
+
+
+def _write_env(path: Path, contents: str) -> None:
+    path.write_text(contents, encoding="utf-8")
+
+
+def test_load_env_caches_on_repeat_calls():
+    """Repeated load_env() calls on the same file return the cached dict."""
+    from hermes_cli.config import invalidate_env_cache, load_env
+
+    invalidate_env_cache()
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".env", delete=False, encoding="utf-8"
+    ) as f:
+        f.write("OPENAI_API_KEY=sk-first\n")
+        env_path = Path(f.name)
+
+    try:
+        with patch("hermes_cli.config.get_env_path", return_value=env_path):
+            first = load_env()
+            # Even if a writer outside our cache mutates the file, an
+            # mtime/size match means the cache still wins. We simulate that
+            # by writing identical bytes back — sanity check that the cache
+            # is keyed structurally, not on a counter.
+            second = load_env()
+
+        assert first == second
+        assert first.get("OPENAI_API_KEY") == "sk-first"
+    finally:
+        env_path.unlink(missing_ok=True)
+        invalidate_env_cache()
+
+
+def test_load_env_invalidates_on_mtime_bump():
+    """Editing the file (mtime changes) invalidates the cache."""
+    from hermes_cli.config import invalidate_env_cache, load_env
+
+    invalidate_env_cache()
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".env", delete=False, encoding="utf-8"
+    ) as f:
+        f.write("OPENAI_API_KEY=sk-old\n")
+        env_path = Path(f.name)
+
+    try:
+        with patch("hermes_cli.config.get_env_path", return_value=env_path):
+            first = load_env()
+            assert first.get("OPENAI_API_KEY") == "sk-old"
+
+            # Rewrite file with new contents and bump mtime to make sure
+            # the FS records the change even on coarse-mtime filesystems.
+            _write_env(env_path, "OPENAI_API_KEY=sk-new\n")
+            future = env_path.stat().st_mtime + 5.0
+            os.utime(env_path, (future, future))
+
+            second = load_env()
+            assert second.get("OPENAI_API_KEY") == "sk-new", (
+                "load_env() returned stale value after file change"
+            )
+    finally:
+        env_path.unlink(missing_ok=True)
+        invalidate_env_cache()
+
+
+def test_invalidate_env_cache_forces_reread():
+    """invalidate_env_cache() forces the next load_env() to hit the disk.
+
+    This is the belt-and-braces knob for writers (save_env_value, etc.)
+    on filesystems where mtime resolution might miss a same-second write.
+    """
+    from hermes_cli.config import invalidate_env_cache, load_env
+
+    invalidate_env_cache()
+
+    with tempfile.NamedTemporaryFile(
+        mode="w", suffix=".env", delete=False, encoding="utf-8"
+    ) as f:
+        f.write("OPENAI_API_KEY=sk-old\n")
+        env_path = Path(f.name)
+
+    try:
+        with patch("hermes_cli.config.get_env_path", return_value=env_path):
+            assert load_env().get("OPENAI_API_KEY") == "sk-old"
+
+            # Rewrite WITHOUT bumping mtime — simulates same-second write.
+            mtime_before = env_path.stat().st_mtime
+            _write_env(env_path, "OPENAI_API_KEY=sk-new\n")
+            os.utime(env_path, (mtime_before, mtime_before))
+
+            # Without invalidation, cache hit might return stale.
+            invalidate_env_cache()
+
+            assert load_env().get("OPENAI_API_KEY") == "sk-new"
+    finally:
+        env_path.unlink(missing_ok=True)
+        invalidate_env_cache()
+
+
+def test_save_env_value_invalidates_cache(tmp_path, monkeypatch):
+    """save_env_value() invalidates the cache so subsequent reads see the update."""
+    from hermes_cli import config as config_mod
+    from hermes_cli.config import invalidate_env_cache, load_env, save_env_value
+
+    invalidate_env_cache()
+
+    env_path = tmp_path / ".env"
+    env_path.write_text("EXISTING_KEY=old\n", encoding="utf-8")
+
+    monkeypatch.setattr(config_mod, "get_env_path", lambda: env_path)
+    monkeypatch.setattr(config_mod, "ensure_hermes_home", lambda: None)
+    monkeypatch.setattr(config_mod, "_secure_file", lambda _p: None)
+    monkeypatch.setattr(config_mod, "is_managed", lambda: False)
+
+    try:
+        # Prime the cache.
+        first = load_env()
+        assert first.get("EXISTING_KEY") == "old"
+
+        save_env_value("NEW_KEY", "shiny")
+
+        # Same-second writes on coarse-mtime filesystems would normally
+        # let stale cache survive; invalidate_env_cache() inside the
+        # writer makes the next read see the new key.
+        result = load_env()
+        assert result.get("NEW_KEY") == "shiny"
+        assert result.get("EXISTING_KEY") == "old"
+    finally:
+        monkeypatch.delenv("NEW_KEY", raising=False)
+        invalidate_env_cache()
+
+
+def test_remove_env_value_invalidates_cache(tmp_path, monkeypatch):
+    """remove_env_value() invalidates the cache so the removed key disappears."""
+    from hermes_cli import config as config_mod
+    from hermes_cli.config import (
+        invalidate_env_cache,
+        load_env,
+        remove_env_value,
+        save_env_value,
+    )
+
+    invalidate_env_cache()
+
+    env_path = tmp_path / ".env"
+    monkeypatch.setattr(config_mod, "get_env_path", lambda: env_path)
+    monkeypatch.setattr(config_mod, "ensure_hermes_home", lambda: None)
+    monkeypatch.setattr(config_mod, "_secure_file", lambda _p: None)
+    monkeypatch.setattr(config_mod, "is_managed", lambda: False)
+
+    save_env_value("DOOMED_KEY", "value")
+    assert load_env().get("DOOMED_KEY") == "value"
+
+    try:
+        removed = remove_env_value("DOOMED_KEY")
+        assert removed is True
+        assert "DOOMED_KEY" not in load_env()
+    finally:
+        monkeypatch.delenv("DOOMED_KEY", raising=False)
+        invalidate_env_cache()
+
+
+def test_load_env_handles_missing_file():
+    """A nonexistent .env returns {} and caches the empty result."""
+    from hermes_cli.config import invalidate_env_cache, load_env
+
+    invalidate_env_cache()
+
+    nonexistent = Path(tempfile.gettempdir()) / "hermes-test-no-such-env-xyz123.env"
+    nonexistent.unlink(missing_ok=True)
+
+    try:
+        with patch("hermes_cli.config.get_env_path", return_value=nonexistent):
+            assert load_env() == {}
+            assert load_env() == {}  # cached
+    finally:
+        invalidate_env_cache()
--- a/tests/hermes_cli/test_nous_auth_status_cache.py
+++ b/tests/hermes_cli/test_nous_auth_status_cache.py
@ -0,0 +1,144 @@
+"""Tests for the get_nous_auth_status() process-level cache.
+
+The cache avoids re-validating Nous credentials on every menu paint —
+`hermes tools` → "All Platforms" used to fire ~31 OAuth refresh POSTs
+against portal.nousresearch.com during one render. The cache is keyed
+on auth.json mtime so login/logout flows invalidate naturally; tests
+and other writers can also call invalidate_nous_auth_status_cache().
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from unittest.mock import patch
+
+
+def _seed_auth_file(tmp_path):
+    """Drop a placeholder auth.json into the test HERMES_HOME.
+
+    The exact content doesn't matter for cache-key purposes — only that
+    the file exists and we can mutate it to bump mtime.
+    """
+    auth = tmp_path / "auth.json"
+    auth.write_text(json.dumps({"providers": {}}), encoding="utf-8")
+    return auth
+
+
+def test_get_nous_auth_status_caches_consecutive_calls(tmp_path, monkeypatch):
+    """A second call within the TTL skips re-computing the snapshot."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store", "call": call_count["n"]}
+
+    with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+        first = auth_mod.get_nous_auth_status()
+        second = auth_mod.get_nous_auth_status()
+        third = auth_mod.get_nous_auth_status()
+
+    assert call_count["n"] == 1, (
+        f"_compute_nous_auth_status was called {call_count['n']}× — "
+        "cache is not deduplicating within TTL."
+    )
+    # Each call returns a copy so callers can't mutate the cached dict.
+    assert first == second == third
+    first["mutated"] = True
+    assert "mutated" not in auth_mod.get_nous_auth_status()
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+
+def test_get_nous_auth_status_invalidates_on_auth_file_mtime(tmp_path, monkeypatch):
+    """Touching auth.json (login/logout) forces a re-compute."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    auth_path = _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store", "call": call_count["n"]}
+
+    with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+        auth_mod.get_nous_auth_status()
+        # Bump mtime forward so coarse-resolution filesystems still record
+        # a change.
+        future = auth_path.stat().st_mtime + 5.0
+        os.utime(auth_path, (future, future))
+        auth_mod.get_nous_auth_status()
+
+    assert call_count["n"] == 2, (
+        "auth.json mtime change should invalidate the cache, but only "
+        f"{call_count['n']} compute call(s) happened."
+    )
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+
+def test_invalidate_nous_auth_status_cache_forces_recompute(tmp_path, monkeypatch):
+    """Explicit invalidate forces the next call to re-compute."""
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store"}
+
+    with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+        auth_mod.get_nous_auth_status()
+        auth_mod.invalidate_nous_auth_status_cache()
+        auth_mod.get_nous_auth_status()
+
+    assert call_count["n"] == 2
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+
+def test_get_nous_auth_status_caches_failure_path(tmp_path, monkeypatch):
+    """Logged-out snapshots are cached too — that's where the cost was.
+
+    Teknium's case: ~31 cache misses per `hermes tools` "All Platforms"
+    menu paint, all returning logged_in=False after a failed refresh POST.
+    The whole point of the cache is to memoise that failure path too.
+    """
+    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+    _seed_auth_file(tmp_path)
+
+    from hermes_cli import auth as auth_mod
+
+    auth_mod.invalidate_nous_auth_status_cache()
+
+    call_count = {"n": 0}
+
+    def fake_compute():
+        call_count["n"] += 1
+        return {"logged_in": False, "source": "auth_store", "error": "refresh failed"}
+
+    with patch.object(auth_mod, "_compute_nous_auth_status", side_effect=fake_compute):
+        for _ in range(10):
+            auth_mod.get_nous_auth_status()
+
+    assert call_count["n"] == 1, (
+        f"Logged-out snapshots must cache; got {call_count['n']} computes for 10 calls."
+    )
+
+    auth_mod.invalidate_nous_auth_status_cache()