perf(cli): cut ~19s from 'hermes' cold start (skills cache + lazy Feishu + no Nous HTTP) (#22138)

Interactive `hermes` launch drops from ~21s to ~2.5s. Three independent
fixes, each targets a distinct hot spot in the banner / tool-registration
path that fires on every CLI invocation.

1. `get_external_skills_dirs()` in-process mtime cache (~10s saved)
   The function re-read + YAML-parsed the full ~/.hermes/config.yaml on
   every call. Banner build invokes it once per skill to resolve the
   category column, which on a 120-skill install meant ~120 reparses of
   a 15 KB config (~85 ms each). Added a
   `(config_path, mtime_ns) -> list[Path]` memo; stat() is ~2 us vs
   ~85 ms for the parse. Edits to config.yaml invalidate the cache on
   the next call via mtime.

2. Feishu availability probe uses `importlib.util.find_spec` (~5.2s saved)
   `tools/feishu_doc_tool.py::_check_feishu` and the identical helper in
   `feishu_drive_tool.py` were calling `import lark_oapi` purely to
   detect whether the SDK was installed. Executing the real import pulls
   in websockets + dispatcher + every v2 API model — ~5 seconds of work
   that fires at every tool-registry bootstrap. `find_spec` answers the
   same question ("is lark_oapi importable?") without executing the
   module. The actual tool handlers still do the real import on invoke,
   so runtime behavior is unchanged.

3. `_web_requires_env` no longer triggers Nous portal refresh (~800ms saved)
   `tools/web_tools.py::_web_requires_env` used
   `managed_nous_tools_enabled()` to gate four gateway env-var names in
   the returned list. The gate called `get_nous_auth_status()` ->
   `resolve_nous_runtime_credentials()` -> live HTTP POST to the portal
   on every tool-registry bootstrap. But the list is pure metadata — if
   the env var is set at runtime, the tool lights up; otherwise it
   doesn't. Including the four names unconditionally is harmless for
   unsubscribed users (vars just aren't set) and eliminates the sync
   HTTP round trip from startup.

Test:
- tests/agent/test_external_skills_dirs_cache.py (new, 6 cases):
  returns config'd dir, caches on second call (yaml_load patched to
  raise — never invoked), invalidates on mtime bump, empty when config
  missing, returned list is a defensive copy, per-HERMES_HOME cache key
  isolation.
- Existing tests/agent/test_external_skills.py and tests/tools/
  continue to pass modulo pre-existing flakes on main (test_delegate,
  test_send_message — unrelated, pass in isolation).

Measured: bare `hermes` (cold → REPL ready) 21,519ms -> 2,618ms on
Teknium's install (119 skills, 15 KB config.yaml, Nous auth logged in,
lark_oapi installed). 8x faster.
This commit is contained in:
Teknium 2026-05-08 16:39:32 -07:00 committed by GitHub
parent d606df8126
commit 0ec052ca24
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 220 additions and 20 deletions

View file

@ -170,6 +170,19 @@ def _normalize_string_set(values) -> Set[str]:
# ── External skills directories ──────────────────────────────────────────
# (config_path_str, mtime_ns) -> resolved external dirs list. Keyed by
# mtime_ns so a config.yaml edit mid-run is picked up automatically;
# otherwise every call would re-read + re-YAML-parse the 15KB config,
# which becomes the dominant cost of ``hermes`` startup when ~120 skills
# each trigger a category lookup during banner construction (10+ seconds
# of pure waste).
_EXTERNAL_DIRS_CACHE: Dict[Tuple[str, int], List[Path]] = {}
def _external_dirs_cache_clear() -> None:
"""Test hook — drop the in-process cache."""
_EXTERNAL_DIRS_CACHE.clear()
def get_external_skills_dirs() -> List[Path]:
"""Read ``skills.external_dirs`` from config.yaml and return validated paths.
@ -177,10 +190,30 @@ def get_external_skills_dirs() -> List[Path]:
Each entry is expanded (``~`` and ``${VAR}``) and resolved to an absolute
path. Only directories that actually exist are returned. Duplicates and
paths that resolve to the local ``~/.hermes/skills/`` are silently skipped.
Cached in-process, keyed on ``config.yaml`` mtime the function is
called once per skill during banner / tool-registry scans, and YAML
parsing a non-trivial config dominates ``hermes`` cold-start time
when the cache is absent.
"""
config_path = get_config_path()
if not config_path.exists():
return []
# Cache key: (absolute path, mtime_ns). stat() is ~2us vs ~85ms for
# the full YAML parse, so the fast path is nearly free.
try:
stat = config_path.stat()
cache_key: Tuple[str, int] = (str(config_path), stat.st_mtime_ns)
except OSError:
cache_key = None # type: ignore[assignment]
if cache_key is not None:
cached = _EXTERNAL_DIRS_CACHE.get(cache_key)
if cached is not None:
# Return a copy so callers can't mutate the cached list.
return list(cached)
try:
parsed = yaml_load(config_path.read_text(encoding="utf-8"))
except Exception:
@ -194,7 +227,10 @@ def get_external_skills_dirs() -> List[Path]:
raw_dirs = skills_cfg.get("external_dirs")
if not raw_dirs:
return []
result: List[Path] = []
if cache_key is not None:
_EXTERNAL_DIRS_CACHE[cache_key] = list(result)
return result
if isinstance(raw_dirs, str):
raw_dirs = [raw_dirs]
if not isinstance(raw_dirs, list):
@ -205,7 +241,7 @@ def get_external_skills_dirs() -> List[Path]:
hermes_home = get_hermes_home()
local_skills = get_skills_dir().resolve()
seen: Set[Path] = set()
result: List[Path] = []
result = []
for entry in raw_dirs:
entry = str(entry).strip()
@ -229,6 +265,8 @@ def get_external_skills_dirs() -> List[Path]:
else:
logger.debug("External skills dir does not exist, skipping: %s", p)
if cache_key is not None:
_EXTERNAL_DIRS_CACHE[cache_key] = list(result)
return result

View file

@ -0,0 +1,149 @@
"""Guards for ``get_external_skills_dirs`` mtime-based memo.
``get_external_skills_dirs()`` is called once per skill during banner
construction and tool registration on a typical install that's 120+
calls. Without caching, each call re-reads + YAML-parses the full
config.yaml (~85ms each, 10+ seconds total). This test pins the
behavior: first call parses, subsequent calls return cached result,
cache invalidates when config.yaml's mtime changes.
"""
from __future__ import annotations
import os
import time
from pathlib import Path
from unittest.mock import patch
import pytest
from agent import skill_utils
from agent.skill_utils import (
_external_dirs_cache_clear,
get_external_skills_dirs,
)
@pytest.fixture
def hermes_home_with_config(tmp_path, monkeypatch):
"""Isolated ``~/.hermes/`` with a config.yaml referencing one external dir."""
home = tmp_path / ".hermes"
home.mkdir()
external = tmp_path / "external_skills"
external.mkdir()
config = home / "config.yaml"
config.write_text(
"skills:\n"
f" external_dirs:\n"
f" - {external}\n",
encoding="utf-8",
)
monkeypatch.setenv("HERMES_HOME", str(home))
monkeypatch.setattr(Path, "home", lambda: tmp_path)
_external_dirs_cache_clear()
yield home, external, config
_external_dirs_cache_clear()
def test_returns_configured_external_dir(hermes_home_with_config):
_home, external, _cfg = hermes_home_with_config
result = get_external_skills_dirs()
assert result == [external.resolve()]
def test_cache_reuses_result_without_reparsing(hermes_home_with_config):
"""Subsequent calls hit the cache and skip YAML parsing entirely."""
_home, _external, _cfg = hermes_home_with_config
# Prime cache
get_external_skills_dirs()
# Patch yaml_load to raise — if cache works, it's never called again.
with patch.object(
skill_utils,
"yaml_load",
side_effect=AssertionError("yaml_load should not run on cache hit"),
):
# Many calls, none should trigger the patched yaml_load.
for _ in range(100):
get_external_skills_dirs()
def test_cache_invalidates_on_mtime_change(hermes_home_with_config):
"""A config.yaml edit invalidates the cache on the next call."""
_home, external, config = hermes_home_with_config
other = external.parent / "other_skills"
other.mkdir()
# Prime cache with original contents.
first = get_external_skills_dirs()
assert first == [external.resolve()]
# Rewrite config; bump mtime forward explicitly so filesystems with
# coarse mtime granularity still register the change on fast test
# systems.
config.write_text(
"skills:\n"
f" external_dirs:\n"
f" - {other}\n",
encoding="utf-8",
)
stat = config.stat()
future = stat.st_atime + 10
os.utime(config, (future, future))
second = get_external_skills_dirs()
assert second == [other.resolve()]
def test_returns_empty_when_config_missing(tmp_path, monkeypatch):
"""No config file → empty list, cached as empty."""
home = tmp_path / ".hermes"
home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(home))
monkeypatch.setattr(Path, "home", lambda: tmp_path)
_external_dirs_cache_clear()
assert get_external_skills_dirs() == []
def test_returned_list_is_a_copy(hermes_home_with_config):
"""Callers can't poison the cache by mutating the returned list."""
first = get_external_skills_dirs()
first.append(Path("/tmp/should-not-persist"))
second = get_external_skills_dirs()
assert Path("/tmp/should-not-persist") not in second
def test_cache_key_is_per_config_path(tmp_path, monkeypatch):
"""Two different HERMES_HOMEs keep separate cache entries."""
home_a = tmp_path / "home_a" / ".hermes"
home_a.mkdir(parents=True)
ext_a = tmp_path / "ext_a"
ext_a.mkdir()
(home_a / "config.yaml").write_text(
f"skills:\n external_dirs:\n - {ext_a}\n", encoding="utf-8"
)
home_b = tmp_path / "home_b" / ".hermes"
home_b.mkdir(parents=True)
ext_b = tmp_path / "ext_b"
ext_b.mkdir()
(home_b / "config.yaml").write_text(
f"skills:\n external_dirs:\n - {ext_b}\n", encoding="utf-8"
)
_external_dirs_cache_clear()
monkeypatch.setenv("HERMES_HOME", str(home_a))
assert get_external_skills_dirs() == [ext_a.resolve()]
monkeypatch.setenv("HERMES_HOME", str(home_b))
assert get_external_skills_dirs() == [ext_b.resolve()]
# And switching back still works — both entries coexist in the cache.
monkeypatch.setenv("HERMES_HOME", str(home_a))
assert get_external_skills_dirs() == [ext_a.resolve()]

View file

@ -52,10 +52,17 @@ FEISHU_DOC_READ_SCHEMA = {
def _check_feishu():
# Use ``importlib.util.find_spec`` — it checks whether ``lark_oapi``
# is importable without actually executing its ``__init__``.
# Executing the real import here costs ~5 seconds (the SDK eagerly
# loads websockets, dispatcher, every api/v2 model) and this probe
# fires at every ``hermes`` startup during tool-availability
# evaluation. Correctness is preserved because the actual tool
# handler still does the real import when invoked.
import importlib.util
try:
import lark_oapi # noqa: F401
return True
except ImportError:
return importlib.util.find_spec("lark_oapi") is not None
except (ImportError, ValueError):
return False

View file

@ -28,10 +28,12 @@ def get_client():
def _check_feishu():
# See ``tools/feishu_doc_tool.py::_check_feishu`` — ``find_spec`` keeps
# CLI startup fast (the SDK itself takes ~5s to import eagerly).
import importlib.util
try:
import lark_oapi # noqa: F401
return True
except ImportError:
return importlib.util.find_spec("lark_oapi") is not None
except (ImportError, ValueError):
return False

View file

@ -284,24 +284,28 @@ def _firecrawl_backend_help_suffix() -> str:
def _web_requires_env() -> list[str]:
"""Return tool metadata env vars for the currently enabled web backends."""
requires = [
"""Return tool metadata env vars for the currently enabled web backends.
The gateway env vars are always reported they're metadata strings
used by the tool registry to light up the tool when the variable is
set. Gating them on ``managed_nous_tools_enabled()`` only saved
string noise in the metadata list, but cost a synchronous HTTP
refresh against the Nous portal on every CLI startup (invoked at
tool-registration time). The behavioral contract is: if the env var
is set, the tool sees it; if not, it doesn't. Not-logged-in users
simply don't have the vars set, so the extra entries are harmless.
"""
return [
"EXA_API_KEY",
"PARALLEL_API_KEY",
"TAVILY_API_KEY",
"FIRECRAWL_API_KEY",
"FIRECRAWL_API_URL",
"FIRECRAWL_GATEWAY_URL",
"TOOL_GATEWAY_DOMAIN",
"TOOL_GATEWAY_SCHEME",
"TOOL_GATEWAY_USER_TOKEN",
]
if managed_nous_tools_enabled():
requires.extend(
[
"FIRECRAWL_GATEWAY_URL",
"TOOL_GATEWAY_DOMAIN",
"TOOL_GATEWAY_SCHEME",
"TOOL_GATEWAY_USER_TOKEN",
]
)
return requires
def _get_firecrawl_client():