mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-09 03:11:58 +00:00
perf(cli): cut ~19s from 'hermes' cold start (skills cache + lazy Feishu + no Nous HTTP) (#22138)
Interactive `hermes` launch drops from ~21s to ~2.5s. Three independent
fixes, each targets a distinct hot spot in the banner / tool-registration
path that fires on every CLI invocation.
1. `get_external_skills_dirs()` in-process mtime cache (~10s saved)
The function re-read + YAML-parsed the full ~/.hermes/config.yaml on
every call. Banner build invokes it once per skill to resolve the
category column, which on a 120-skill install meant ~120 reparses of
a 15 KB config (~85 ms each). Added a
`(config_path, mtime_ns) -> list[Path]` memo; stat() is ~2 us vs
~85 ms for the parse. Edits to config.yaml invalidate the cache on
the next call via mtime.
2. Feishu availability probe uses `importlib.util.find_spec` (~5.2s saved)
`tools/feishu_doc_tool.py::_check_feishu` and the identical helper in
`feishu_drive_tool.py` were calling `import lark_oapi` purely to
detect whether the SDK was installed. Executing the real import pulls
in websockets + dispatcher + every v2 API model — ~5 seconds of work
that fires at every tool-registry bootstrap. `find_spec` answers the
same question ("is lark_oapi importable?") without executing the
module. The actual tool handlers still do the real import on invoke,
so runtime behavior is unchanged.
3. `_web_requires_env` no longer triggers Nous portal refresh (~800ms saved)
`tools/web_tools.py::_web_requires_env` used
`managed_nous_tools_enabled()` to gate four gateway env-var names in
the returned list. The gate called `get_nous_auth_status()` ->
`resolve_nous_runtime_credentials()` -> live HTTP POST to the portal
on every tool-registry bootstrap. But the list is pure metadata — if
the env var is set at runtime, the tool lights up; otherwise it
doesn't. Including the four names unconditionally is harmless for
unsubscribed users (vars just aren't set) and eliminates the sync
HTTP round trip from startup.
Test:
- tests/agent/test_external_skills_dirs_cache.py (new, 6 cases):
returns config'd dir, caches on second call (yaml_load patched to
raise — never invoked), invalidates on mtime bump, empty when config
missing, returned list is a defensive copy, per-HERMES_HOME cache key
isolation.
- Existing tests/agent/test_external_skills.py and tests/tools/
continue to pass modulo pre-existing flakes on main (test_delegate,
test_send_message — unrelated, pass in isolation).
Measured: bare `hermes` (cold → REPL ready) 21,519ms -> 2,618ms on
Teknium's install (119 skills, 15 KB config.yaml, Nous auth logged in,
lark_oapi installed). 8x faster.
This commit is contained in:
parent
d606df8126
commit
0ec052ca24
5 changed files with 220 additions and 20 deletions
|
|
@ -170,6 +170,19 @@ def _normalize_string_set(values) -> Set[str]:
|
|||
|
||||
# ── External skills directories ──────────────────────────────────────────
|
||||
|
||||
# (config_path_str, mtime_ns) -> resolved external dirs list. Keyed by
|
||||
# mtime_ns so a config.yaml edit mid-run is picked up automatically;
|
||||
# otherwise every call would re-read + re-YAML-parse the 15KB config,
|
||||
# which becomes the dominant cost of ``hermes`` startup when ~120 skills
|
||||
# each trigger a category lookup during banner construction (10+ seconds
|
||||
# of pure waste).
|
||||
_EXTERNAL_DIRS_CACHE: Dict[Tuple[str, int], List[Path]] = {}
|
||||
|
||||
|
||||
def _external_dirs_cache_clear() -> None:
|
||||
"""Test hook — drop the in-process cache."""
|
||||
_EXTERNAL_DIRS_CACHE.clear()
|
||||
|
||||
|
||||
def get_external_skills_dirs() -> List[Path]:
|
||||
"""Read ``skills.external_dirs`` from config.yaml and return validated paths.
|
||||
|
|
@ -177,10 +190,30 @@ def get_external_skills_dirs() -> List[Path]:
|
|||
Each entry is expanded (``~`` and ``${VAR}``) and resolved to an absolute
|
||||
path. Only directories that actually exist are returned. Duplicates and
|
||||
paths that resolve to the local ``~/.hermes/skills/`` are silently skipped.
|
||||
|
||||
Cached in-process, keyed on ``config.yaml`` mtime — the function is
|
||||
called once per skill during banner / tool-registry scans, and YAML
|
||||
parsing a non-trivial config dominates ``hermes`` cold-start time
|
||||
when the cache is absent.
|
||||
"""
|
||||
config_path = get_config_path()
|
||||
if not config_path.exists():
|
||||
return []
|
||||
|
||||
# Cache key: (absolute path, mtime_ns). stat() is ~2us vs ~85ms for
|
||||
# the full YAML parse, so the fast path is nearly free.
|
||||
try:
|
||||
stat = config_path.stat()
|
||||
cache_key: Tuple[str, int] = (str(config_path), stat.st_mtime_ns)
|
||||
except OSError:
|
||||
cache_key = None # type: ignore[assignment]
|
||||
|
||||
if cache_key is not None:
|
||||
cached = _EXTERNAL_DIRS_CACHE.get(cache_key)
|
||||
if cached is not None:
|
||||
# Return a copy so callers can't mutate the cached list.
|
||||
return list(cached)
|
||||
|
||||
try:
|
||||
parsed = yaml_load(config_path.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
|
|
@ -194,7 +227,10 @@ def get_external_skills_dirs() -> List[Path]:
|
|||
|
||||
raw_dirs = skills_cfg.get("external_dirs")
|
||||
if not raw_dirs:
|
||||
return []
|
||||
result: List[Path] = []
|
||||
if cache_key is not None:
|
||||
_EXTERNAL_DIRS_CACHE[cache_key] = list(result)
|
||||
return result
|
||||
if isinstance(raw_dirs, str):
|
||||
raw_dirs = [raw_dirs]
|
||||
if not isinstance(raw_dirs, list):
|
||||
|
|
@ -205,7 +241,7 @@ def get_external_skills_dirs() -> List[Path]:
|
|||
hermes_home = get_hermes_home()
|
||||
local_skills = get_skills_dir().resolve()
|
||||
seen: Set[Path] = set()
|
||||
result: List[Path] = []
|
||||
result = []
|
||||
|
||||
for entry in raw_dirs:
|
||||
entry = str(entry).strip()
|
||||
|
|
@ -229,6 +265,8 @@ def get_external_skills_dirs() -> List[Path]:
|
|||
else:
|
||||
logger.debug("External skills dir does not exist, skipping: %s", p)
|
||||
|
||||
if cache_key is not None:
|
||||
_EXTERNAL_DIRS_CACHE[cache_key] = list(result)
|
||||
return result
|
||||
|
||||
|
||||
|
|
|
|||
149
tests/agent/test_external_skills_dirs_cache.py
Normal file
149
tests/agent/test_external_skills_dirs_cache.py
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
"""Guards for ``get_external_skills_dirs`` mtime-based memo.
|
||||
|
||||
``get_external_skills_dirs()`` is called once per skill during banner
|
||||
construction and tool registration — on a typical install that's 120+
|
||||
calls. Without caching, each call re-reads + YAML-parses the full
|
||||
config.yaml (~85ms each, 10+ seconds total). This test pins the
|
||||
behavior: first call parses, subsequent calls return cached result,
|
||||
cache invalidates when config.yaml's mtime changes.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from agent import skill_utils
|
||||
from agent.skill_utils import (
|
||||
_external_dirs_cache_clear,
|
||||
get_external_skills_dirs,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def hermes_home_with_config(tmp_path, monkeypatch):
|
||||
"""Isolated ``~/.hermes/`` with a config.yaml referencing one external dir."""
|
||||
home = tmp_path / ".hermes"
|
||||
home.mkdir()
|
||||
external = tmp_path / "external_skills"
|
||||
external.mkdir()
|
||||
|
||||
config = home / "config.yaml"
|
||||
config.write_text(
|
||||
"skills:\n"
|
||||
f" external_dirs:\n"
|
||||
f" - {external}\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
monkeypatch.setenv("HERMES_HOME", str(home))
|
||||
monkeypatch.setattr(Path, "home", lambda: tmp_path)
|
||||
_external_dirs_cache_clear()
|
||||
yield home, external, config
|
||||
_external_dirs_cache_clear()
|
||||
|
||||
|
||||
def test_returns_configured_external_dir(hermes_home_with_config):
|
||||
_home, external, _cfg = hermes_home_with_config
|
||||
result = get_external_skills_dirs()
|
||||
assert result == [external.resolve()]
|
||||
|
||||
|
||||
def test_cache_reuses_result_without_reparsing(hermes_home_with_config):
|
||||
"""Subsequent calls hit the cache and skip YAML parsing entirely."""
|
||||
_home, _external, _cfg = hermes_home_with_config
|
||||
|
||||
# Prime cache
|
||||
get_external_skills_dirs()
|
||||
|
||||
# Patch yaml_load to raise — if cache works, it's never called again.
|
||||
with patch.object(
|
||||
skill_utils,
|
||||
"yaml_load",
|
||||
side_effect=AssertionError("yaml_load should not run on cache hit"),
|
||||
):
|
||||
# Many calls, none should trigger the patched yaml_load.
|
||||
for _ in range(100):
|
||||
get_external_skills_dirs()
|
||||
|
||||
|
||||
def test_cache_invalidates_on_mtime_change(hermes_home_with_config):
|
||||
"""A config.yaml edit invalidates the cache on the next call."""
|
||||
_home, external, config = hermes_home_with_config
|
||||
other = external.parent / "other_skills"
|
||||
other.mkdir()
|
||||
|
||||
# Prime cache with original contents.
|
||||
first = get_external_skills_dirs()
|
||||
assert first == [external.resolve()]
|
||||
|
||||
# Rewrite config; bump mtime forward explicitly so filesystems with
|
||||
# coarse mtime granularity still register the change on fast test
|
||||
# systems.
|
||||
config.write_text(
|
||||
"skills:\n"
|
||||
f" external_dirs:\n"
|
||||
f" - {other}\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
stat = config.stat()
|
||||
future = stat.st_atime + 10
|
||||
os.utime(config, (future, future))
|
||||
|
||||
second = get_external_skills_dirs()
|
||||
assert second == [other.resolve()]
|
||||
|
||||
|
||||
def test_returns_empty_when_config_missing(tmp_path, monkeypatch):
|
||||
"""No config file → empty list, cached as empty."""
|
||||
home = tmp_path / ".hermes"
|
||||
home.mkdir()
|
||||
monkeypatch.setenv("HERMES_HOME", str(home))
|
||||
monkeypatch.setattr(Path, "home", lambda: tmp_path)
|
||||
_external_dirs_cache_clear()
|
||||
|
||||
assert get_external_skills_dirs() == []
|
||||
|
||||
|
||||
def test_returned_list_is_a_copy(hermes_home_with_config):
|
||||
"""Callers can't poison the cache by mutating the returned list."""
|
||||
first = get_external_skills_dirs()
|
||||
first.append(Path("/tmp/should-not-persist"))
|
||||
|
||||
second = get_external_skills_dirs()
|
||||
assert Path("/tmp/should-not-persist") not in second
|
||||
|
||||
|
||||
def test_cache_key_is_per_config_path(tmp_path, monkeypatch):
|
||||
"""Two different HERMES_HOMEs keep separate cache entries."""
|
||||
home_a = tmp_path / "home_a" / ".hermes"
|
||||
home_a.mkdir(parents=True)
|
||||
ext_a = tmp_path / "ext_a"
|
||||
ext_a.mkdir()
|
||||
(home_a / "config.yaml").write_text(
|
||||
f"skills:\n external_dirs:\n - {ext_a}\n", encoding="utf-8"
|
||||
)
|
||||
|
||||
home_b = tmp_path / "home_b" / ".hermes"
|
||||
home_b.mkdir(parents=True)
|
||||
ext_b = tmp_path / "ext_b"
|
||||
ext_b.mkdir()
|
||||
(home_b / "config.yaml").write_text(
|
||||
f"skills:\n external_dirs:\n - {ext_b}\n", encoding="utf-8"
|
||||
)
|
||||
|
||||
_external_dirs_cache_clear()
|
||||
|
||||
monkeypatch.setenv("HERMES_HOME", str(home_a))
|
||||
assert get_external_skills_dirs() == [ext_a.resolve()]
|
||||
|
||||
monkeypatch.setenv("HERMES_HOME", str(home_b))
|
||||
assert get_external_skills_dirs() == [ext_b.resolve()]
|
||||
|
||||
# And switching back still works — both entries coexist in the cache.
|
||||
monkeypatch.setenv("HERMES_HOME", str(home_a))
|
||||
assert get_external_skills_dirs() == [ext_a.resolve()]
|
||||
|
|
@ -52,10 +52,17 @@ FEISHU_DOC_READ_SCHEMA = {
|
|||
|
||||
|
||||
def _check_feishu():
|
||||
# Use ``importlib.util.find_spec`` — it checks whether ``lark_oapi``
|
||||
# is importable without actually executing its ``__init__``.
|
||||
# Executing the real import here costs ~5 seconds (the SDK eagerly
|
||||
# loads websockets, dispatcher, every api/v2 model) and this probe
|
||||
# fires at every ``hermes`` startup during tool-availability
|
||||
# evaluation. Correctness is preserved because the actual tool
|
||||
# handler still does the real import when invoked.
|
||||
import importlib.util
|
||||
try:
|
||||
import lark_oapi # noqa: F401
|
||||
return True
|
||||
except ImportError:
|
||||
return importlib.util.find_spec("lark_oapi") is not None
|
||||
except (ImportError, ValueError):
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -28,10 +28,12 @@ def get_client():
|
|||
|
||||
|
||||
def _check_feishu():
|
||||
# See ``tools/feishu_doc_tool.py::_check_feishu`` — ``find_spec`` keeps
|
||||
# CLI startup fast (the SDK itself takes ~5s to import eagerly).
|
||||
import importlib.util
|
||||
try:
|
||||
import lark_oapi # noqa: F401
|
||||
return True
|
||||
except ImportError:
|
||||
return importlib.util.find_spec("lark_oapi") is not None
|
||||
except (ImportError, ValueError):
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -284,24 +284,28 @@ def _firecrawl_backend_help_suffix() -> str:
|
|||
|
||||
|
||||
def _web_requires_env() -> list[str]:
|
||||
"""Return tool metadata env vars for the currently enabled web backends."""
|
||||
requires = [
|
||||
"""Return tool metadata env vars for the currently enabled web backends.
|
||||
|
||||
The gateway env vars are always reported — they're metadata strings
|
||||
used by the tool registry to light up the tool when the variable is
|
||||
set. Gating them on ``managed_nous_tools_enabled()`` only saved
|
||||
string noise in the metadata list, but cost a synchronous HTTP
|
||||
refresh against the Nous portal on every CLI startup (invoked at
|
||||
tool-registration time). The behavioral contract is: if the env var
|
||||
is set, the tool sees it; if not, it doesn't. Not-logged-in users
|
||||
simply don't have the vars set, so the extra entries are harmless.
|
||||
"""
|
||||
return [
|
||||
"EXA_API_KEY",
|
||||
"PARALLEL_API_KEY",
|
||||
"TAVILY_API_KEY",
|
||||
"FIRECRAWL_API_KEY",
|
||||
"FIRECRAWL_API_URL",
|
||||
"FIRECRAWL_GATEWAY_URL",
|
||||
"TOOL_GATEWAY_DOMAIN",
|
||||
"TOOL_GATEWAY_SCHEME",
|
||||
"TOOL_GATEWAY_USER_TOKEN",
|
||||
]
|
||||
if managed_nous_tools_enabled():
|
||||
requires.extend(
|
||||
[
|
||||
"FIRECRAWL_GATEWAY_URL",
|
||||
"TOOL_GATEWAY_DOMAIN",
|
||||
"TOOL_GATEWAY_SCHEME",
|
||||
"TOOL_GATEWAY_USER_TOKEN",
|
||||
]
|
||||
)
|
||||
return requires
|
||||
|
||||
|
||||
def _get_firecrawl_client():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue