mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
perf(/model): prewarm picker provider-models cache in background (#39847)
* fix: respect disabled auto-compaction on context overflow Port from anomalyco/opencode#30749. When compression.enabled is false, NO automatic compaction trigger may fire. The proactive token-threshold paths (preflight + post-response should_compress gate) already honoured the setting, but the three provider-overflow recovery paths in the agent loop — long-context-tier 429, 413 payload-too-large, and context-overflow — called _compress_context() unconditionally, silently compressing and rotating the session against the user's explicit choice. Add a single guard at the top of the overflow-recovery dispatch: when compression is disabled and the error is one of those three overflow classes, surface a terminal error (compaction_disabled: True) telling the user to /compress manually, /new, switch to a larger-context model, or reduce attachments. Manual /compress (force=True) is unaffected — it never enters this loop. Tests: new TestOverflowWithCompactionDisabled (413 + 400 overflow don't compress when disabled; control case still compresses when enabled). Existing overflow-recovery tests updated to enable compaction explicitly (they verify the recovery fires); fixture defaults flipped to True to match production (compression.enabled defaults to True). * perf(/model): prewarm picker provider-models cache in background The no-args /model picker calls list_authenticated_providers(), which fetches each authenticated provider's live /v1/models list serially. On a cold or stale (>1h TTL) cache that blocks ~1.5s on the user's critical path the first time /model is opened in a session. Warm that exact path off-thread during the idle window right after the CLI banner is shown: a once-per-process daemon thread runs list_authenticated_providers() to populate provider_models_cache.json for every authed provider. By the time the user types /model, the picker hits the warm disk cache (~136ms vs ~1500ms). 
Process-level Event guard (mirrors run_agent's _openrouter_prewarm_done) ensures at most one thread per process; fully exception-isolated so an offline/no-creds provider can never affect the session.
This commit is contained in:
parent
ca1fb32c26
commit
9ca11b35d5
3 changed files with 126 additions and 0 deletions
10
cli.py
10
cli.py
|
|
@ -13094,6 +13094,16 @@ class HermesCLI:
|
|||
_welcome_color = "#FFF8DC"
|
||||
self._console_print(f"[{_welcome_color}]{_welcome_text}[/]")
|
||||
|
||||
# Warm the /model picker's provider-models cache off-thread during this
|
||||
# idle window (banner shown, user about to type). The no-args picker
|
||||
# otherwise blocks ~1-2s on serial /v1/models fetches the first time
|
||||
# it's opened in a session. Fire-and-forget, guarded once-per-process.
|
||||
try:
|
||||
from hermes_cli.model_switch import prewarm_picker_cache_async
|
||||
prewarm_picker_cache_async()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Redaction opt-out warning (#17691): ON by default, loud when off.
|
||||
# The redactor snapshots its state at import time so any toggle now
|
||||
# won't affect the running process — we just want the operator to
|
||||
|
|
|
|||
|
|
@ -1117,6 +1117,62 @@ def switch_model(
|
|||
# Authenticated providers listing (for /model no-args display)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Process-level guard so the picker prewarm thread is spawned at most once per
# process — mirrors run_agent's _openrouter_prewarm_done. Without a guard a
# long-lived process (or repeated triggers) would leak one OS thread per call.
import threading as _threading  # noqa: E402

_picker_prewarm_done = _threading.Event()
# Serialises the check-and-set on the Event above. ``is_set()`` followed by
# ``set()`` is two separate operations, so without the lock two concurrent
# callers could both observe "not set" and each spawn a worker.
_picker_prewarm_lock = _threading.Lock()


def prewarm_picker_cache_async() -> Optional["_threading.Thread"]:
    """Warm the provider-models disk cache in a background daemon thread.

    The no-args ``/model`` picker calls ``list_authenticated_providers()``,
    which fetches each authenticated provider's live ``/v1/models`` list on a
    cold/stale cache. Those fetches are independent HTTP round-trips but run
    serially, so the first ``/model`` open in a session (or any open after the
    1h cache TTL expires) blocks ~1-2s on the user's critical path.

    This pre-warms that exact path off-thread during idle session time: it
    runs ``list_authenticated_providers()`` once, which populates
    ``provider_models_cache.json`` for every authed provider. By the time the
    user types ``/model``, the picker hits the warm disk cache and renders in
    ~100ms.

    Fire-and-forget. The process-level Event guard is tested and set under a
    lock, so at most one caller ever spawns the worker even when several
    threads call this concurrently. The worker swallows every exception
    raised by the warm path, so a slow or offline provider can never affect
    the session.

    Returns:
        The spawned daemon thread (useful for tests that want to ``join``
        it), or ``None`` if a prewarm was already triggered in this process.
    """
    # Atomic test-and-set: only the first caller gets past this point.
    with _picker_prewarm_lock:
        if _picker_prewarm_done.is_set():
            return None
        _picker_prewarm_done.set()

    def _warm() -> None:
        try:
            # Imported lazily inside the worker so the import cost (and any
            # import failure) stays off the caller's thread.
            from hermes_cli.inventory import load_picker_context

            ctx = load_picker_context()
            # Calling this is what populates cached_provider_model_ids() ->
            # provider_models_cache.json for each authed provider. We discard
            # the result; the side effect (warm disk cache) is the point.
            list_authenticated_providers(
                current_provider=ctx.current_provider,
                current_base_url=ctx.current_base_url,
                current_model=ctx.current_model,
                user_providers=ctx.user_providers,
                custom_providers=ctx.custom_providers,
                max_models=50,
            )
        except Exception:
            # Best-effort warmup — never surface errors into the session.
            logger.debug("picker cache prewarm failed", exc_info=True)

    worker = _threading.Thread(
        target=_warm, daemon=True, name="picker-cache-prewarm"
    )
    worker.start()
    return worker
|
||||
|
||||
|
||||
def list_authenticated_providers(
|
||||
current_provider: str = "",
|
||||
current_base_url: str = "",
|
||||
|
|
|
|||
60
tests/hermes_cli/test_picker_prewarm.py
Normal file
60
tests/hermes_cli/test_picker_prewarm.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
"""Tests for the /model picker background cache prewarm.
|
||||
|
||||
``prewarm_picker_cache_async()`` warms the provider-models disk cache off the
|
||||
user's critical path so the first ``/model`` open in a session is fast instead
|
||||
of blocking ~1-2s on serial /v1/models fetches. These pin the two contracts
|
||||
that matter: it runs the warm path exactly once per process (no thread leak),
|
||||
and it delegates to ``list_authenticated_providers`` to do the warming.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
import hermes_cli.model_switch as ms
|
||||
|
||||
|
||||
def _reset_guard():
    """Clear the process-level prewarm guard so the next call spawns a thread."""
    guard = ms._picker_prewarm_done
    guard.clear()
|
||||
|
||||
|
||||
def test_prewarm_runs_list_authenticated_providers_once():
    """First call spawns a thread that calls list_authenticated_providers;
    the warm side effect is delegated there (which disk-caches per provider)."""
    _reset_guard()
    try:
        with patch.object(
            ms, "list_authenticated_providers", return_value=[]
        ) as mock_list:
            t = ms.prewarm_picker_cache_async()
            assert t is not None, "first call must spawn a prewarm thread"
            # Join inside the patch context so the worker sees the mock.
            t.join(timeout=10)
            assert not t.is_alive(), "prewarm thread should finish promptly"
            mock_list.assert_called_once()
    finally:
        # Always restore the process-wide guard, even when an assertion
        # above fails, so this test cannot leak state into later tests.
        _reset_guard()
|
||||
|
||||
|
||||
def test_prewarm_guard_is_once_per_process():
    """The process-level Event guard must make repeat calls no-ops so a
    long-lived process never leaks one OS thread per call."""
    _reset_guard()
    try:
        with patch.object(ms, "list_authenticated_providers", return_value=[]):
            t1 = ms.prewarm_picker_cache_async()
            assert t1 is not None
            t1.join(timeout=10)
            assert not t1.is_alive(), "prewarm thread should finish promptly"
            # Subsequent calls return None (guard set) — no new thread.
            assert ms.prewarm_picker_cache_async() is None
            assert ms.prewarm_picker_cache_async() is None
    finally:
        # Always restore the process-wide guard, even when an assertion
        # above fails, so this test cannot leak state into later tests.
        _reset_guard()
|
||||
|
||||
|
||||
def test_prewarm_never_raises_on_failure():
    """A failing/offline provider path must be fully swallowed — the prewarm
    is best-effort and must never surface errors into the session."""
    _reset_guard()
    try:
        with patch.object(
            ms, "list_authenticated_providers", side_effect=RuntimeError("boom")
        ):
            t = ms.prewarm_picker_cache_async()
            assert t is not None
            # join must not raise; the worker swallows the exception internally.
            t.join(timeout=10)
            assert not t.is_alive()
    finally:
        # Always restore the process-wide guard, even when an assertion
        # above fails, so this test cannot leak state into later tests.
        _reset_guard()
|
||||
Loading…
Add table
Add a link
Reference in a new issue