Merge pull request #17190 from NousResearch/bb/tui-cold-start-profiling

perf(tui): cut visible cold start ~57% with lazy agent init
This commit is contained in:
brooklyn! 2026-04-28 22:45:14 -07:00 committed by GitHub
commit 5e68503d2f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 309 additions and 161 deletions

View file

@@ -770,11 +770,19 @@ class TestLoadConfig(unittest.TestCase):
def test_returns_code_execution_section(self):
from tools.code_execution_tool import _load_config
mock_cli = MagicMock()
mock_cli.CLI_CONFIG = {"code_execution": {"timeout": 120, "max_tool_calls": 10}}
with patch.dict("sys.modules", {"cli": mock_cli}):
with patch("hermes_cli.config.read_raw_config",
return_value={"code_execution": {"timeout": 120, "max_tool_calls": 10}}):
result = _load_config()
self.assertIsInstance(result, dict)
self.assertEqual(result, {"timeout": 120, "max_tool_calls": 10})
def test_does_not_import_interactive_cli(self):
from tools.code_execution_tool import _load_config
mock_cli = MagicMock()
mock_cli.CLI_CONFIG = {"code_execution": {"timeout": 999}}
with patch.dict("sys.modules", {"cli": mock_cli}), \
patch("hermes_cli.config.read_raw_config", return_value={}):
result = _load_config()
self.assertEqual(result, {})
# ---------------------------------------------------------------------------

View file

@@ -1309,10 +1309,20 @@ def _kill_process_group(proc, escalate: bool = False):
def _load_config() -> dict:
"""Load code_execution config from CLI_CONFIG if available."""
"""Load code_execution config without importing the interactive CLI.
This helper is called while building the module-level execute_code schema
during tool discovery. Importing ``cli`` here pulls prompt_toolkit/Rich and
a large chunk of the classic REPL onto every agent startup path, including
``hermes --tui`` where it is never used. Read the lightweight raw config
instead; the config layer already caches by (mtime, size), and an absent
key cleanly falls back to DEFAULT_EXECUTION_MODE.
"""
try:
from cli import CLI_CONFIG
return CLI_CONFIG.get("code_execution", {})
from hermes_cli.config import read_raw_config
cfg = read_raw_config().get("code_execution", {})
return cfg if isinstance(cfg, dict) else {}
except Exception:
return {}

View file

@@ -165,11 +165,29 @@ def main():
# a model_tools.py module-level side effect; moved to explicit
# startup calls to avoid freezing the gateway's loop on lazy import
# (#16856).
#
# Cold-start guard: importing ``tools.mcp_tool`` transitively pulls the
# full MCP SDK (mcp, pydantic, httpx, jsonschema, starlette parsers —
# ~200ms on macOS), which runs on the TUI's critical path before
# ``gateway.ready`` can be emitted. The overwhelming majority of users
# have no ``mcp_servers`` configured, in which case every byte of that
# import is wasted. Check the config first (cheap — it's already been
# loaded once by ``_config_mtime`` elsewhere) and only pay the import
# cost when there's actually MCP work to do.
try:
from tools.mcp_tool import discover_mcp_tools
discover_mcp_tools()
from hermes_cli.config import read_raw_config
_mcp_servers = (read_raw_config() or {}).get("mcp_servers")
_has_mcp_servers = isinstance(_mcp_servers, dict) and len(_mcp_servers) > 0
except Exception:
pass
# Be conservative: if we can't decide, fall back to the old
# behaviour and let the discovery path handle its own errors.
_has_mcp_servers = True
if _has_mcp_servers:
try:
from tools.mcp_tool import discover_mcp_tools
discover_mcp_tools()
except Exception:
pass
if not write_json({
"jsonrpc": "2.0",

View file

@@ -465,6 +465,119 @@ def _wait_agent(session: dict, rid: str, timeout: float = 30.0) -> dict | None:
return _err(rid, 5032, err) if err else None
def _start_agent_build(sid: str, session: dict) -> None:
"""Start building the real AIAgent for a TUI session, once.
Classic `hermes` shows the prompt before constructing AIAgent; the TUI used
to eagerly build it during session.create, making startup feel blocked on
tool discovery/model metadata even though the composer was visible. Keep
the shell responsive by deferring this work until the first prompt (or any
command that actually needs the agent), while retaining the same ready/error
event contract for the frontend.
"""
ready = session.get("agent_ready")
if ready is None:
return
lock = session.setdefault("agent_build_lock", threading.Lock())
with lock:
if ready.is_set() or session.get("agent_build_started"):
return
session["agent_build_started"] = True
key = session["session_key"]
def _build() -> None:
current = _sessions.get(sid)
if current is None:
ready.set()
return
worker = None
notify_registered = False
try:
tokens = _set_session_context(key)
try:
agent = _make_agent(sid, key)
finally:
_clear_session_context(tokens)
db = _get_db()
if db is not None:
db.create_session(key, source="tui", model=_resolve_model())
pending_title = (current.get("pending_title") or "").strip()
if pending_title:
try:
title_applied = db.set_session_title(key, pending_title)
if title_applied:
current["pending_title"] = None
else:
existing_row = db.get_session(key)
existing_title = ((existing_row or {}).get("title") or "").strip()
if existing_title == pending_title:
current["pending_title"] = None
else:
logger.info(
"Pending title still queued for session %s (wanted=%r, current=%r)",
sid,
pending_title,
existing_title,
)
except ValueError as e:
current["pending_title"] = None
logger.info("Dropping pending title for session %s: %s", sid, e)
except Exception:
logger.warning("Failed to apply pending title for session %s", sid, exc_info=True)
current["agent"] = agent
try:
worker = _SlashWorker(key, getattr(agent, "model", _resolve_model()))
current["slash_worker"] = worker
except Exception:
pass
try:
from tools.approval import (
register_gateway_notify,
load_permanent_allowlist,
)
register_gateway_notify(key, lambda data: _emit("approval.request", sid, data))
notify_registered = True
load_permanent_allowlist()
except Exception:
pass
_wire_callbacks(sid)
_notify_session_boundary("on_session_reset", key)
info = _session_info(agent)
warn = _probe_credentials(agent)
if warn:
info["credential_warning"] = warn
cfg_warn = _probe_config_health(_load_cfg())
if cfg_warn:
info["config_warning"] = cfg_warn
logger.warning(cfg_warn)
_emit("session.info", sid, info)
except Exception as e:
current["agent_error"] = str(e)
_emit("error", sid, {"message": f"agent init failed: {e}"})
finally:
if _sessions.get(sid) is not current:
if worker is not None:
try:
worker.close()
except Exception:
pass
if notify_registered:
try:
from tools.approval import unregister_gateway_notify
unregister_gateway_notify(key)
except Exception:
pass
ready.set()
threading.Thread(target=_build, daemon=True).start()
def _sess_nowait(params, rid):
s = _sessions.get(params.get("session_id") or "")
return (s, None) if s else (None, _err(rid, 4001, "session not found"))
@@ -472,7 +585,10 @@ def _sess_nowait(params, rid):
def _sess(params, rid):
s, err = _sess_nowait(params, rid)
return (None, err) if err else (s, _wait_agent(s, rid))
if err:
return (None, err)
_start_agent_build(params.get("session_id") or "", s)
return (s, _wait_agent(s, rid))
def _normalize_completion_path(path_part: str) -> str:
@@ -1627,129 +1743,18 @@ def _(rid, params: dict) -> dict:
"transport": current_transport() or _stdio_transport,
}
def _build() -> None:
# Return the lightweight session immediately so Ink can paint the composer
# + skeleton panel, then build the real AIAgent just after this response is
# flushed. This keeps startup responsive while still hydrating tools/skills
# without requiring the user to submit a first prompt.
def _deferred_build() -> None:
session = _sessions.get(sid)
if session is None:
# session.close ran before the build thread got scheduled.
ready.set()
return
if session is not None:
_start_agent_build(sid, session)
# Track what we allocate so we can clean up if session.close
# races us to the finish line. session.close pops _sessions[sid]
# unconditionally and tries to close the slash_worker it finds;
# if _build is still mid-construction when close runs, close
# finds slash_worker=None / notify unregistered and returns
# cleanly — leaving us, the build thread, to later install the
# worker + notify on an orphaned session dict. The finally
# block below detects the orphan and cleans up instead of
# leaking a subprocess and a global notify registration.
worker = None
notify_registered = False
try:
tokens = _set_session_context(key)
try:
agent = _make_agent(sid, key)
finally:
_clear_session_context(tokens)
db = _get_db()
if db is not None:
db.create_session(key, source="tui", model=_resolve_model())
pending_title = (session.get("pending_title") or "").strip()
if pending_title:
try:
title_applied = db.set_session_title(key, pending_title)
if title_applied:
session["pending_title"] = None
else:
existing_row = db.get_session(key)
existing_title = (
(existing_row or {}).get("title") or ""
).strip()
if existing_title == pending_title:
session["pending_title"] = None
else:
logger.info(
"Pending title still queued for session %s (wanted=%r, current=%r)",
sid,
pending_title,
existing_title,
)
except ValueError as e:
# Queued title can become invalid/duplicate between queue time
# and DB row creation. Drop the queue and log the reason so
# future /title reads don't surface a stuck pending value.
session["pending_title"] = None
logger.info(
"Dropping pending title for session %s: %s",
sid,
e,
)
except Exception:
logger.warning(
"Failed to apply pending title for session %s",
sid,
exc_info=True,
)
session["agent"] = agent
try:
worker = _SlashWorker(key, getattr(agent, "model", _resolve_model()))
session["slash_worker"] = worker
except Exception:
pass
try:
from tools.approval import (
register_gateway_notify,
load_permanent_allowlist,
)
register_gateway_notify(
key, lambda data: _emit("approval.request", sid, data)
)
notify_registered = True
load_permanent_allowlist()
except Exception:
pass
_wire_callbacks(sid)
_notify_session_boundary("on_session_reset", key)
info = _session_info(agent)
warn = _probe_credentials(agent)
if warn:
info["credential_warning"] = warn
cfg_warn = _probe_config_health(_load_cfg())
if cfg_warn:
info["config_warning"] = cfg_warn
logger.warning(cfg_warn)
_emit("session.info", sid, info)
except Exception as e:
session["agent_error"] = str(e)
_emit("error", sid, {"message": f"agent init failed: {e}"})
finally:
# Orphan check: if session.close raced us and popped
# _sessions[sid] while we were building, the dict we just
# populated is unreachable. Clean up the subprocess and
# the global notify registration ourselves — session.close
# couldn't see them at the time it ran.
if _sessions.get(sid) is not session:
if worker is not None:
try:
worker.close()
except Exception:
pass
if notify_registered:
try:
from tools.approval import unregister_gateway_notify
unregister_gateway_notify(key)
except Exception:
pass
ready.set()
threading.Thread(target=_build, daemon=True).start()
build_timer = threading.Timer(0.05, _deferred_build)
build_timer.daemon = True
build_timer.start()
return _ok(
rid,
@@ -1760,6 +1765,7 @@ def _(rid, params: dict) -> dict:
"tools": {},
"skills": {},
"cwd": os.getenv("TERMINAL_CWD", os.getcwd()),
"lazy": True,
},
},
)
@@ -1901,7 +1907,7 @@ def _(rid, params: dict) -> dict:
@method("session.title")
def _(rid, params: dict) -> dict:
session, err = _sess(params, rid)
session, err = _sess_nowait(params, rid)
if err:
return err
db = _get_db()
@@ -1964,13 +1970,16 @@ def _(rid, params: dict) -> dict:
@method("session.usage")
def _(rid, params: dict) -> dict:
session, err = _sess(params, rid)
return err or _ok(rid, _get_usage(session["agent"]))
session, err = _sess_nowait(params, rid)
if err:
return err
agent = session.get("agent")
return _ok(rid, _get_usage(agent) if agent is not None else {"calls": 0, "input": 0, "output": 0, "total": 0})
@method("session.history")
def _(rid, params: dict) -> dict:
session, err = _sess(params, rid)
session, err = _sess_nowait(params, rid)
if err:
return err
history = list(session.get("history", []))
@@ -2437,13 +2446,31 @@ def _(rid, params: dict) -> dict:
@method("prompt.submit")
def _(rid, params: dict) -> dict:
sid, text = params.get("session_id", ""), params.get("text", "")
session, err = _sess(params, rid)
session, err = _sess_nowait(params, rid)
if err:
return err
with session["history_lock"]:
if session.get("running"):
return _err(rid, 4009, "session busy")
session["running"] = True
_start_agent_build(sid, session)
def run_after_agent_ready() -> None:
err = _wait_agent(session, rid)
if err:
_emit("error", sid, {"message": err.get("error", {}).get("message", "agent initialization failed")})
with session["history_lock"]:
session["running"] = False
return
_run_prompt_submit(rid, sid, session, text)
threading.Thread(target=run_after_agent_ready, daemon=True).start()
return _ok(rid, {"status": "streaming"})
def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None:
with session["history_lock"]:
history = list(session["history"])
history_version = int(session.get("history_version", 0))
images = list(session.get("attached_images", []))
@@ -2682,7 +2709,6 @@ def _(rid, params: dict) -> dict:
session["running"] = False
threading.Thread(target=run, daemon=True).start()
return _ok(rid, {"status": "streaming"})
@method("clipboard.paste")

View file

@@ -5,8 +5,7 @@ import type { GatewayClient } from '../gatewayClient.js'
import type {
ConfigFullResponse,
ConfigMtimeResponse,
ReloadMcpResponse,
VoiceToggleResponse
ReloadMcpResponse
} from '../gatewayTypes.js'
import { asRpcResult } from '../lib/rpc.js'
@@ -118,7 +117,11 @@ export function useConfigSync({ gw, setBellOnComplete, setVoiceEnabled, sid }: U
return
}
quietRpc<VoiceToggleResponse>(gw, 'voice.toggle', { action: 'status' }).then(r => setVoiceEnabled(!!r?.enabled))
// Keep startup cheap: voice.toggle status probes optional audio/STT deps and
// can run long enough to delay prompt.submit on the single stdio RPC pipe.
// Environment flags are enough to initialize the UI bit; the heavier status
// check still runs when the user opens /voice.
setVoiceEnabled(process.env.HERMES_VOICE === '1')
quietRpc<ConfigMtimeResponse>(gw, 'config.get', { key: 'mtime' }).then(r => {
mtimeRef.current = Number(r?.mtime ?? 0)
})

View file

@@ -126,6 +126,13 @@ export function useSubmission(opts: UseSubmissionOptions) {
return sys('session not ready yet')
}
// Plain prompts are the common path and should not pay an extra RPC
// before prompt.submit. File-drop detection still runs for absolute,
// tilde, file://, and explicit relative paths.
if (!looksLikeSlashCommand(text) && !/(?:^|\s)(?:file:\/\/|~\/|\.?\.\/|\/)[^\s]+/.test(text)) {
return startSubmit(text, expand(text), showUserMessage)
}
gw.request<InputDetectDropResponse>('input.detect_drop', { session_id: sid, text })
.then(r => {
if (!r?.matched) {

View file

@@ -68,7 +68,7 @@ const TranscriptPane = memo(function TranscriptPane({
<Box flexDirection="column" paddingTop={1}>
<Banner t={ui.theme} />
{row.msg.info?.version && <SessionPanel info={row.msg.info} sid={ui.sid} t={ui.theme} />}
{row.msg.info && <SessionPanel info={row.msg.info} sid={ui.sid} t={ui.theme} />}
</Box>
) : row.msg.kind === 'panel' && row.msg.panelData ? (
<Panel sections={row.msg.panelData.sections} t={ui.theme} title={row.msg.panelData.title} />

View file

@@ -1,10 +1,32 @@
import { Box, Text, useStdout } from '@hermes/ink'
import { useEffect, useState } from 'react'
import unicodeSpinners from 'unicode-animations'
import { artWidth, caduceus, CADUCEUS_WIDTH, logo, LOGO_WIDTH } from '../banner.js'
import { flat } from '../lib/text.js'
import type { Theme } from '../theme.js'
import type { PanelSection, SessionInfo } from '../types.js'
const LOADER_TICK_MS = 120
function InlineLoader({ label, t }: { label: string; t: Theme }) {
const [tick, setTick] = useState(0)
const spinner = unicodeSpinners.braille
const frame = spinner.frames[tick % spinner.frames.length] ?? '⠋'
useEffect(() => {
const id = setInterval(() => setTick(n => n + 1), Math.max(LOADER_TICK_MS, spinner.interval))
return () => clearInterval(id)
}, [spinner.interval])
return (
<Text color={t.color.muted} wrap="truncate">
<Text color={t.color.accent}>{frame}</Text> {label}
</Text>
)
}
export function ArtLines({ lines }: { lines: [string, string][] }) {
return (
<>
@@ -67,6 +89,7 @@ export function SessionPanel({ info, sid, t }: SessionPanelProps) {
const entries = Object.entries(data).sort()
const shown = entries.slice(0, max)
const overflow = entries.length - max
const skeleton = info.lazy && entries.length === 0
return (
<Box flexDirection="column" marginTop={1}>
@@ -74,12 +97,16 @@ export function SessionPanel({ info, sid, t }: SessionPanelProps) {
Available {title}
</Text>
{shown.map(([k, vs]) => (
<Text key={k} wrap="truncate">
<Text color={t.color.muted}>{strip(k)}: </Text>
<Text color={t.color.text}>{truncLine(strip(k) + ': ', vs)}</Text>
</Text>
))}
{skeleton ? (
<InlineLoader label={title === 'Tools' ? 'discovering tools' : 'scanning skills'} t={t} />
) : (
shown.map(([k, vs]) => (
<Text key={k} wrap="truncate">
<Text color={t.color.muted}>{strip(k)}: </Text>
<Text color={t.color.text}>{truncLine(strip(k) + ': ', vs)}</Text>
</Text>
))
)}
{overflow > 0 && (
<Text color={t.color.muted}>

View file

@@ -1,5 +1,3 @@
import { evictInkCaches } from '@hermes/ink'
import { type HeapDumpResult, performHeapDump } from './memory.js'
export type MemoryLevel = 'critical' | 'high' | 'normal'
@@ -20,6 +18,40 @@ export interface MemoryMonitorOptions {
const GB = 1024 ** 3
// Deferred @hermes/ink import: loading `@hermes/ink` at module top-level
// pulls the full ~414KB Ink bundle (React, renderer, components, hooks) onto
// the critical path before the Python gateway can even be spawned. That
// serialised roughly 150ms of Node work in front of gw.start() on every
// cold `hermes --tui` launch.
//
// evictInkCaches only runs inside `tick()`, which fires on a 10s timer and
// only when heap pressure crosses the high-water mark — by then Ink has
// long since been loaded by the app entry. This dynamic import is a no-op
// on the hot path (module is already in the ESM cache); when a startup
// spike somehow trips the threshold before the app registers its own Ink
// import, we pay the load cost exactly once, inside the tick that needs it.
let _evictInkCaches: ((level: 'all' | 'half') => unknown) | null = null
let _evictInkCachesPromise: Promise<(level: 'all' | 'half') => unknown> | null = null
async function _ensureEvictInkCaches(): Promise<(level: 'all' | 'half') => unknown> {
if (_evictInkCaches) {
return _evictInkCaches
}
_evictInkCachesPromise ??= import('@hermes/ink')
.then(mod => {
_evictInkCaches = mod.evictInkCaches as (level: 'all' | 'half') => unknown
return _evictInkCaches
})
.catch(err => {
_evictInkCachesPromise = null
throw err
})
return _evictInkCachesPromise
}
export function startMemoryMonitor({
criticalBytes = 2.5 * GB,
highBytes = 1.5 * GB,
@@ -28,29 +60,45 @@
onHigh
}: MemoryMonitorOptions = {}): () => void {
const dumped = new Set<Exclude<MemoryLevel, 'normal'>>()
const inFlight = new Set<Exclude<MemoryLevel, 'normal'>>()
const tick = async () => {
const { heapUsed, rss } = process.memoryUsage()
const level: MemoryLevel = heapUsed >= criticalBytes ? 'critical' : heapUsed >= highBytes ? 'high' : 'normal'
if (level === 'normal') {
return void dumped.clear()
}
if (dumped.has(level)) {
dumped.clear()
return
}
if (dumped.has(level) || inFlight.has(level)) {
return
}
inFlight.add(level)
// Prune Ink content caches before dump/exit — half on 'high' (recoverable),
// full on 'critical' (post-dump RSS reduction, keeps user running).
evictInkCaches(level === 'critical' ? 'all' : 'half')
// Deferred import keeps `@hermes/ink` off the cold-start critical path;
// by the time a tick fires 10s after launch the app has already loaded
// the same module, so this resolves instantly from the ESM cache.
try {
try {
const evictInkCaches = await _ensureEvictInkCaches()
evictInkCaches(level === 'critical' ? 'all' : 'half')
} catch {
// Best-effort: if the dynamic import fails for any reason we still
// continue to the heap dump below so the user gets diagnostics.
}
dumped.add(level)
const dump = await performHeapDump(level === 'critical' ? 'auto-critical' : 'auto-high').catch(() => null)
dumped.add(level)
const dump = await performHeapDump(level === 'critical' ? 'auto-critical' : 'auto-high').catch(() => null)
const snap: MemorySnapshot = { heapUsed, level, rss }
const snap: MemorySnapshot = { heapUsed, level, rss }
;(level === 'critical' ? onCritical : onHigh)?.(snap, dump)
;(level === 'critical' ? onCritical : onHigh)?.(snap, dump)
} finally {
inFlight.delete(level)
}
}
const handle = setInterval(() => void tick(), intervalMs)

View file

@@ -143,11 +143,12 @@ export interface McpServerStatus {
export interface SessionInfo {
cwd?: string
fast?: boolean
lazy?: boolean
mcp_servers?: McpServerStatus[]
model: string
reasoning_effort?: string
service_tier?: string
release_date?: string
service_tier?: string
skills: Record<string, string[]>
tools: Record<string, string[]>
update_behind?: number | null