Merge pull request #17190 from NousResearch/bb/tui-cold-start-profiling

perf(tui): cut visible cold start ~57% with lazy agent init
2026-05-07 02:51:50 +00:00 · 2026-04-28 22:45:14 -07:00 · 2026-04-28 22:45:14 -07:00 · 5e68503d2f
commit 5e68503d2f
parent fa9383d27b 22cc7492ff
10 changed files with 309 additions and 161 deletions
--- a/tests/tools/test_code_execution.py
+++ b/tests/tools/test_code_execution.py
@ -770,11 +770,19 @@ class TestLoadConfig(unittest.TestCase):
    def test_returns_code_execution_section(self):
        from tools.code_execution_tool import _load_config
-        mock_cli = MagicMock()
+        with patch("hermes_cli.config.read_raw_config",
-        mock_cli.CLI_CONFIG = {"code_execution": {"timeout": 120, "max_tool_calls": 10}}
+                   return_value={"code_execution": {"timeout": 120, "max_tool_calls": 10}}):
        with patch.dict("sys.modules", {"cli": mock_cli}):
            result = _load_config()
-        self.assertIsInstance(result, dict)
+        self.assertEqual(result, {"timeout": 120, "max_tool_calls": 10})
    def test_does_not_import_interactive_cli(self):
        """_load_config must take its data from read_raw_config, not ``cli``.

        A fake ``cli`` module carrying a populated CLI_CONFIG is planted in
        sys.modules while read_raw_config is stubbed to return an empty
        mapping; the expected result is therefore an empty dict.
        """
        from tools.code_execution_tool import _load_config
        fake_cli = MagicMock()
        fake_cli.CLI_CONFIG = {"code_execution": {"timeout": 999}}
        planted_cli = patch.dict("sys.modules", {"cli": fake_cli})
        empty_raw_config = patch("hermes_cli.config.read_raw_config", return_value={})
        with planted_cli, empty_raw_config:
            loaded = _load_config()
        self.assertEqual(loaded, {})
 # ---------------------------------------------------------------------------
--- a/tools/code_execution_tool.py
+++ b/tools/code_execution_tool.py
@ -1309,10 +1309,20 @@ def _kill_process_group(proc, escalate: bool = False):
 def _load_config() -> dict:
-    """Load code_execution config from CLI_CONFIG if available."""
+    """Load code_execution config without importing the interactive CLI.
    This helper is called while building the module-level execute_code schema
    during tool discovery.  Importing ``cli`` here pulls prompt_toolkit/Rich and
    a large chunk of the classic REPL onto every agent startup path, including
    ``hermes --tui`` where it is never used.  Read the lightweight raw config
    instead; the config layer already caches by (mtime, size), and an absent
    key cleanly falls back to DEFAULT_EXECUTION_MODE.
    """
    try:
-        from cli import CLI_CONFIG
+        from hermes_cli.config import read_raw_config
-        return CLI_CONFIG.get("code_execution", {})
+
        cfg = read_raw_config().get("code_execution", {})
        return cfg if isinstance(cfg, dict) else {}
    except Exception:
        return {}
--- a/tui_gateway/entry.py
+++ b/tui_gateway/entry.py
@ -165,11 +165,29 @@ def main():
    # a model_tools.py module-level side effect; moved to explicit
    # startup calls to avoid freezing the gateway's loop on lazy import
    # (#16856).
    #
    # Cold-start guard: importing ``tools.mcp_tool`` transitively pulls the
    # full MCP SDK (mcp, pydantic, httpx, jsonschema, starlette parsers —
    # ~200ms on macOS), which runs on the TUI's critical path before
    # ``gateway.ready`` can be emitted.  The overwhelming majority of users
    # have no ``mcp_servers`` configured, in which case every byte of that
    # import is wasted.  Check the config first (cheap — it's already been
    # loaded once by ``_config_mtime`` elsewhere) and only pay the import
    # cost when there's actually MCP work to do.
    try:
-        from tools.mcp_tool import discover_mcp_tools
+        from hermes_cli.config import read_raw_config
-        discover_mcp_tools()
+        _mcp_servers = (read_raw_config() or {}).get("mcp_servers")
        _has_mcp_servers = isinstance(_mcp_servers, dict) and len(_mcp_servers) > 0
    except Exception:
-        pass
+        # Be conservative: if we can't decide, fall back to the old
        # behaviour and let the discovery path handle its own errors.
        _has_mcp_servers = True
    if _has_mcp_servers:
        try:
            from tools.mcp_tool import discover_mcp_tools
            discover_mcp_tools()
        except Exception:
            pass
    if not write_json({
        "jsonrpc": "2.0",
--- a/tui_gateway/server.py
+++ b/tui_gateway/server.py
@ -465,6 +465,119 @@ def _wait_agent(session: dict, rid: str, timeout: float = 30.0) -> dict | None:
    return _err(rid, 5032, err) if err else None
 def _start_agent_build(sid: str, session: dict) -> None:
    """Start building the real AIAgent for a TUI session, once.
    Classic `hermes` shows the prompt before constructing AIAgent; the TUI used
    to eagerly build it during session.create, making startup feel blocked on
    tool discovery/model metadata even though the composer was visible.  Keep
    the shell responsive by deferring this work until the first prompt (or any
    command that actually needs the agent), while retaining the same ready/error
    event contract for the frontend.

    Args:
        sid: Key of the session in the module-level ``_sessions`` mapping; also
            used as the target of the emitted ``session.info`` / ``error`` /
            ``approval.request`` events.
        session: The session dict.  Reads ``agent_ready`` and ``session_key``;
            writes ``agent_build_lock`` and ``agent_build_started``.

    Returns:
        None.  The build itself runs on a daemon thread; completion (success or
        failure) is signalled by setting ``session["agent_ready"]``.
    """
    ready = session.get("agent_ready")
    # No readiness event on the session means there is nothing to signal, so
    # no build is started.  (presumably a threading.Event — only .is_set() and
    # .set() are used here; confirm at the creation site.)
    if ready is None:
        return
    lock = session.setdefault("agent_build_lock", threading.Lock())
    # Check-and-mark under the lock so that only the first caller spawns the
    # build thread; later callers see the flag (or the set event) and return.
    with lock:
        if ready.is_set() or session.get("agent_build_started"):
            return
        session["agent_build_started"] = True
    key = session["session_key"]
    def _build() -> None:
        # Re-read the session from the registry: if it is already gone, just
        # release any waiters and do nothing else.
        current = _sessions.get(sid)
        if current is None:
            ready.set()
            return
        # Track what this thread allocates so the ``finally`` block can undo it
        # if the session is removed from ``_sessions`` while the build runs.
        worker = None
        notify_registered = False
        try:
            # The session context is only held for the agent construction and
            # is cleared even if ``_make_agent`` raises.
            tokens = _set_session_context(key)
            try:
                agent = _make_agent(sid, key)
            finally:
                _clear_session_context(tokens)
            db = _get_db()
            if db is not None:
                db.create_session(key, source="tui", model=_resolve_model())
                pending_title = (current.get("pending_title") or "").strip()
                if pending_title:
                    try:
                        title_applied = db.set_session_title(key, pending_title)
                        if title_applied:
                            current["pending_title"] = None
                        else:
                            # Not applied: clear the queue only if the stored
                            # title already matches; otherwise keep it queued
                            # and log the mismatch.
                            existing_row = db.get_session(key)
                            existing_title = ((existing_row or {}).get("title") or "").strip()
                            if existing_title == pending_title:
                                current["pending_title"] = None
                            else:
                                logger.info(
                                    "Pending title still queued for session %s (wanted=%r, current=%r)",
                                    sid,
                                    pending_title,
                                    existing_title,
                                )
                    except ValueError as e:
                        # A ValueError drops the queued title so it does not
                        # stay pending; the reason is logged.
                        current["pending_title"] = None
                        logger.info("Dropping pending title for session %s: %s", sid, e)
                    except Exception:
                        # Any other failure is logged and the title stays queued.
                        logger.warning("Failed to apply pending title for session %s", sid, exc_info=True)
            current["agent"] = agent
            # Best-effort: a slash worker that fails to start does not fail the build.
            try:
                worker = _SlashWorker(key, getattr(agent, "model", _resolve_model()))
                current["slash_worker"] = worker
            except Exception:
                pass
            # Best-effort: approval requests are forwarded to the frontend as
            # ``approval.request`` events for this session.
            try:
                from tools.approval import (
                    register_gateway_notify,
                    load_permanent_allowlist,
                )
                register_gateway_notify(key, lambda data: _emit("approval.request", sid, data))
                notify_registered = True
                load_permanent_allowlist()
            except Exception:
                pass
            _wire_callbacks(sid)
            _notify_session_boundary("on_session_reset", key)
            # Assemble the ``session.info`` payload, attaching credential and
            # config warnings when the probes return one.
            info = _session_info(agent)
            warn = _probe_credentials(agent)
            if warn:
                info["credential_warning"] = warn
            cfg_warn = _probe_config_health(_load_cfg())
            if cfg_warn:
                info["config_warning"] = cfg_warn
                logger.warning(cfg_warn)
            _emit("session.info", sid, info)
        except Exception as e:
            # Record the failure on the session and surface it to the frontend.
            current["agent_error"] = str(e)
            _emit("error", sid, {"message": f"agent init failed: {e}"})
        finally:
            # Orphan check: if ``_sessions[sid]`` no longer refers to the dict
            # this thread populated, close the worker and drop the notify
            # registration created above.
            if _sessions.get(sid) is not current:
                if worker is not None:
                    try:
                        worker.close()
                    except Exception:
                        pass
                if notify_registered:
                    try:
                        from tools.approval import unregister_gateway_notify
                        unregister_gateway_notify(key)
                    except Exception:
                        pass
            # Always release waiters, on success, failure, or orphaned session.
            ready.set()
    threading.Thread(target=_build, daemon=True).start()
 def _sess_nowait(params, rid):
    """Look up the session named by ``params["session_id"]`` without waiting.

    Returns ``(session, None)`` when a truthy session is registered, otherwise
    ``(None, error)`` where ``error`` is the 4001 "session not found" reply
    built for request ``rid``.
    """
    session_id = params.get("session_id") or ""
    found = _sessions.get(session_id)
    if not found:
        return (None, _err(rid, 4001, "session not found"))
    return (found, None)
@ -472,7 +585,10 @@ def _sess_nowait(params, rid):
 def _sess(params, rid):
    s, err = _sess_nowait(params, rid)
-    return (None, err) if err else (s, _wait_agent(s, rid))
+    if err:
        return (None, err)
    _start_agent_build(params.get("session_id") or "", s)
    return (s, _wait_agent(s, rid))
 def _normalize_completion_path(path_part: str) -> str:
@ -1627,129 +1743,18 @@ def _(rid, params: dict) -> dict:
        "transport": current_transport() or _stdio_transport,
    }
-    def _build() -> None:
+    # Return the lightweight session immediately so Ink can paint the composer
    # + skeleton panel, then build the real AIAgent just after this response is
    # flushed.  This keeps startup responsive while still hydrating tools/skills
    # without requiring the user to submit a first prompt.
    def _deferred_build() -> None:
        session = _sessions.get(sid)
-        if session is None:
+        if session is not None:
-            # session.close ran before the build thread got scheduled.
+            _start_agent_build(sid, session)
            ready.set()
            return
-        # Track what we allocate so we can clean up if session.close
+    build_timer = threading.Timer(0.05, _deferred_build)
-        # races us to the finish line.  session.close pops _sessions[sid]
+    build_timer.daemon = True
-        # unconditionally and tries to close the slash_worker it finds;
+    build_timer.start()
        # if _build is still mid-construction when close runs, close
        # finds slash_worker=None / notify unregistered and returns
        # cleanly — leaving us, the build thread, to later install the
        # worker + notify on an orphaned session dict.  The finally
        # block below detects the orphan and cleans up instead of
        # leaking a subprocess and a global notify registration.
        worker = None
        notify_registered = False
        try:
            tokens = _set_session_context(key)
            try:
                agent = _make_agent(sid, key)
            finally:
                _clear_session_context(tokens)
            db = _get_db()
            if db is not None:
                db.create_session(key, source="tui", model=_resolve_model())
                pending_title = (session.get("pending_title") or "").strip()
                if pending_title:
                    try:
                        title_applied = db.set_session_title(key, pending_title)
                        if title_applied:
                            session["pending_title"] = None
                        else:
                            existing_row = db.get_session(key)
                            existing_title = (
                                (existing_row or {}).get("title") or ""
                            ).strip()
                            if existing_title == pending_title:
                                session["pending_title"] = None
                            else:
                                logger.info(
                                    "Pending title still queued for session %s (wanted=%r, current=%r)",
                                    sid,
                                    pending_title,
                                    existing_title,
                                )
                    except ValueError as e:
                        # Queued title can become invalid/duplicate between queue time
                        # and DB row creation. Drop the queue and log the reason so
                        # future /title reads don't surface a stuck pending value.
                        session["pending_title"] = None
                        logger.info(
                            "Dropping pending title for session %s: %s",
                            sid,
                            e,
                        )
                    except Exception:
                        logger.warning(
                            "Failed to apply pending title for session %s",
                            sid,
                            exc_info=True,
                        )
            session["agent"] = agent
            try:
                worker = _SlashWorker(key, getattr(agent, "model", _resolve_model()))
                session["slash_worker"] = worker
            except Exception:
                pass
            try:
                from tools.approval import (
                    register_gateway_notify,
                    load_permanent_allowlist,
                )
                register_gateway_notify(
                    key, lambda data: _emit("approval.request", sid, data)
                )
                notify_registered = True
                load_permanent_allowlist()
            except Exception:
                pass
            _wire_callbacks(sid)
            _notify_session_boundary("on_session_reset", key)
            info = _session_info(agent)
            warn = _probe_credentials(agent)
            if warn:
                info["credential_warning"] = warn
            cfg_warn = _probe_config_health(_load_cfg())
            if cfg_warn:
                info["config_warning"] = cfg_warn
                logger.warning(cfg_warn)
            _emit("session.info", sid, info)
        except Exception as e:
            session["agent_error"] = str(e)
            _emit("error", sid, {"message": f"agent init failed: {e}"})
        finally:
            # Orphan check: if session.close raced us and popped
            # _sessions[sid] while we were building, the dict we just
            # populated is unreachable.  Clean up the subprocess and
            # the global notify registration ourselves — session.close
            # couldn't see them at the time it ran.
            if _sessions.get(sid) is not session:
                if worker is not None:
                    try:
                        worker.close()
                    except Exception:
                        pass
                if notify_registered:
                    try:
                        from tools.approval import unregister_gateway_notify
                        unregister_gateway_notify(key)
                    except Exception:
                        pass
            ready.set()
    threading.Thread(target=_build, daemon=True).start()
    return _ok(
        rid,
@ -1760,6 +1765,7 @@ def _(rid, params: dict) -> dict:
                "tools": {},
                "skills": {},
                "cwd": os.getenv("TERMINAL_CWD", os.getcwd()),
                "lazy": True,
            },
        },
    )
@ -1901,7 +1907,7 @@ def _(rid, params: dict) -> dict:
@method("session.title")
 def _(rid, params: dict) -> dict:
-    session, err = _sess(params, rid)
+    session, err = _sess_nowait(params, rid)
    if err:
        return err
    db = _get_db()
@ -1964,13 +1970,16 @@ def _(rid, params: dict) -> dict:
@method("session.usage")
 def _(rid, params: dict) -> dict:
-    session, err = _sess(params, rid)
+    session, err = _sess_nowait(params, rid)
-    return err or _ok(rid, _get_usage(session["agent"]))
+    if err:
        return err
    agent = session.get("agent")
    return _ok(rid, _get_usage(agent) if agent is not None else {"calls": 0, "input": 0, "output": 0, "total": 0})
@method("session.history")
 def _(rid, params: dict) -> dict:
-    session, err = _sess(params, rid)
+    session, err = _sess_nowait(params, rid)
    if err:
        return err
    history = list(session.get("history", []))
@ -2437,13 +2446,31 @@ def _(rid, params: dict) -> dict:
@method("prompt.submit")
 def _(rid, params: dict) -> dict:
    sid, text = params.get("session_id", ""), params.get("text", "")
-    session, err = _sess(params, rid)
+    session, err = _sess_nowait(params, rid)
    if err:
        return err
    with session["history_lock"]:
        if session.get("running"):
            return _err(rid, 4009, "session busy")
        session["running"] = True
    _start_agent_build(sid, session)
    def run_after_agent_ready() -> None:
        err = _wait_agent(session, rid)
        if err:
            _emit("error", sid, {"message": err.get("error", {}).get("message", "agent initialization failed")})
            with session["history_lock"]:
                session["running"] = False
            return
        _run_prompt_submit(rid, sid, session, text)
    threading.Thread(target=run_after_agent_ready, daemon=True).start()
    return _ok(rid, {"status": "streaming"})
 def _run_prompt_submit(rid, sid: str, session: dict, text: Any) -> None:
    with session["history_lock"]:
        history = list(session["history"])
        history_version = int(session.get("history_version", 0))
        images = list(session.get("attached_images", []))
@ -2682,7 +2709,6 @@ def _(rid, params: dict) -> dict:
                session["running"] = False
    threading.Thread(target=run, daemon=True).start()
    return _ok(rid, {"status": "streaming"})
@method("clipboard.paste")
--- a/ui-tui/src/app/useConfigSync.ts
+++ b/ui-tui/src/app/useConfigSync.ts
@ -5,8 +5,7 @@ import type { GatewayClient } from '../gatewayClient.js'
 import type {
  ConfigFullResponse,
  ConfigMtimeResponse,
-  ReloadMcpResponse,
+  ReloadMcpResponse
  VoiceToggleResponse
 } from '../gatewayTypes.js'
 import { asRpcResult } from '../lib/rpc.js'
@ -118,7 +117,11 @@ export function useConfigSync({ gw, setBellOnComplete, setVoiceEnabled, sid }: U
      return
    }
-    quietRpc<VoiceToggleResponse>(gw, 'voice.toggle', { action: 'status' }).then(r => setVoiceEnabled(!!r?.enabled))
+    // Keep startup cheap: voice.toggle status probes optional audio/STT deps and
    // can run long enough to delay prompt.submit on the single stdio RPC pipe.
    // Environment flags are enough to initialize the UI bit; the heavier status
    // check still runs when the user opens /voice.
    setVoiceEnabled(process.env.HERMES_VOICE === '1')
    quietRpc<ConfigMtimeResponse>(gw, 'config.get', { key: 'mtime' }).then(r => {
      mtimeRef.current = Number(r?.mtime ?? 0)
    })
--- a/ui-tui/src/app/useSubmission.ts
+++ b/ui-tui/src/app/useSubmission.ts
@ -126,6 +126,13 @@ export function useSubmission(opts: UseSubmissionOptions) {
        return sys('session not ready yet')
      }
      // Plain prompts are the common path and should not pay an extra RPC
      // before prompt.submit. File-drop detection still runs for absolute,
      // tilde, file://, and explicit relative paths.
      if (!looksLikeSlashCommand(text) && !/(?:^|\s)(?:file:\/\/|~\/|\.?\.\/|\/)[^\s]+/.test(text)) {
        return startSubmit(text, expand(text), showUserMessage)
      }
      gw.request<InputDetectDropResponse>('input.detect_drop', { session_id: sid, text })
        .then(r => {
          if (!r?.matched) {
--- a/ui-tui/src/components/appLayout.tsx
+++ b/ui-tui/src/components/appLayout.tsx
@ -68,7 +68,7 @@ const TranscriptPane = memo(function TranscriptPane({
                <Box flexDirection="column" paddingTop={1}>
                  <Banner t={ui.theme} />
-                  {row.msg.info?.version && <SessionPanel info={row.msg.info} sid={ui.sid} t={ui.theme} />}
+                  {row.msg.info && <SessionPanel info={row.msg.info} sid={ui.sid} t={ui.theme} />}
                </Box>
              ) : row.msg.kind === 'panel' && row.msg.panelData ? (
                <Panel sections={row.msg.panelData.sections} t={ui.theme} title={row.msg.panelData.title} />
--- a/ui-tui/src/components/branding.tsx
+++ b/ui-tui/src/components/branding.tsx
@ -1,10 +1,32 @@
 import { Box, Text, useStdout } from '@hermes/ink'
 import { useEffect, useState } from 'react'
 import unicodeSpinners from 'unicode-animations'
 import { artWidth, caduceus, CADUCEUS_WIDTH, logo, LOGO_WIDTH } from '../banner.js'
 import { flat } from '../lib/text.js'
 import type { Theme } from '../theme.js'
 import type { PanelSection, SessionInfo } from '../types.js'
 const LOADER_TICK_MS = 120
 // Single-line animated placeholder: a braille spinner frame in the accent
 // colour followed by a muted, truncated label.
 function InlineLoader({ label, t }: { label: string; t: Theme }) {
  const [tick, setTick] = useState(0)
  const { frames, interval } = unicodeSpinners.braille

  useEffect(() => {
    // Never tick faster than LOADER_TICK_MS, even if the spinner asks for it.
    const period = Math.max(LOADER_TICK_MS, interval)
    const timer = setInterval(() => setTick(n => n + 1), period)

    return () => clearInterval(timer)
  }, [interval])

  // Wrap around the frame list; fall back to a fixed glyph if the slot is empty.
  const frame = frames[tick % frames.length] ?? '⠋'

  return (
    <Text color={t.color.muted} wrap="truncate">
      <Text color={t.color.accent}>{frame}</Text> {label}
    </Text>
  )
 }
 export function ArtLines({ lines }: { lines: [string, string][] }) {
  return (
    <>
@ -67,6 +89,7 @@ export function SessionPanel({ info, sid, t }: SessionPanelProps) {
    const entries = Object.entries(data).sort()
    const shown = entries.slice(0, max)
    const overflow = entries.length - max
    const skeleton = info.lazy && entries.length === 0
    return (
      <Box flexDirection="column" marginTop={1}>
@ -74,12 +97,16 @@ export function SessionPanel({ info, sid, t }: SessionPanelProps) {
          Available {title}
        </Text>
-        {shown.map(([k, vs]) => (
+        {skeleton ? (
-          <Text key={k} wrap="truncate">
+          <InlineLoader label={title === 'Tools' ? 'discovering tools' : 'scanning skills'} t={t} />
-            <Text color={t.color.muted}>{strip(k)}: </Text>
+        ) : (
-            <Text color={t.color.text}>{truncLine(strip(k) + ': ', vs)}</Text>
+          shown.map(([k, vs]) => (
-          </Text>
+            <Text key={k} wrap="truncate">
-        ))}
+              <Text color={t.color.muted}>{strip(k)}: </Text>
              <Text color={t.color.text}>{truncLine(strip(k) + ': ', vs)}</Text>
            </Text>
          ))
        )}
        {overflow > 0 && (
          <Text color={t.color.muted}>
--- a/ui-tui/src/lib/memoryMonitor.ts
+++ b/ui-tui/src/lib/memoryMonitor.ts
@ -1,5 +1,3 @@
 import { evictInkCaches } from '@hermes/ink'
 import { type HeapDumpResult, performHeapDump } from './memory.js'
 export type MemoryLevel = 'critical' | 'high' | 'normal'
@ -20,6 +18,40 @@ export interface MemoryMonitorOptions {
 const GB = 1024 ** 3
 // Deferred @hermes/ink import: loading `@hermes/ink` at module top-level
 // pulls the full ~414KB Ink bundle (React, renderer, components, hooks) onto
 // the critical path before the Python gateway can even be spawned. That
 // serialised roughly 150ms of Node work in front of gw.start() on every
 // cold `hermes --tui` launch.
 //
 // evictInkCaches only runs inside `tick()`, which fires on a 10s timer and
 // only when heap pressure crosses the high-water mark — by then Ink has
 // long since been loaded by the app entry. This dynamic import is a no-op
 // on the hot path (module is already in the ESM cache); when a startup
 // spike somehow trips the threshold before the app registers its own Ink
 // import, we pay the load cost exactly once, inside the tick that needs it.
 let _evictInkCaches: ((level: 'all' | 'half') => unknown) | null = null
 let _evictInkCachesPromise: Promise<(level: 'all' | 'half') => unknown> | null = null
 /**
  * Resolve `evictInkCaches` from `@hermes/ink` via a dynamic import, loading
  * the module at most once per successful attempt.
  *
  * Returns the cached function when a previous call already resolved it.
  * Otherwise all concurrent callers share one in-flight import promise. If
  * that import rejects, the shared promise is cleared so a later call can
  * retry, and the error is rethrown to the caller.
  */
 async function _ensureEvictInkCaches(): Promise<(level: 'all' | 'half') => unknown> {
  // Fast path: a previous call already resolved and stored the function.
  if (_evictInkCaches) {
    return _evictInkCaches
  }
  // `??=` starts the import only when no attempt is currently in flight.
  _evictInkCachesPromise ??= import('@hermes/ink')
    .then(mod => {
      _evictInkCaches = mod.evictInkCaches as (level: 'all' | 'half') => unknown
      return _evictInkCaches
    })
    .catch(err => {
      // Forget the failed attempt so the next call re-imports instead of
      // receiving the same rejected promise.
      _evictInkCachesPromise = null
      throw err
    })
  return _evictInkCachesPromise
 }
 export function startMemoryMonitor({
  criticalBytes = 2.5 * GB,
  highBytes = 1.5 * GB,
@ -28,29 +60,45 @@ export function startMemoryMonitor({
  onHigh
 }: MemoryMonitorOptions = {}): () => void {
  const dumped = new Set<Exclude<MemoryLevel, 'normal'>>()
  const inFlight = new Set<Exclude<MemoryLevel, 'normal'>>()
  const tick = async () => {
    const { heapUsed, rss } = process.memoryUsage()
    const level: MemoryLevel = heapUsed >= criticalBytes ? 'critical' : heapUsed >= highBytes ? 'high' : 'normal'
    if (level === 'normal') {
-      return void dumped.clear()
+      dumped.clear()
    }
    if (dumped.has(level)) {
      return
    }
    if (dumped.has(level) || inFlight.has(level)) {
      return
    }
    inFlight.add(level)
    // Prune Ink content caches before dump/exit — half on 'high' (recoverable),
    // full on 'critical' (post-dump RSS reduction, keeps user running).
-    evictInkCaches(level === 'critical' ? 'all' : 'half')
+    // Deferred import keeps `@hermes/ink` off the cold-start critical path;
    // by the time a tick fires 10s after launch the app has already loaded
    // the same module, so this resolves instantly from the ESM cache.
    try {
      try {
        const evictInkCaches = await _ensureEvictInkCaches()
        evictInkCaches(level === 'critical' ? 'all' : 'half')
      } catch {
        // Best-effort: if the dynamic import fails for any reason we still
        // continue to the heap dump below so the user gets diagnostics.
      }
-    dumped.add(level)
+      dumped.add(level)
-    const dump = await performHeapDump(level === 'critical' ? 'auto-critical' : 'auto-high').catch(() => null)
+      const dump = await performHeapDump(level === 'critical' ? 'auto-critical' : 'auto-high').catch(() => null)
      const snap: MemorySnapshot = { heapUsed, level, rss }
-    const snap: MemorySnapshot = { heapUsed, level, rss }
+      ;(level === 'critical' ? onCritical : onHigh)?.(snap, dump)
-
+    } finally {
-    ;(level === 'critical' ? onCritical : onHigh)?.(snap, dump)
+      inFlight.delete(level)
    }
  }
  const handle = setInterval(() => void tick(), intervalMs)
--- a/ui-tui/src/types.ts
+++ b/ui-tui/src/types.ts
@ -143,11 +143,12 @@ export interface McpServerStatus {
 export interface SessionInfo {
  cwd?: string
  fast?: boolean
  lazy?: boolean
  mcp_servers?: McpServerStatus[]
  model: string
  reasoning_effort?: string
  service_tier?: string
  release_date?: string
  service_tier?: string
  skills: Record<string, string[]>
  tools: Record<string, string[]>
  update_behind?: number | null