mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(tui-gateway): dispatch slow RPC handlers on a thread pool (#12546)
The stdin-read loop in entry.py calls handle_request() inline, so the five handlers that can block for seconds to minutes (slash.exec, cli.exec, shell.exec, session.resume, session.branch) freeze the dispatcher. While one is running, any inbound RPC — notably approval.respond and session.interrupt — sits unread in the pipe buffer and lands only after the slow handler returns. Route only those five onto a small ThreadPoolExecutor; every other handler stays on the main thread so the fast-path ordering is unchanged and the audit surface stays small. write_json is already _stdout_lock-guarded, so concurrent response writes are safe. Pool size defaults to 4 (overridable via HERMES_TUI_RPC_POOL_WORKERS). - add _LONG_HANDLERS set + ThreadPoolExecutor + atexit shutdown - new dispatch(req) function: pool for long handlers, inline for rest - _run_and_emit wraps pool work in a try/except so a misbehaving handler still surfaces as a JSON-RPC error instead of silently dying in a worker - entry.py swaps handle_request → dispatch - 5 new tests: sync path still inline, long handlers emit via stdout, fast handler not blocked behind slow one, handler exceptions map to error responses, non-long methods always take the sync path Manual repro confirms the fix: shell.exec(sleep 3) + terminal.resize sent back-to-back now returns the resize response at t=0s while the sleep finishes independently at t=3s. Before, both landed together at t=3s. Fixes #12546.
This commit is contained in:
parent
c567adb58a
commit
a6fe5d0872
3 changed files with 132 additions and 2 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import atexit
|
||||
import concurrent.futures
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
|
|
@ -36,6 +37,29 @@ _cfg_cache: dict | None = None
|
|||
_cfg_mtime: float | None = None
|
||||
# Timeout for slash-command workers, overridable via env; floored at 5s and
# defaulted through `or 45` so an empty/unset HERMES_TUI_SLASH_TIMEOUT_S
# can't disable the guard.
_SLASH_WORKER_TIMEOUT_S = max(5.0, float(os.environ.get("HERMES_TUI_SLASH_TIMEOUT_S", "45") or 45))
|
||||
|
||||
# ── Async RPC dispatch (#12546) ──────────────────────────────────────
|
||||
# A handful of handlers block the dispatcher loop in entry.py for seconds
|
||||
# to minutes (slash.exec, cli.exec, shell.exec, session.resume,
|
||||
# session.branch). While they're running, inbound RPCs — notably
|
||||
# approval.respond and session.interrupt — sit unread in the stdin pipe.
|
||||
# We route only those slow handlers onto a small thread pool; everything
|
||||
# else stays on the main thread so ordering stays sane for the fast path.
|
||||
# write_json is already _stdout_lock-guarded, so concurrent response
|
||||
# writes are safe.
|
||||
# RPC methods that may block for seconds to minutes; dispatch() routes
# exactly these onto the worker pool, everything else stays inline.
_LONG_HANDLERS = frozenset({
    "cli.exec",
    "session.branch",
    "session.resume",
    "shell.exec",
    "slash.exec",
})
# Pool size from HERMES_TUI_RPC_POOL_WORKERS (default 4), floored at 2 and
# defaulted through `or 4` so an empty or too-small override can't
# re-serialize the long handlers behind a single worker.
_RPC_POOL_WORKERS = max(2, int(os.environ.get("HERMES_TUI_RPC_POOL_WORKERS", "4") or 4))
_pool = concurrent.futures.ThreadPoolExecutor(
    max_workers=_RPC_POOL_WORKERS,
    thread_name_prefix="tui-rpc",
)
# Best-effort teardown at interpreter exit: don't wait for in-flight
# handlers, and cancel anything still queued.
atexit.register(lambda: _pool.shutdown(wait=False, cancel_futures=True))
|
||||
|
||||
# Reserve real stdout for JSON-RPC only; redirect Python's stdout to stderr
|
||||
# so stray print() from libraries/tools becomes harmless gateway.stderr instead
|
||||
# of corrupting the JSON protocol.
|
||||
|
|
@ -200,6 +224,33 @@ def handle_request(req: dict) -> dict | None:
|
|||
return fn(req.get("id"), req.get("params", {}))
|
||||
|
||||
|
||||
def _run_and_emit(req: dict) -> None:
    """Execute *req*'s handler on a pool worker and emit the response.

    A handler that raises must not die silently inside the worker thread,
    so any exception is translated into a JSON-RPC error response; the
    caller still sees a reply either way.
    """
    response: dict | None
    try:
        response = handle_request(req)
    except Exception as exc:  # worker boundary: surface, don't swallow
        response = _err(req.get("id"), -32000, f"handler error: {exc}")
    if response is None:
        return
    write_json(response)
|
||||
|
||||
|
||||
def dispatch(req: dict) -> dict | None:
    """Route an inbound RPC — long handlers to the pool, everything else inline.

    Sync-dispatched requests get their response returned so the caller
    (entry.py) can write it. Pool-dispatched requests return None here;
    the worker thread writes the response itself.
    """
    method = req.get("method", "")
    if method not in _LONG_HANDLERS:
        return handle_request(req)
    _pool.submit(_run_and_emit, req)
    return None
|
||||
|
||||
|
||||
def _wait_agent(session: dict, rid: str, timeout: float = 30.0) -> dict | None:
|
||||
ready = session.get("agent_ready")
|
||||
if ready is not None and not ready.wait(timeout=timeout):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue