From 475e81dab4d8cd551df332fc4b56ed39ebfac2f7 Mon Sep 17 00:00:00 2001
From: joaomarcos
Date: Sun, 21 Jun 2026 15:30:42 -0300
Subject: [PATCH] fix(web_server): use run_in_executor for gateway pre-warm and drain-timeout

Fixes a regression introduced by the prior approach (synchronous
import hermes_cli.gateway inside _lifespan) that caused a new failure
mode: the blocking import stalled the asyncio event loop before uvicorn
could bind its port, pushing HERMES_DASHBOARD_READY past the desktop
shell's 45 s announcement deadline and triggering a respawn loop that
accumulated orphaned backend processes.

Two-part fix:

_lifespan: replace the blocking import with a fire-and-forget
run_in_executor call (_warm_gateway_module). The import runs in a
worker thread while the server socket is already open, so
HERMES_DASHBOARD_READY fires without delay.

get_status: replace the inline lazy import with await
run_in_executor(None, _resolve_restart_drain_timeout). This is the
root fix for the original 15 s socket-timeout: the blocking
.pyc-compilation + Defender scan is offloaded to a thread, keeping the
event loop free for every /api/status probe. After the first call the
module is in sys.modules and the executor returns in microseconds.

Both helpers are extracted as module-level sync functions so they can
be unit-tested independently of FastAPI or uvicorn.

Closes #50209 Co-Authored-By: Claude Sonnet 4.6 --- hermes_cli/web_server.py | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/hermes_cli/web_server.py b/hermes_cli/web_server.py index 74ea8182533..3049bb45f99 100644 --- a/hermes_cli/web_server.py +++ b/hermes_cli/web_server.py @@ -144,6 +144,22 @@ def _start_desktop_cron_ticker(stop_event: "threading.Event", interval: int = 60 provider.start(stop_event, interval=interval) +def _warm_gateway_module() -> None: + try: + import hermes_cli.gateway # noqa: F401 + except Exception: + pass + + +def _resolve_restart_drain_timeout() -> float: + try: + from hermes_cli.gateway import _get_restart_drain_timeout + return _get_restart_drain_timeout() + except ImportError: + from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT + return DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT + + @asynccontextmanager async def _lifespan(app: "FastAPI"): app.state.event_channels = {} # dict[str, set] @@ -154,6 +170,14 @@ async def _lifespan(app: "FastAPI"): # event loop during lifespan startup — see _get_event_state's docstring. app.state.chat_argv_lock = asyncio.Lock() + # Fire hermes_cli.gateway import into a background thread so the event + # loop is not blocked and HERMES_DASHBOARD_READY fires without delay. + # On a cold Windows install the module chain triggers .pyc compilation + # and Defender real-time scans that can stall the event loop for 15-30s. + # Running in an executor means the cost is paid in a worker thread while + # the server socket is already open and accepting probes. + asyncio.get_event_loop().run_in_executor(None, _warm_gateway_module) + # Desktop-spawned backends (HERMES_DESKTOP=1) fire cron jobs themselves, # since the app has no gateway running the scheduler. Server `hermes # dashboard` is unaffected — it relies on its own gateway. 
@@ -1855,19 +1879,15 @@ async def get_status(profile: Optional[str] = None): gateway_state=gateway_state, ) # Resolved drain timeout (seconds) so NAS can size its poll deadline - # without out-of-band knowledge. Reuse the single resolver - # (HERMES_RESTART_DRAIN_TIMEOUT env → config agent.restart_drain_timeout - # → default) rather than re-deriving the precedence chain here. - try: - from hermes_cli.gateway import _get_restart_drain_timeout - - restart_drain_timeout = _get_restart_drain_timeout() - except ImportError: - # Resolver moved/renamed — fall back to the real default so the - # field stays a numeric poll-deadline hint, never None. - from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT - - restart_drain_timeout = DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT + # without out-of-band knowledge. Offload to a thread: on a cold + # Windows install the first import of hermes_cli.gateway blocks the + # asyncio event loop for 15-30s (.pyc compilation + Defender scans), + # exceeding the desktop handshake's 15s socket timeout. After the + # first call the module is in sys.modules and run_in_executor returns + # in microseconds. + restart_drain_timeout = await asyncio.get_running_loop().run_in_executor( + None, _resolve_restart_drain_timeout + ) # Dashboard auth gate (Phase 7): surface whether the gate is engaged # and which providers are registered so ``hermes status`` and the