mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-23 10:42:00 +00:00
fix(web_server): use run_in_executor for gateway pre-warm and drain-timeout
Fixes a regression introduced by the prior approach (synchronous import hermes_cli.gateway inside _lifespan) that caused a new failure mode: the blocking import stalled the asyncio event loop before uvicorn could bind its port, pushing HERMES_DASHBOARD_READY past the desktop shell's 45 s announcement deadline and triggering a respawn loop that accumulated orphaned backend processes. Two-part fix: _lifespan: replace the blocking import with a fire-and-forget run_in_executor call (_warm_gateway_module). The import runs in a worker thread while the server socket is already open, so HERMES_DASHBOARD_READY fires without delay. get_status: replace the inline lazy import with await run_in_executor(None, _resolve_restart_drain_timeout). This is the root fix for the original 15 s socket-timeout: the blocking .pyc-compilation + Defender scan is offloaded to a thread, keeping the event loop free for every /api/status probe. After the first call the module is in sys.modules and the executor returns in microseconds. Both helpers are extracted as module-level sync functions so they can be unit-tested independently of FastAPI or uvicorn. Closes #50209 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5e3e89cc05
commit
475e81dab4
1 changed files with 33 additions and 13 deletions
|
|
@ -144,6 +144,22 @@ def _start_desktop_cron_ticker(stop_event: "threading.Event", interval: int = 60
|
|||
provider.start(stop_event, interval=interval)
|
||||
|
||||
|
||||
def _warm_gateway_module() -> None:
    """Import ``hermes_cli.gateway`` purely to populate ``sys.modules``.

    Intended to be handed to ``loop.run_in_executor`` so the (potentially
    slow) first import is paid in a worker thread instead of on the asyncio
    event loop.

    This is strictly best-effort: the function never raises.  Because the
    caller does not await the resulting future, an exception raised here
    would otherwise vanish without a trace, so failures are recorded at
    DEBUG level (with traceback) instead of being silently discarded.
    """
    try:
        import hermes_cli.gateway  # noqa: F401
    except Exception:
        # Function-scope import keeps this helper self-contained; ``logging``
        # is stdlib and already cached in any running server process.
        import logging

        logging.getLogger(__name__).debug(
            "Pre-warm import of hermes_cli.gateway failed",
            exc_info=True,
        )
|
||||
|
||||
|
||||
def _resolve_restart_drain_timeout() -> float:
    """Return the gateway restart drain timeout.

    Delegates to ``hermes_cli.gateway._get_restart_drain_timeout``.  If that
    import (or anything inside the resolver call) raises ``ImportError``, the
    value of ``gateway.restart.DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT`` is
    returned instead.

    This is a plain synchronous function so it can be run in an executor
    thread rather than on the event loop.
    """
    try:
        from hermes_cli.gateway import _get_restart_drain_timeout as resolver

        # Keep the call inside the ``try``: an ImportError raised while the
        # resolver runs must also take the fallback path.
        drain_timeout = resolver()
    except ImportError:
        from gateway.restart import (
            DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT as drain_timeout,
        )
    return drain_timeout
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def _lifespan(app: "FastAPI"):
|
||||
app.state.event_channels = {} # dict[str, set]
|
||||
|
|
@ -154,6 +170,14 @@ async def _lifespan(app: "FastAPI"):
|
|||
# event loop during lifespan startup — see _get_event_state's docstring.
|
||||
app.state.chat_argv_lock = asyncio.Lock()
|
||||
|
||||
# Fire hermes_cli.gateway import into a background thread so the event
|
||||
# loop is not blocked and HERMES_DASHBOARD_READY fires without delay.
|
||||
# On a cold Windows install the module chain triggers .pyc compilation
|
||||
# and Defender real-time scans that can stall the event loop for 15-30s.
|
||||
# Running in an executor means the cost is paid in a worker thread while
|
||||
# the server socket is already open and accepting probes.
|
||||
asyncio.get_event_loop().run_in_executor(None, _warm_gateway_module)
|
||||
|
||||
# Desktop-spawned backends (HERMES_DESKTOP=1) fire cron jobs themselves,
|
||||
# since the app has no gateway running the scheduler. Server `hermes
|
||||
# dashboard` is unaffected — it relies on its own gateway.
|
||||
|
|
@ -1855,19 +1879,15 @@ async def get_status(profile: Optional[str] = None):
|
|||
gateway_state=gateway_state,
|
||||
)
|
||||
# Resolved drain timeout (seconds) so NAS can size its poll deadline
|
||||
# without out-of-band knowledge. Reuse the single resolver
|
||||
# (HERMES_RESTART_DRAIN_TIMEOUT env → config agent.restart_drain_timeout
|
||||
# → default) rather than re-deriving the precedence chain here.
|
||||
try:
|
||||
from hermes_cli.gateway import _get_restart_drain_timeout
|
||||
|
||||
restart_drain_timeout = _get_restart_drain_timeout()
|
||||
except ImportError:
|
||||
# Resolver moved/renamed — fall back to the real default so the
|
||||
# field stays a numeric poll-deadline hint, never None.
|
||||
from gateway.restart import DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
|
||||
|
||||
restart_drain_timeout = DEFAULT_GATEWAY_RESTART_DRAIN_TIMEOUT
|
||||
# without out-of-band knowledge. Offload to a thread: on a cold
|
||||
# Windows install the first import of hermes_cli.gateway blocks the
|
||||
# asyncio event loop for 15-30s (.pyc compilation + Defender scans),
|
||||
# exceeding the desktop handshake's 15s socket timeout. After the
|
||||
# first call the module is in sys.modules and run_in_executor returns
|
||||
# in microseconds.
|
||||
restart_drain_timeout = await asyncio.get_running_loop().run_in_executor(
|
||||
None, _resolve_restart_drain_timeout
|
||||
)
|
||||
|
||||
# Dashboard auth gate (Phase 7): surface whether the gate is engaged
|
||||
# and which providers are registered so ``hermes status`` and the
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue