mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(mcp): late-refresh must see desktop/dashboard discovery thread owner (#55514)
MCP tools connected and enabled but never surfaced into the agent's session toolset on the desktop app + dashboard WebUI (#51587). There are two independent background MCP discovery thread owners by surface: tui_gateway.entry (stdio 'hermes --tui') and hermes_cli.mcp_startup (desktop app + dashboard WS sidecar via tui_gateway/ws.py, and 'hermes dashboard'). The late-refresh scheduler gates on tui_gateway.entry.mcp_discovery_in_flight(), which read ONLY the entry thread global. On the desktop/dashboard surfaces that global is None, so a server slower than the bounded build-time wait never triggered a late refresh and its tools stayed invisible for the whole session. Make mcp_discovery_in_flight() / join_mcp_discovery() consult BOTH thread owners. Adds the matching in-flight/join helpers to hermes_cli.mcp_startup and has tui_gateway.entry delegate to them as a second owner.
This commit is contained in:
parent
fa3ab2ffd0
commit
3f19df2a5b
3 changed files with 188 additions and 8 deletions
|
|
@ -98,3 +98,33 @@ def wait_for_mcp_discovery(timeout: "float | None" = None) -> None:
|
|||
if thread is None or not thread.is_alive():
|
||||
return
|
||||
thread.join(timeout=_resolve_discovery_timeout(timeout))
|
||||
|
||||
|
||||
def mcp_discovery_in_flight() -> bool:
|
||||
"""Return True if THIS module's background discovery thread is still running.
|
||||
|
||||
Mirrors ``tui_gateway.entry.mcp_discovery_in_flight`` for the surfaces that
|
||||
start discovery through ``start_background_mcp_discovery`` here (the desktop
|
||||
app + dashboard WebSocket sidecar via ``tui_gateway/ws.py``, and
|
||||
``hermes dashboard``). Those processes populate THIS module's
|
||||
``_mcp_discovery_thread``, not ``tui_gateway.entry``'s, so the late-refresh
|
||||
scheduler must consult both to decide whether a slow server's tools are
|
||||
still pending (see #51587).
|
||||
"""
|
||||
thread = _mcp_discovery_thread
|
||||
return thread is not None and thread.is_alive()
|
||||
|
||||
|
||||
def join_mcp_discovery(timeout: "float | None" = None) -> bool:
|
||||
"""Block until THIS module's background discovery finishes, up to ``timeout``.
|
||||
|
||||
Returns True if discovery has completed (thread absent or no longer alive),
|
||||
False if it is still running after the timeout. Unlike
|
||||
``wait_for_mcp_discovery`` this accepts an unbounded/long wait and reports
|
||||
the outcome, for the off-critical-path late-refresh waiter.
|
||||
"""
|
||||
thread = _mcp_discovery_thread
|
||||
if thread is None:
|
||||
return True
|
||||
thread.join(timeout=timeout)
|
||||
return not thread.is_alive()
|
||||
|
|
|
|||
119
tests/tui_gateway/test_mcp_late_refresh_thread_owner.py
Normal file
119
tests/tui_gateway/test_mcp_late_refresh_thread_owner.py
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
"""Regression test for issue #51587.
|
||||
|
||||
MCP tools connected/enabled but never surfaced into the agent's session
|
||||
toolset on the desktop app + dashboard WebUI.
|
||||
|
||||
Root cause: there are two independent background MCP discovery thread owners
|
||||
by surface:
|
||||
|
||||
* ``tui_gateway.entry`` — the stdio ``hermes --tui`` path.
|
||||
* ``hermes_cli.mcp_startup`` — the desktop app + dashboard WebSocket sidecar
|
||||
(``tui_gateway/ws.py``) and ``hermes dashboard``.
|
||||
|
||||
The late-refresh scheduler (``tui_gateway.server._schedule_mcp_late_refresh``)
|
||||
gates on ``tui_gateway.entry.mcp_discovery_in_flight()``. Before the fix that
|
||||
function read ONLY ``tui_gateway.entry._mcp_discovery_thread``. On the
|
||||
desktop/dashboard surfaces that thread is ``None`` (the thread lives on
|
||||
``hermes_cli.mcp_startup``), so the scheduler bailed immediately and a slow MCP
|
||||
server's tools never surfaced for the whole session — even after a container
|
||||
restart. The fix makes ``mcp_discovery_in_flight`` / ``join_mcp_discovery``
|
||||
consult BOTH thread owners.
|
||||
"""
|
||||
|
||||
import threading
|
||||
|
||||
import pytest
|
||||
|
||||
import hermes_cli.mcp_startup as startup
|
||||
import tui_gateway.entry as entry
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def clean_discovery_globals():
|
||||
"""Snapshot and restore both modules' discovery-thread globals."""
|
||||
saved_entry = entry._mcp_discovery_thread
|
||||
saved_startup = startup._mcp_discovery_thread
|
||||
entry._mcp_discovery_thread = None
|
||||
startup._mcp_discovery_thread = None
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
entry._mcp_discovery_thread = saved_entry
|
||||
startup._mcp_discovery_thread = saved_startup
|
||||
|
||||
|
||||
def _alive_thread(stop: threading.Event) -> threading.Thread:
|
||||
t = threading.Thread(target=lambda: stop.wait(5.0), daemon=True)
|
||||
t.start()
|
||||
return t
|
||||
|
||||
|
||||
def test_entry_in_flight_sees_startup_thread(clean_discovery_globals):
|
||||
"""Desktop/dashboard surface: discovery thread lives on hermes_cli.mcp_startup.
|
||||
|
||||
The entry-level in-flight check must report True so the late-refresh
|
||||
scheduler does not bail (the #51587 bug).
|
||||
"""
|
||||
stop = threading.Event()
|
||||
startup._mcp_discovery_thread = _alive_thread(stop)
|
||||
try:
|
||||
# Entry's own thread is None, but the startup thread is alive.
|
||||
assert entry._mcp_discovery_thread is None
|
||||
assert entry.mcp_discovery_in_flight() is True
|
||||
finally:
|
||||
stop.set()
|
||||
startup._mcp_discovery_thread.join(timeout=2.0)
|
||||
|
||||
# After the thread exits, neither owner is in flight.
|
||||
assert entry.mcp_discovery_in_flight() is False
|
||||
|
||||
|
||||
def test_entry_join_waits_on_startup_thread(clean_discovery_globals):
|
||||
"""join_mcp_discovery must report not-done while the startup thread runs."""
|
||||
stop = threading.Event()
|
||||
t = _alive_thread(stop)
|
||||
startup._mcp_discovery_thread = t
|
||||
|
||||
assert entry.join_mcp_discovery(timeout=0.1) is False
|
||||
|
||||
stop.set()
|
||||
t.join(timeout=2.0)
|
||||
assert entry.join_mcp_discovery(timeout=2.0) is True
|
||||
|
||||
|
||||
def test_entry_in_flight_still_sees_own_thread(clean_discovery_globals):
|
||||
"""stdio TUI surface: discovery thread lives on tui_gateway.entry (unchanged)."""
|
||||
stop = threading.Event()
|
||||
entry._mcp_discovery_thread = _alive_thread(stop)
|
||||
try:
|
||||
assert startup._mcp_discovery_thread is None
|
||||
assert entry.mcp_discovery_in_flight() is True
|
||||
finally:
|
||||
stop.set()
|
||||
entry._mcp_discovery_thread.join(timeout=2.0)
|
||||
|
||||
assert entry.mcp_discovery_in_flight() is False
|
||||
|
||||
|
||||
def test_no_mcp_threads_not_in_flight(clean_discovery_globals):
|
||||
"""No discovery anywhere → not in flight, join reports done immediately."""
|
||||
assert entry.mcp_discovery_in_flight() is False
|
||||
assert entry.join_mcp_discovery(timeout=0.1) is True
|
||||
|
||||
|
||||
def test_startup_module_exposes_in_flight_helpers(clean_discovery_globals):
|
||||
"""hermes_cli.mcp_startup gains the in-flight/join helpers entry delegates to."""
|
||||
assert startup.mcp_discovery_in_flight() is False
|
||||
assert startup.join_mcp_discovery(timeout=0.1) is True
|
||||
|
||||
stop = threading.Event()
|
||||
t = _alive_thread(stop)
|
||||
startup._mcp_discovery_thread = t
|
||||
try:
|
||||
assert startup.mcp_discovery_in_flight() is True
|
||||
assert startup.join_mcp_discovery(timeout=0.1) is False
|
||||
finally:
|
||||
stop.set()
|
||||
t.join(timeout=2.0)
|
||||
assert startup.mcp_discovery_in_flight() is False
|
||||
assert startup.join_mcp_discovery(timeout=2.0) is True
|
||||
|
|
@ -233,30 +233,61 @@ def wait_for_mcp_discovery(timeout: "float | None" = None) -> None:
|
|||
|
||||
|
||||
def mcp_discovery_in_flight() -> bool:
|
||||
"""Return True if the background MCP discovery thread is still running.
|
||||
"""Return True if ANY background MCP discovery thread is still running.
|
||||
|
||||
Used by the agent-build path to decide whether to schedule a late tool
|
||||
snapshot refresh: if discovery didn't land within the bounded
|
||||
``wait_for_mcp_discovery`` join, the agent was built without those tools
|
||||
and the banner/tool count will be stale until they arrive.
|
||||
|
||||
There are two independent discovery-thread owners by surface: the stdio
|
||||
``hermes --tui`` path spawns ITS thread here (``_mcp_discovery_thread``),
|
||||
while the desktop app + dashboard WebSocket sidecar (``tui_gateway/ws.py``)
|
||||
and ``hermes dashboard`` spawn theirs via
|
||||
``hermes_cli.mcp_startup.start_background_mcp_discovery``. The late-refresh
|
||||
scheduler imports this function regardless of surface, so it MUST consult
|
||||
both — checking only the entry thread left the desktop/dashboard surfaces
|
||||
with no late refresh, so a slow MCP server's tools never surfaced for the
|
||||
whole session (#51587).
|
||||
"""
|
||||
thread = _mcp_discovery_thread
|
||||
return thread is not None and thread.is_alive()
|
||||
if thread is not None and thread.is_alive():
|
||||
return True
|
||||
try:
|
||||
from hermes_cli.mcp_startup import (
|
||||
mcp_discovery_in_flight as _startup_in_flight,
|
||||
)
|
||||
|
||||
return _startup_in_flight()
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def join_mcp_discovery(timeout: float | None = None) -> bool:
|
||||
"""Block until background MCP discovery finishes, up to ``timeout`` seconds.
|
||||
|
||||
Returns True if discovery has completed (thread absent or no longer alive),
|
||||
False if it is still running after the timeout. Unlike
|
||||
Returns True if discovery has completed (both thread owners absent or no
|
||||
longer alive), False if either is still running after the timeout. Unlike
|
||||
``wait_for_mcp_discovery`` this accepts an unbounded/long wait and reports
|
||||
the outcome, for the off-critical-path late-refresh waiter.
|
||||
|
||||
Joins both discovery-thread owners (see ``mcp_discovery_in_flight``): the
|
||||
entry thread first, then the ``hermes_cli.mcp_startup`` thread used by the
|
||||
desktop/dashboard surfaces. ``timeout`` bounds EACH join, mirroring the
|
||||
pre-#51587 single-owner behavior for the entry thread.
|
||||
"""
|
||||
entry_done = True
|
||||
thread = _mcp_discovery_thread
|
||||
if thread is None:
|
||||
return True
|
||||
thread.join(timeout=timeout)
|
||||
return not thread.is_alive()
|
||||
if thread is not None:
|
||||
thread.join(timeout=timeout)
|
||||
entry_done = not thread.is_alive()
|
||||
try:
|
||||
from hermes_cli.mcp_startup import join_mcp_discovery as _startup_join
|
||||
|
||||
startup_done = _startup_join(timeout=timeout)
|
||||
except Exception:
|
||||
startup_done = True
|
||||
return entry_done and startup_done
|
||||
|
||||
|
||||
def main():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue