diff --git a/hermes_cli/mcp_startup.py b/hermes_cli/mcp_startup.py index e4d34c75e06..ad9996e65b9 100644 --- a/hermes_cli/mcp_startup.py +++ b/hermes_cli/mcp_startup.py @@ -98,3 +98,33 @@ def wait_for_mcp_discovery(timeout: "float | None" = None) -> None: if thread is None or not thread.is_alive(): return thread.join(timeout=_resolve_discovery_timeout(timeout)) + + +def mcp_discovery_in_flight() -> bool: + """Return True if THIS module's background discovery thread is still running. + + Mirrors ``tui_gateway.entry.mcp_discovery_in_flight`` for the surfaces that + start discovery through ``start_background_mcp_discovery`` here (the desktop + app + dashboard WebSocket sidecar via ``tui_gateway/ws.py``, and + ``hermes dashboard``). Those processes populate THIS module's + ``_mcp_discovery_thread``, not ``tui_gateway.entry``'s, so the late-refresh + scheduler must consult both to decide whether a slow server's tools are + still pending (see #51587). + """ + thread = _mcp_discovery_thread + return thread is not None and thread.is_alive() + + +def join_mcp_discovery(timeout: "float | None" = None) -> bool: + """Block until THIS module's background discovery finishes, up to ``timeout``. + + Returns True if discovery has completed (thread absent or no longer alive), + False if it is still running after the timeout. Unlike + ``wait_for_mcp_discovery`` this accepts an unbounded/long wait and reports + the outcome, for the off-critical-path late-refresh waiter. + """ + thread = _mcp_discovery_thread + if thread is None: + return True + thread.join(timeout=timeout) + return not thread.is_alive() diff --git a/tests/tui_gateway/test_mcp_late_refresh_thread_owner.py b/tests/tui_gateway/test_mcp_late_refresh_thread_owner.py new file mode 100644 index 00000000000..b793b9fd048 --- /dev/null +++ b/tests/tui_gateway/test_mcp_late_refresh_thread_owner.py @@ -0,0 +1,119 @@ +"""Regression test for issue #51587. + +MCP tools connected/enabled but never surfaced into the agent's session +toolset on the desktop app + dashboard WebUI. + +Root cause: there are two independent background MCP discovery thread owners +by surface: + + * ``tui_gateway.entry`` — the stdio ``hermes --tui`` path. + * ``hermes_cli.mcp_startup`` — the desktop app + dashboard WebSocket sidecar + (``tui_gateway/ws.py``) and ``hermes dashboard``. + +The late-refresh scheduler (``tui_gateway.server._schedule_mcp_late_refresh``) +gates on ``tui_gateway.entry.mcp_discovery_in_flight()``. Before the fix that +function read ONLY ``tui_gateway.entry._mcp_discovery_thread``. On the +desktop/dashboard surfaces that thread is ``None`` (the thread lives on +``hermes_cli.mcp_startup``), so the scheduler bailed immediately and a slow MCP +server's tools never surfaced for the whole session — even after a container +restart. The fix makes ``mcp_discovery_in_flight`` / ``join_mcp_discovery`` +consult BOTH thread owners. +""" + +import threading + +import pytest + +import hermes_cli.mcp_startup as startup +import tui_gateway.entry as entry + + +@pytest.fixture +def clean_discovery_globals(): + """Snapshot and restore both modules' discovery-thread globals.""" + saved_entry = entry._mcp_discovery_thread + saved_startup = startup._mcp_discovery_thread + entry._mcp_discovery_thread = None + startup._mcp_discovery_thread = None + try: + yield + finally: + entry._mcp_discovery_thread = saved_entry + startup._mcp_discovery_thread = saved_startup + + +def _alive_thread(stop: threading.Event) -> threading.Thread: + t = threading.Thread(target=lambda: stop.wait(5.0), daemon=True) + t.start() + return t + + +def test_entry_in_flight_sees_startup_thread(clean_discovery_globals): + """Desktop/dashboard surface: discovery thread lives on hermes_cli.mcp_startup. + + The entry-level in-flight check must report True so the late-refresh + scheduler does not bail (the #51587 bug). + """ + stop = threading.Event() + startup._mcp_discovery_thread = _alive_thread(stop) + try: + # Entry's own thread is None, but the startup thread is alive. + assert entry._mcp_discovery_thread is None + assert entry.mcp_discovery_in_flight() is True + finally: + stop.set() + startup._mcp_discovery_thread.join(timeout=2.0) + + # After the thread exits, neither owner is in flight. + assert entry.mcp_discovery_in_flight() is False + + +def test_entry_join_waits_on_startup_thread(clean_discovery_globals): + """join_mcp_discovery must report not-done while the startup thread runs.""" + stop = threading.Event() + t = _alive_thread(stop) + startup._mcp_discovery_thread = t + + assert entry.join_mcp_discovery(timeout=0.1) is False + + stop.set() + t.join(timeout=2.0) + assert entry.join_mcp_discovery(timeout=2.0) is True + + +def test_entry_in_flight_still_sees_own_thread(clean_discovery_globals): + """stdio TUI surface: discovery thread lives on tui_gateway.entry (unchanged).""" + stop = threading.Event() + entry._mcp_discovery_thread = _alive_thread(stop) + try: + assert startup._mcp_discovery_thread is None + assert entry.mcp_discovery_in_flight() is True + finally: + stop.set() + entry._mcp_discovery_thread.join(timeout=2.0) + + assert entry.mcp_discovery_in_flight() is False + + +def test_no_mcp_threads_not_in_flight(clean_discovery_globals): + """No discovery anywhere → not in flight, join reports done immediately.""" + assert entry.mcp_discovery_in_flight() is False + assert entry.join_mcp_discovery(timeout=0.1) is True + + +def test_startup_module_exposes_in_flight_helpers(clean_discovery_globals): + """hermes_cli.mcp_startup gains the in-flight/join helpers entry delegates to.""" + assert startup.mcp_discovery_in_flight() is False + assert startup.join_mcp_discovery(timeout=0.1) is True + + stop = threading.Event() + t = _alive_thread(stop) + startup._mcp_discovery_thread = t + try: + assert startup.mcp_discovery_in_flight() is True + assert startup.join_mcp_discovery(timeout=0.1) is False + finally: + stop.set() + t.join(timeout=2.0) + assert startup.mcp_discovery_in_flight() is False + assert startup.join_mcp_discovery(timeout=2.0) is True diff --git a/tui_gateway/entry.py b/tui_gateway/entry.py index 0a39e69766e..c738330f6dc 100644 --- a/tui_gateway/entry.py +++ b/tui_gateway/entry.py @@ -233,30 +233,61 @@ def wait_for_mcp_discovery(timeout: "float | None" = None) -> None: def mcp_discovery_in_flight() -> bool: - """Return True if the background MCP discovery thread is still running. + """Return True if ANY background MCP discovery thread is still running. Used by the agent-build path to decide whether to schedule a late tool snapshot refresh: if discovery didn't land within the bounded ``wait_for_mcp_discovery`` join, the agent was built without those tools and the banner/tool count will be stale until they arrive. + + There are two independent discovery-thread owners by surface: the stdio + ``hermes --tui`` path spawns ITS thread here (``_mcp_discovery_thread``), + while the desktop app + dashboard WebSocket sidecar (``tui_gateway/ws.py``) + and ``hermes dashboard`` spawn theirs via + ``hermes_cli.mcp_startup.start_background_mcp_discovery``. The late-refresh + scheduler imports this function regardless of surface, so it MUST consult + both — checking only the entry thread left the desktop/dashboard surfaces + with no late refresh, so a slow MCP server's tools never surfaced for the + whole session (#51587). """ thread = _mcp_discovery_thread - return thread is not None and thread.is_alive() + if thread is not None and thread.is_alive(): + return True + try: + from hermes_cli.mcp_startup import ( + mcp_discovery_in_flight as _startup_in_flight, + ) + + return _startup_in_flight() + except Exception: + return False def join_mcp_discovery(timeout: float | None = None) -> bool: """Block until background MCP discovery finishes, up to ``timeout`` seconds. - Returns True if discovery has completed (thread absent or no longer alive), - False if it is still running after the timeout. Unlike + Returns True if discovery has completed (both thread owners absent or no + longer alive), False if either is still running after the timeout. Unlike ``wait_for_mcp_discovery`` this accepts an unbounded/long wait and reports the outcome, for the off-critical-path late-refresh waiter. + + Joins both discovery-thread owners (see ``mcp_discovery_in_flight``): the + entry thread first, then the ``hermes_cli.mcp_startup`` thread used by the + desktop/dashboard surfaces. ``timeout`` bounds EACH join, mirroring the + pre-#51587 single-owner behavior for the entry thread. """ + entry_done = True thread = _mcp_discovery_thread - if thread is None: - return True - thread.join(timeout=timeout) - return not thread.is_alive() + if thread is not None: + thread.join(timeout=timeout) + entry_done = not thread.is_alive() + try: + from hermes_cli.mcp_startup import join_mcp_discovery as _startup_join + + startup_done = _startup_join(timeout=timeout) + except Exception: + startup_done = True + return entry_done and startup_done def main():