fix(mcp): late-refresh must see desktop/dashboard discovery thread owner (#55514)

MCP tools connected and enabled but never surfaced into the agent's
session toolset on the desktop app + dashboard WebUI (#51587).

There are two independent background MCP discovery thread owners by
surface: tui_gateway.entry (stdio 'hermes --tui') and hermes_cli.mcp_startup
(desktop app + dashboard WS sidecar via tui_gateway/ws.py, and 'hermes
dashboard'). The late-refresh scheduler gates on
tui_gateway.entry.mcp_discovery_in_flight(), which read ONLY the entry
thread global. On the desktop/dashboard surfaces that global is None, so a
server slower than the bounded build-time wait never triggered a late
refresh and its tools stayed invisible for the whole session.

Make mcp_discovery_in_flight() / join_mcp_discovery() consult BOTH thread
owners. Adds the matching in-flight/join helpers to hermes_cli.mcp_startup
and has tui_gateway.entry delegate to them as a second owner.
This commit is contained in:
Teknium 2026-06-30 02:08:37 -07:00 committed by GitHub
parent fa3ab2ffd0
commit 3f19df2a5b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 188 additions and 8 deletions

View file

@ -98,3 +98,33 @@ def wait_for_mcp_discovery(timeout: "float | None" = None) -> None:
if thread is None or not thread.is_alive():
return
thread.join(timeout=_resolve_discovery_timeout(timeout))
def mcp_discovery_in_flight() -> bool:
"""Return True if THIS module's background discovery thread is still running.
Mirrors ``tui_gateway.entry.mcp_discovery_in_flight`` for the surfaces that
start discovery through ``start_background_mcp_discovery`` here (the desktop
app + dashboard WebSocket sidecar via ``tui_gateway/ws.py``, and
``hermes dashboard``). Those processes populate THIS module's
``_mcp_discovery_thread``, not ``tui_gateway.entry``'s, so the late-refresh
scheduler must consult both to decide whether a slow server's tools are
still pending (see #51587).
"""
thread = _mcp_discovery_thread
return thread is not None and thread.is_alive()
def join_mcp_discovery(timeout: "float | None" = None) -> bool:
"""Block until THIS module's background discovery finishes, up to ``timeout``.
Returns True if discovery has completed (thread absent or no longer alive),
False if it is still running after the timeout. Unlike
``wait_for_mcp_discovery`` this accepts an unbounded/long wait and reports
the outcome, for the off-critical-path late-refresh waiter.
"""
thread = _mcp_discovery_thread
if thread is None:
return True
thread.join(timeout=timeout)
return not thread.is_alive()

View file

@ -0,0 +1,119 @@
"""Regression test for issue #51587.
MCP tools connected/enabled but never surfaced into the agent's session
toolset on the desktop app + dashboard WebUI.
Root cause: there are two independent background MCP discovery thread owners
by surface:
* ``tui_gateway.entry`` the stdio ``hermes --tui`` path.
* ``hermes_cli.mcp_startup`` the desktop app + dashboard WebSocket sidecar
(``tui_gateway/ws.py``) and ``hermes dashboard``.
The late-refresh scheduler (``tui_gateway.server._schedule_mcp_late_refresh``)
gates on ``tui_gateway.entry.mcp_discovery_in_flight()``. Before the fix that
function read ONLY ``tui_gateway.entry._mcp_discovery_thread``. On the
desktop/dashboard surfaces that thread is ``None`` (the thread lives on
``hermes_cli.mcp_startup``), so the scheduler bailed immediately and a slow MCP
server's tools never surfaced for the whole session — even after a container
restart. The fix makes ``mcp_discovery_in_flight`` / ``join_mcp_discovery``
consult BOTH thread owners.
"""
import threading
import pytest
import hermes_cli.mcp_startup as startup
import tui_gateway.entry as entry
@pytest.fixture
def clean_discovery_globals():
"""Snapshot and restore both modules' discovery-thread globals."""
saved_entry = entry._mcp_discovery_thread
saved_startup = startup._mcp_discovery_thread
entry._mcp_discovery_thread = None
startup._mcp_discovery_thread = None
try:
yield
finally:
entry._mcp_discovery_thread = saved_entry
startup._mcp_discovery_thread = saved_startup
def _alive_thread(stop: threading.Event) -> threading.Thread:
t = threading.Thread(target=lambda: stop.wait(5.0), daemon=True)
t.start()
return t
def test_entry_in_flight_sees_startup_thread(clean_discovery_globals):
"""Desktop/dashboard surface: discovery thread lives on hermes_cli.mcp_startup.
The entry-level in-flight check must report True so the late-refresh
scheduler does not bail (the #51587 bug).
"""
stop = threading.Event()
startup._mcp_discovery_thread = _alive_thread(stop)
try:
# Entry's own thread is None, but the startup thread is alive.
assert entry._mcp_discovery_thread is None
assert entry.mcp_discovery_in_flight() is True
finally:
stop.set()
startup._mcp_discovery_thread.join(timeout=2.0)
# After the thread exits, neither owner is in flight.
assert entry.mcp_discovery_in_flight() is False
def test_entry_join_waits_on_startup_thread(clean_discovery_globals):
"""join_mcp_discovery must report not-done while the startup thread runs."""
stop = threading.Event()
t = _alive_thread(stop)
startup._mcp_discovery_thread = t
assert entry.join_mcp_discovery(timeout=0.1) is False
stop.set()
t.join(timeout=2.0)
assert entry.join_mcp_discovery(timeout=2.0) is True
def test_entry_in_flight_still_sees_own_thread(clean_discovery_globals):
"""stdio TUI surface: discovery thread lives on tui_gateway.entry (unchanged)."""
stop = threading.Event()
entry._mcp_discovery_thread = _alive_thread(stop)
try:
assert startup._mcp_discovery_thread is None
assert entry.mcp_discovery_in_flight() is True
finally:
stop.set()
entry._mcp_discovery_thread.join(timeout=2.0)
assert entry.mcp_discovery_in_flight() is False
def test_no_mcp_threads_not_in_flight(clean_discovery_globals):
"""No discovery anywhere → not in flight, join reports done immediately."""
assert entry.mcp_discovery_in_flight() is False
assert entry.join_mcp_discovery(timeout=0.1) is True
def test_startup_module_exposes_in_flight_helpers(clean_discovery_globals):
"""hermes_cli.mcp_startup gains the in-flight/join helpers entry delegates to."""
assert startup.mcp_discovery_in_flight() is False
assert startup.join_mcp_discovery(timeout=0.1) is True
stop = threading.Event()
t = _alive_thread(stop)
startup._mcp_discovery_thread = t
try:
assert startup.mcp_discovery_in_flight() is True
assert startup.join_mcp_discovery(timeout=0.1) is False
finally:
stop.set()
t.join(timeout=2.0)
assert startup.mcp_discovery_in_flight() is False
assert startup.join_mcp_discovery(timeout=2.0) is True

View file

@ -233,30 +233,61 @@ def wait_for_mcp_discovery(timeout: "float | None" = None) -> None:
def mcp_discovery_in_flight() -> bool:
"""Return True if the background MCP discovery thread is still running.
"""Return True if ANY background MCP discovery thread is still running.
Used by the agent-build path to decide whether to schedule a late tool
snapshot refresh: if discovery didn't land within the bounded
``wait_for_mcp_discovery`` join, the agent was built without those tools
and the banner/tool count will be stale until they arrive.
There are two independent discovery-thread owners by surface: the stdio
``hermes --tui`` path spawns ITS thread here (``_mcp_discovery_thread``),
while the desktop app + dashboard WebSocket sidecar (``tui_gateway/ws.py``)
and ``hermes dashboard`` spawn theirs via
``hermes_cli.mcp_startup.start_background_mcp_discovery``. The late-refresh
scheduler imports this function regardless of surface, so it MUST consult
both checking only the entry thread left the desktop/dashboard surfaces
with no late refresh, so a slow MCP server's tools never surfaced for the
whole session (#51587).
"""
thread = _mcp_discovery_thread
return thread is not None and thread.is_alive()
if thread is not None and thread.is_alive():
return True
try:
from hermes_cli.mcp_startup import (
mcp_discovery_in_flight as _startup_in_flight,
)
return _startup_in_flight()
except Exception:
return False
def join_mcp_discovery(timeout: float | None = None) -> bool:
"""Block until background MCP discovery finishes, up to ``timeout`` seconds.
Returns True if discovery has completed (thread absent or no longer alive),
False if it is still running after the timeout. Unlike
Returns True if discovery has completed (both thread owners absent or no
longer alive), False if either is still running after the timeout. Unlike
``wait_for_mcp_discovery`` this accepts an unbounded/long wait and reports
the outcome, for the off-critical-path late-refresh waiter.
Joins both discovery-thread owners (see ``mcp_discovery_in_flight``): the
entry thread first, then the ``hermes_cli.mcp_startup`` thread used by the
desktop/dashboard surfaces. ``timeout`` bounds EACH join, mirroring the
pre-#51587 single-owner behavior for the entry thread.
"""
entry_done = True
thread = _mcp_discovery_thread
if thread is None:
return True
thread.join(timeout=timeout)
return not thread.is_alive()
if thread is not None:
thread.join(timeout=timeout)
entry_done = not thread.is_alive()
try:
from hermes_cli.mcp_startup import join_mcp_discovery as _startup_join
startup_done = _startup_join(timeout=timeout)
except Exception:
startup_done = True
return entry_done and startup_done
def main():