mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
The desktop app and dashboard chat reach the agent through the /api/ws JSON-RPC sidecar (tui_gateway.ws.handle_ws), NOT through tui_gateway.entry.main() — the stdio-TUI path that spawns the background MCP discovery thread. In the WS process discovery was therefore never started: _make_agent only *waits* (wait_for_mcp_discovery), which no-ops when the thread was never created, so the agent snapshotted an MCP-less tool list. The only discovery trigger reachable was a manual /reload-mcp, which is why tools appeared after a reload but vanished on restart. Start the shared, idempotent, config-gated background discovery in handle_ws right after accept() and before gateway.ready, so the first agent build picks up already-spawning servers (and the existing late-binding refresh handles slow ones). Fixes #38945.
164 lines
5.4 KiB
Python
164 lines
5.4 KiB
Python
import asyncio
|
|
import threading
|
|
import time
|
|
|
|
from hermes_cli import mcp_startup
|
|
from tui_gateway import server
|
|
from tui_gateway import ws as ws_mod
|
|
|
|
|
|
def test_ws_startup_starts_background_mcp_discovery(monkeypatch):
|
|
"""The desktop app and dashboard chat reach the agent through this WS
|
|
sidecar, not through tui_gateway.entry.main() (which spawns the discovery
|
|
thread for the stdio TUI). handle_ws must start discovery itself, otherwise
|
|
_make_agent's wait_for_mcp_discovery no-ops and the agent snapshots an
|
|
MCP-less tool list. Regression test for #38945."""
|
|
calls = []
|
|
monkeypatch.setattr(
|
|
mcp_startup,
|
|
"start_background_mcp_discovery",
|
|
lambda **kw: calls.append(kw),
|
|
)
|
|
|
|
class FakeWS:
|
|
async def accept(self):
|
|
pass
|
|
|
|
async def send_text(self, line):
|
|
pass
|
|
|
|
async def receive_text(self):
|
|
raise ws_mod._WebSocketDisconnect()
|
|
|
|
async def close(self):
|
|
pass
|
|
|
|
server._sessions.clear()
|
|
try:
|
|
asyncio.run(ws_mod.handle_ws(FakeWS()))
|
|
finally:
|
|
server._sessions.clear()
|
|
|
|
assert calls == [{"logger": ws_mod._log, "thread_name": "tui-ws-mcp-discovery"}]
|
|
|
|
|
|
def _run_disconnect(monkeypatch, seed):
|
|
"""Drive handle_ws to its disconnect `finally`, seeding sessions against the
|
|
live WSTransport the moment it exists. Returns nothing; inspect _sessions."""
|
|
# Disable the grace-reap Timer: detached sessions normally schedule a
|
|
# threading.Timer via _schedule_ws_orphan_reap, which would outlive the test
|
|
# and fire _reap during interpreter teardown — touching _sessions/DB and
|
|
# producing spurious post-run errors under the per-file CI runner. Grace=0
|
|
# short-circuits the Timer (see _schedule_ws_orphan_reap) so the test leaves
|
|
# no lingering thread.
|
|
monkeypatch.setattr(server, "_WS_ORPHAN_REAP_GRACE_S", 0)
|
|
|
|
# Mirror the real _finalize_session chokepoint: it is the single place that
|
|
# closes the slash-worker (#38095). Stub it but keep that behavior so the
|
|
# disconnect-reap path still exercises worker teardown.
|
|
def _fake_finalize(s, end_reason="tui_close"):
|
|
w = s.get("slash_worker")
|
|
if w:
|
|
w.close()
|
|
|
|
monkeypatch.setattr(server, "_finalize_session", _fake_finalize)
|
|
|
|
created = []
|
|
real_transport = ws_mod.WSTransport
|
|
monkeypatch.setattr(
|
|
ws_mod, "WSTransport",
|
|
lambda ws, loop, **kw: created.append(real_transport(ws, loop, **kw)) or created[-1],
|
|
)
|
|
|
|
class FakeWS:
|
|
async def accept(self):
|
|
pass
|
|
|
|
async def send_text(self, line):
|
|
pass
|
|
|
|
async def receive_text(self):
|
|
seed(created[0]) # transport now exists; attach it to sessions
|
|
raise ws_mod._WebSocketDisconnect()
|
|
|
|
async def close(self):
|
|
pass
|
|
|
|
asyncio.run(ws_mod.handle_ws(FakeWS()))
|
|
|
|
|
|
def test_ws_disconnect_reaps_flagged_session_and_closes_worker(monkeypatch):
|
|
closed = []
|
|
|
|
class FakeWorker:
|
|
def close(self):
|
|
closed.append(True)
|
|
|
|
server._sessions.clear()
|
|
try:
|
|
_run_disconnect(
|
|
monkeypatch,
|
|
lambda t: server._sessions.update(
|
|
flagged={
|
|
"transport": t,
|
|
"close_on_disconnect": True,
|
|
"slash_worker": FakeWorker(),
|
|
"session_key": "k",
|
|
}
|
|
),
|
|
)
|
|
assert "flagged" not in server._sessions
|
|
assert closed == [True]
|
|
finally:
|
|
server._sessions.clear()
|
|
|
|
|
|
def test_ws_disconnect_preserves_and_repoints_reconnectable_session(monkeypatch):
|
|
server._sessions.clear()
|
|
try:
|
|
_run_disconnect(
|
|
monkeypatch,
|
|
lambda t: server._sessions.update(
|
|
plain={"transport": t, "close_on_disconnect": False, "session_key": "k"}
|
|
),
|
|
)
|
|
assert server._sessions["plain"]["transport"] is server._detached_ws_transport
|
|
finally:
|
|
server._sessions.clear()
|
|
|
|
|
|
def test_ws_write_loop_stall_does_not_latch_transport(monkeypatch):
|
|
"""A write that times out because the event loop is stalled (GIL-heavy
|
|
agent turn) must NOT latch the transport closed — the frame is already
|
|
scheduled and flushes when the loop recovers. Latching here permanently
|
|
silenced live watch windows after one slow write."""
|
|
monkeypatch.setattr(ws_mod, "_WS_WRITE_TIMEOUT_S", 0.05)
|
|
sent = []
|
|
|
|
class FakeWS:
|
|
async def send_text(self, line):
|
|
sent.append(line)
|
|
|
|
loop = asyncio.new_event_loop()
|
|
thread = threading.Thread(target=loop.run_forever, daemon=True)
|
|
thread.start()
|
|
try:
|
|
transport = ws_mod.WSTransport(FakeWS(), loop, peer="stall-test")
|
|
# Stall the loop well past the write timeout, then write from this
|
|
# (non-loop) thread: the wait times out but the send stays in flight.
|
|
loop.call_soon_threadsafe(time.sleep, 0.3)
|
|
assert transport.write({"a": 1}) is True
|
|
assert transport._closed is False
|
|
|
|
# Once the loop breathes again, both the stalled frame and new writes
|
|
# must reach the socket.
|
|
assert transport.write({"b": 2}) is True
|
|
deadline = time.time() + 2
|
|
while len(sent) < 2 and time.time() < deadline:
|
|
time.sleep(0.01)
|
|
assert len(sent) == 2
|
|
assert transport._closed is False
|
|
finally:
|
|
loop.call_soon_threadsafe(loop.stop)
|
|
thread.join(timeout=2)
|
|
loop.close()
|