hermes-agent/tests/tools/test_mcp_tool_issue_948.py
qdaszx 6305ac0e4b fix(mcp): run OSV malware preflight off the event loop with a bounded timeout (#29184)
During stdio MCP server startup, _run_stdio (an async method) called the
synchronous check_package_for_malware() inline. That makes a blocking
urllib HTTPS POST to api.osv.dev whose own timeout doesn't reliably cover a
stalled SSL handshake, so an intermittent network issue froze the entire
asyncio event loop for up to ~120s — blowing past the TUI/gateway's 15s
startup budget and showing "gateway startup timeout".

Run the check via asyncio.to_thread (off the loop) AND bound it with
asyncio.wait_for(timeout=_OSV_MALWARE_CHECK_TIMEOUT_S=12s). The malware check
is fail-open, so on timeout we log and proceed rather than blocking startup.

Salvaged from #29190 by @qdaszx (re-applied on current main — the call site
moved since the PR was opened), combining the to_thread approach also proposed
in #29192 by @ygd58. Two load-bearing tests: event-loop-not-blocked-during-
check and timeout-fails-open — both mutation-verified to fail against the old
inline blocking call.

Closes #29184.

Co-authored-by: ygd58 <buraysandro9@gmail.com>
2026-06-25 23:30:41 +05:30

210 lines
8.6 KiB
Python

import asyncio
import os
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch
from tools.mcp_tool import MCPServerTask, _format_connect_error, _resolve_stdio_command, _MCP_AVAILABLE
# Ensure the mcp module symbols exist for patching even when the SDK isn't installed
if not _MCP_AVAILABLE:
import tools.mcp_tool as _mcp_mod
if not hasattr(_mcp_mod, "StdioServerParameters"):
_mcp_mod.StdioServerParameters = MagicMock
if not hasattr(_mcp_mod, "stdio_client"):
_mcp_mod.stdio_client = MagicMock
if not hasattr(_mcp_mod, "ClientSession"):
_mcp_mod.ClientSession = MagicMock
def test_resolve_stdio_command_falls_back_to_hermes_node_bin(tmp_path):
node_bin = tmp_path / "node" / "bin"
node_bin.mkdir(parents=True)
npx_path = node_bin / "npx"
npx_path.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8")
npx_path.chmod(0o755)
with patch("tools.mcp_tool.shutil.which", return_value=None), \
patch.dict("os.environ", {"HERMES_HOME": str(tmp_path)}, clear=False):
command, env = _resolve_stdio_command("npx", {"PATH": "/usr/bin"})
assert command == str(npx_path)
assert env["PATH"].split(os.pathsep)[0] == str(node_bin)
def test_resolve_stdio_command_falls_back_to_usr_local_bin():
"""When ``npx`` isn't on the filtered PATH and isn't under ``$HERMES_HOME/node/bin``
or ``~/.local/bin``, the resolver should still locate it at ``/usr/local/bin/npx``.
This is the canonical install location for Node on Linux from-source builds,
the upstream ``node:bookworm-slim`` image (which the Hermes Docker image
copies ``node + npm + corepack`` from since #4977), and macOS Homebrew on
Intel. Without this candidate, MCP servers run with an ``env.PATH`` that
omits ``/usr/local/bin`` (common when users hand-author PATH for sandboxing)
fail with ENOENT at ``execvp``.
"""
target = os.path.join(os.sep, "usr", "local", "bin", "npx")
# Pretend ONLY the /usr/local/bin/npx candidate exists and is executable —
# the other candidates ($HERMES_HOME/node/bin/npx and ~/.local/bin/npx)
# should fail isfile() and the resolver must fall through to /usr/local/bin.
def _fake_isfile(path):
return path == target
def _fake_access(path, _mode):
return path == target
with patch("tools.mcp_tool.shutil.which", return_value=None), \
patch("tools.mcp_tool.os.path.isfile", side_effect=_fake_isfile), \
patch("tools.mcp_tool.os.access", side_effect=_fake_access):
command, env = _resolve_stdio_command("npx", {"PATH": "/opt/data/bin:/usr/bin:/bin"})
assert command == target
# /usr/local/bin must be prepended so npx's shebang (`/usr/bin/env node`)
# can find node in the same directory.
assert env["PATH"].split(os.pathsep)[0] == os.path.dirname(target)
def test_resolve_stdio_command_respects_explicit_empty_path():
seen_paths = []
def _fake_which(_cmd, path=None):
seen_paths.append(path)
return None
with patch("tools.mcp_tool.shutil.which", side_effect=_fake_which):
command, env = _resolve_stdio_command("python", {"PATH": ""})
assert command == "python"
assert env["PATH"] == ""
assert seen_paths == [""]
def test_format_connect_error_unwraps_exception_group():
error = ExceptionGroup(
"unhandled errors in a TaskGroup",
[FileNotFoundError(2, "No such file or directory", "node")],
)
message = _format_connect_error(error)
assert "missing executable 'node'" in message
def test_run_stdio_uses_resolved_command_and_prepended_path(tmp_path):
node_bin = tmp_path / "node" / "bin"
node_bin.mkdir(parents=True)
npx_path = node_bin / "npx"
npx_path.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8")
npx_path.chmod(0o755)
mock_session = MagicMock()
mock_session.initialize = AsyncMock()
mock_session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[]))
mock_stdio_cm = MagicMock()
mock_stdio_cm.__aenter__ = AsyncMock(return_value=(object(), object()))
mock_stdio_cm.__aexit__ = AsyncMock(return_value=False)
mock_session_cm = MagicMock()
mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session)
mock_session_cm.__aexit__ = AsyncMock(return_value=False)
async def _test():
with patch("tools.mcp_tool.shutil.which", return_value=None), \
patch.dict("os.environ", {"HERMES_HOME": str(tmp_path), "PATH": "/usr/bin", "HOME": str(tmp_path)}, clear=False), \
patch("tools.mcp_tool.StdioServerParameters") as mock_params, \
patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \
patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm):
server = MCPServerTask("srv")
await server.start({"command": "npx", "args": ["-y", "pkg"], "env": {"PATH": "/usr/bin"}})
call_kwargs = mock_params.call_args.kwargs
assert call_kwargs["command"] == str(npx_path)
assert call_kwargs["env"]["PATH"].split(os.pathsep)[0] == str(node_bin)
await server.shutdown()
asyncio.run(_test())
# ---------------------------------------------------------------------------
# #29184: OSV malware preflight must not block the asyncio event loop, and a
# stalled check must time out fail-open rather than freezing MCP startup.
# ---------------------------------------------------------------------------
def _stdio_mocks():
mock_session = MagicMock()
mock_session.initialize = AsyncMock()
mock_session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[]))
mock_stdio_cm = MagicMock()
mock_stdio_cm.__aenter__ = AsyncMock(return_value=(object(), object()))
mock_stdio_cm.__aexit__ = AsyncMock(return_value=False)
mock_session_cm = MagicMock()
mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session)
mock_session_cm.__aexit__ = AsyncMock(return_value=False)
return mock_stdio_cm, mock_session_cm
def test_run_stdio_malware_check_does_not_block_event_loop():
"""The blocking OSV check runs off the loop (asyncio.to_thread), so a
concurrent coroutine keeps making progress while it runs."""
import time
mock_stdio_cm, mock_session_cm = _stdio_mocks()
def slow_check(_command, _args):
time.sleep(0.3) # simulate a slow OSV HTTPS call
return None
ticks = {"n": 0}
async def _ticker():
# If the loop were blocked, these ticks would not advance during the
# 0.3s check.
for _ in range(20):
await asyncio.sleep(0.01)
ticks["n"] += 1
async def _test():
with patch("tools.osv_check.check_package_for_malware", side_effect=slow_check), \
patch("tools.mcp_tool.StdioServerParameters"), \
patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \
patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm):
server = MCPServerTask("srv")
ticker = asyncio.create_task(_ticker())
await server.start({"command": "npx", "args": ["-y", "pkg"]})
ticks_during = ticks["n"]
await ticker
await server.shutdown()
# The loop kept ticking DURING the 0.3s blocking check -> not blocked.
assert ticks_during >= 3, f"event loop appeared blocked (ticks={ticks_during})"
asyncio.run(_test())
def test_run_stdio_malware_check_times_out_fail_open():
"""A check that hangs past the timeout must NOT freeze startup: it times
out, logs, and proceeds (fail-open) so the server still starts."""
import time
mock_stdio_cm, mock_session_cm = _stdio_mocks()
def hung_check(_command, _args):
time.sleep(0.5) # outlasts the 0.2s timeout 2.5x; short enough not to stall teardown
return "MALWARE" # would block startup if awaited to completion
async def _test():
with patch("tools.osv_check.check_package_for_malware", side_effect=hung_check), \
patch("tools.mcp_tool._OSV_MALWARE_CHECK_TIMEOUT_S", 0.2), \
patch("tools.mcp_tool.StdioServerParameters"), \
patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \
patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm):
server = MCPServerTask("srv")
start = time.monotonic()
await server.start({"command": "npx", "args": ["-y", "pkg"]})
elapsed = time.monotonic() - start
await server.shutdown()
# Returned shortly after the 0.2s timeout (fail-open), not the 0.5s hang.
assert elapsed < 1.0, f"startup did not fail-open promptly ({elapsed:.1f}s)"
asyncio.run(_test())