From 6305ac0e4b5e8fd429108e65704db822e363f477 Mon Sep 17 00:00:00 2001 From: qdaszx Date: Thu, 25 Jun 2026 02:37:21 +0530 Subject: [PATCH] fix(mcp): run OSV malware preflight off the event loop with a bounded timeout (#29184) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During stdio MCP server startup, _run_stdio (an async method) called the synchronous check_package_for_malware() inline. That makes a blocking urllib HTTPS POST to api.osv.dev whose own timeout doesn't reliably cover a stalled SSL handshake, so an intermittent network issue froze the entire asyncio event loop for up to ~120s — blowing past the TUI/gateway's 15s startup budget and showing "gateway startup timeout". Run the check via asyncio.to_thread (off the loop) AND bound it with asyncio.wait_for(timeout=_OSV_MALWARE_CHECK_TIMEOUT_S=12s). The malware check is fail-open, so on timeout we log and proceed rather than blocking startup. Salvaged from #29190 by @qdaszx (re-applied on current main — the call site moved since the PR was opened), combining the to_thread approach also proposed in #29192 by @ygd58. Two load-bearing tests: event-loop-not-blocked-during- check and timeout-fails-open — both mutation-verified to fail against the old inline blocking call. Closes #29184. Co-authored-by: ygd58 --- scripts/release.py | 1 + tests/tools/test_mcp_tool_issue_948.py | 82 ++++++++++++++++++++++++++ tools/mcp_tool.py | 29 ++++++++- 3 files changed, 110 insertions(+), 2 deletions(-) diff --git a/scripts/release.py b/scripts/release.py index a3f01adf0c7..997b9d32310 100755 --- a/scripts/release.py +++ b/scripts/release.py @@ -65,6 +65,7 @@ AUTHOR_MAP = { "96944678+sweetcornna@users.noreply.github.com": "sweetcornna", # cron ticker-liveness salvage co-author (#33849) "izumi0uu@gmail.com": "izumi0uu", # PR #49544 salvage (native rich reply echo; #49534) "dev@pixlmedia.no": "texhy", # PR #27435 salvage (few-but-huge preflight compression gate; #27405) + "qdaszx@naver.com": "qdaszx", # PR #29190 salvage (non-blocking OSV malware preflight; #29184) "w31rdm4ch1n3z@protonmail.com": "w31rdm4ch1nZ", "xtpeeps@gmail.com": "x7peeps", "ahmad@madsgency.com": "ahmadashfq", diff --git a/tests/tools/test_mcp_tool_issue_948.py b/tests/tools/test_mcp_tool_issue_948.py index aefb32481df..c258cd570c7 100644 --- a/tests/tools/test_mcp_tool_issue_948.py +++ b/tests/tools/test_mcp_tool_issue_948.py @@ -126,3 +126,85 @@ def test_run_stdio_uses_resolved_command_and_prepended_path(tmp_path): await server.shutdown() asyncio.run(_test()) + + +# --------------------------------------------------------------------------- +# #29184: OSV malware preflight must not block the asyncio event loop, and a +# stalled check must time out fail-open rather than freezing MCP startup. +# --------------------------------------------------------------------------- + + +def _stdio_mocks(): + mock_session = MagicMock() + mock_session.initialize = AsyncMock() + mock_session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[])) + mock_stdio_cm = MagicMock() + mock_stdio_cm.__aenter__ = AsyncMock(return_value=(object(), object())) + mock_stdio_cm.__aexit__ = AsyncMock(return_value=False) + mock_session_cm = MagicMock() + mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session) + mock_session_cm.__aexit__ = AsyncMock(return_value=False) + return mock_stdio_cm, mock_session_cm + + +def test_run_stdio_malware_check_does_not_block_event_loop(): + """The blocking OSV check runs off the loop (asyncio.to_thread), so a + concurrent coroutine keeps making progress while it runs.""" + import time + mock_stdio_cm, mock_session_cm = _stdio_mocks() + + def slow_check(_command, _args): + time.sleep(0.3) # simulate a slow OSV HTTPS call + return None + + ticks = {"n": 0} + + async def _ticker(): + # If the loop were blocked, these ticks would not advance during the + # 0.3s check. + for _ in range(20): + await asyncio.sleep(0.01) + ticks["n"] += 1 + + async def _test(): + with patch("tools.osv_check.check_package_for_malware", side_effect=slow_check), \ + patch("tools.mcp_tool.StdioServerParameters"), \ + patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \ + patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm): + server = MCPServerTask("srv") + ticker = asyncio.create_task(_ticker()) + await server.start({"command": "npx", "args": ["-y", "pkg"]}) + ticks_during = ticks["n"] + await ticker + await server.shutdown() + # The loop kept ticking DURING the 0.3s blocking check -> not blocked. + assert ticks_during >= 3, f"event loop appeared blocked (ticks={ticks_during})" + + asyncio.run(_test()) + + +def test_run_stdio_malware_check_times_out_fail_open(): + """A check that hangs past the timeout must NOT freeze startup: it times + out, logs, and proceeds (fail-open) so the server still starts.""" + import time + mock_stdio_cm, mock_session_cm = _stdio_mocks() + + def hung_check(_command, _args): + time.sleep(0.5) # outlasts the 0.2s timeout 2.5x; short enough not to stall teardown + return "MALWARE" # would block startup if awaited to completion + + async def _test(): + with patch("tools.osv_check.check_package_for_malware", side_effect=hung_check), \ + patch("tools.mcp_tool._OSV_MALWARE_CHECK_TIMEOUT_S", 0.2), \ + patch("tools.mcp_tool.StdioServerParameters"), \ + patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \ + patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm): + server = MCPServerTask("srv") + start = time.monotonic() + await server.start({"command": "npx", "args": ["-y", "pkg"]}) + elapsed = time.monotonic() - start + await server.shutdown() + # Returned shortly after the 0.2s timeout (fail-open), not the 0.5s hang. + assert elapsed < 1.0, f"startup did not fail-open promptly ({elapsed:.1f}s)" + + asyncio.run(_test()) diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index c31215ae09a..dead8ca2046 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -101,6 +101,16 @@ from urllib.parse import urlparse logger = logging.getLogger(__name__) +# Upper bound for the OSV malware preflight during stdio MCP startup. The +# check makes a blocking urllib HTTPS call whose own timeout can fail to +# interrupt a stalled SSL handshake, which froze the asyncio event loop and +# blew past the gateway's 15s startup budget (#29184). We run it off the loop +# AND bound it here; the check is fail-open, so a timeout lets startup proceed. +# Set just ABOVE osv_check._TIMEOUT (10s) so the inner socket timeout fires +# first in the normal case; this outer bound only bites when a stalled SSL +# handshake defeats the inner timeout (the #29184 failure mode). +_OSV_MALWARE_CHECK_TIMEOUT_S = 12.0 + # --------------------------------------------------------------------------- # Stdio subprocess stderr redirection @@ -1772,9 +1782,24 @@ class MCPServerTask: safe_env = _build_safe_env(user_env) command, safe_env = _resolve_stdio_command(command, safe_env) - # Check package against OSV malware database before spawning + # Check package against OSV malware database before spawning. + # Run off the event loop (the urllib HTTPS call is blocking) and bound + # it with a wall-clock timeout so a stalled SSL handshake can't freeze + # MCP discovery / gateway startup (#29184). The check is fail-open, so + # on timeout we log and proceed rather than blocking indefinitely. from tools.osv_check import check_package_for_malware - malware_error = check_package_for_malware(command, args) + try: + malware_error = await asyncio.wait_for( + asyncio.to_thread(check_package_for_malware, command, args), + timeout=_OSV_MALWARE_CHECK_TIMEOUT_S, + ) + except asyncio.TimeoutError: + logger.warning( + "MCP server '%s': OSV malware preflight timed out after %.0fs " + "(network slow/unreachable) — proceeding without the check.", + self.name, _OSV_MALWARE_CHECK_TIMEOUT_S, + ) + malware_error = None if malware_error: raise ValueError( f"MCP server '{self.name}': {malware_error}"