Merge pull request #52147 from NousResearch/salvage/29184-mcp-osv-nonblocking

fix(mcp): run OSV malware preflight off the event loop with a bounded timeout (#29184)
This commit is contained in:
kshitij 2026-06-25 23:39:44 +05:30 committed by GitHub
commit d682f320b3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 110 additions and 2 deletions

View file

@ -66,6 +66,7 @@ AUTHOR_MAP = {
"96944678+sweetcornna@users.noreply.github.com": "sweetcornna", # cron ticker-liveness salvage co-author (#33849)
"izumi0uu@gmail.com": "izumi0uu", # PR #49544 salvage (native rich reply echo; #49534)
"dev@pixlmedia.no": "texhy", # PR #27435 salvage (few-but-huge preflight compression gate; #27405)
"qdaszx@naver.com": "qdaszx", # PR #29190 salvage (non-blocking OSV malware preflight; #29184)
"w31rdm4ch1n3z@protonmail.com": "w31rdm4ch1nZ",
"xtpeeps@gmail.com": "x7peeps",
"ahmad@madsgency.com": "ahmadashfq",

View file

@ -126,3 +126,85 @@ def test_run_stdio_uses_resolved_command_and_prepended_path(tmp_path):
await server.shutdown()
asyncio.run(_test())
# ---------------------------------------------------------------------------
# #29184: OSV malware preflight must not block the asyncio event loop, and a
# stalled check must time out fail-open rather than freezing MCP startup.
# ---------------------------------------------------------------------------
def _stdio_mocks():
mock_session = MagicMock()
mock_session.initialize = AsyncMock()
mock_session.list_tools = AsyncMock(return_value=SimpleNamespace(tools=[]))
mock_stdio_cm = MagicMock()
mock_stdio_cm.__aenter__ = AsyncMock(return_value=(object(), object()))
mock_stdio_cm.__aexit__ = AsyncMock(return_value=False)
mock_session_cm = MagicMock()
mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session)
mock_session_cm.__aexit__ = AsyncMock(return_value=False)
return mock_stdio_cm, mock_session_cm
def test_run_stdio_malware_check_does_not_block_event_loop():
"""The blocking OSV check runs off the loop (asyncio.to_thread), so a
concurrent coroutine keeps making progress while it runs."""
import time
mock_stdio_cm, mock_session_cm = _stdio_mocks()
def slow_check(_command, _args):
time.sleep(0.3) # simulate a slow OSV HTTPS call
return None
ticks = {"n": 0}
async def _ticker():
# If the loop were blocked, these ticks would not advance during the
# 0.3s check.
for _ in range(20):
await asyncio.sleep(0.01)
ticks["n"] += 1
async def _test():
with patch("tools.osv_check.check_package_for_malware", side_effect=slow_check), \
patch("tools.mcp_tool.StdioServerParameters"), \
patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \
patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm):
server = MCPServerTask("srv")
ticker = asyncio.create_task(_ticker())
await server.start({"command": "npx", "args": ["-y", "pkg"]})
ticks_during = ticks["n"]
await ticker
await server.shutdown()
# The loop kept ticking DURING the 0.3s blocking check -> not blocked.
assert ticks_during >= 3, f"event loop appeared blocked (ticks={ticks_during})"
asyncio.run(_test())
def test_run_stdio_malware_check_times_out_fail_open():
"""A check that hangs past the timeout must NOT freeze startup: it times
out, logs, and proceeds (fail-open) so the server still starts."""
import time
mock_stdio_cm, mock_session_cm = _stdio_mocks()
def hung_check(_command, _args):
time.sleep(0.5) # outlasts the 0.2s timeout 2.5x; short enough not to stall teardown
return "MALWARE" # would block startup if awaited to completion
async def _test():
with patch("tools.osv_check.check_package_for_malware", side_effect=hung_check), \
patch("tools.mcp_tool._OSV_MALWARE_CHECK_TIMEOUT_S", 0.2), \
patch("tools.mcp_tool.StdioServerParameters"), \
patch("tools.mcp_tool.stdio_client", return_value=mock_stdio_cm), \
patch("tools.mcp_tool.ClientSession", return_value=mock_session_cm):
server = MCPServerTask("srv")
start = time.monotonic()
await server.start({"command": "npx", "args": ["-y", "pkg"]})
elapsed = time.monotonic() - start
await server.shutdown()
# Returned shortly after the 0.2s timeout (fail-open), not the 0.5s hang.
assert elapsed < 1.0, f"startup did not fail-open promptly ({elapsed:.1f}s)"
asyncio.run(_test())

View file

@ -101,6 +101,16 @@ from urllib.parse import urlparse
logger = logging.getLogger(__name__)
# Upper bound for the OSV malware preflight during stdio MCP startup. The
# check makes a blocking urllib HTTPS call whose own timeout can fail to
# interrupt a stalled SSL handshake, which froze the asyncio event loop and
# blew past the gateway's 15s startup budget (#29184). We run it off the loop
# AND bound it here; the check is fail-open, so a timeout lets startup proceed.
# Set just ABOVE osv_check._TIMEOUT (10s) so the inner socket timeout fires
# first in the normal case; this outer bound only bites when a stalled SSL
# handshake defeats the inner timeout (the #29184 failure mode).
_OSV_MALWARE_CHECK_TIMEOUT_S = 12.0
# ---------------------------------------------------------------------------
# Stdio subprocess stderr redirection
@ -1772,9 +1782,24 @@ class MCPServerTask:
safe_env = _build_safe_env(user_env)
command, safe_env = _resolve_stdio_command(command, safe_env)
# Check package against OSV malware database before spawning
# Check package against OSV malware database before spawning.
# Run off the event loop (the urllib HTTPS call is blocking) and bound
# it with a wall-clock timeout so a stalled SSL handshake can't freeze
# MCP discovery / gateway startup (#29184). The check is fail-open, so
# on timeout we log and proceed rather than blocking indefinitely.
from tools.osv_check import check_package_for_malware
malware_error = check_package_for_malware(command, args)
try:
malware_error = await asyncio.wait_for(
asyncio.to_thread(check_package_for_malware, command, args),
timeout=_OSV_MALWARE_CHECK_TIMEOUT_S,
)
except asyncio.TimeoutError:
logger.warning(
"MCP server '%s': OSV malware preflight timed out after %.0fs "
"(network slow/unreachable) — proceeding without the check.",
self.name, _OSV_MALWARE_CHECK_TIMEOUT_S,
)
malware_error = None
if malware_error:
raise ValueError(
f"MCP server '{self.name}': {malware_error}"