diff --git a/tests/tools/test_mcp_utility_capability_gating.py b/tests/tools/test_mcp_utility_capability_gating.py new file mode 100644 index 0000000000..971711d75c --- /dev/null +++ b/tests/tools/test_mcp_utility_capability_gating.py @@ -0,0 +1,175 @@ +"""Regression tests for capability-gated MCP utility schema registration. + +Background +========== +For every connected MCP server, hermes-agent used to register four "utility" +tool schemas (``mcp__list_resources``, ``read_resource``, +``list_prompts``, ``get_prompt``) regardless of whether the server actually +advertises those capabilities. The old gate used ``hasattr(server.session, +method)`` which always returned True because ``mcp.ClientSession`` defines +all four methods on the class — independent of what the remote server +supports. + +Tools-only servers like ``@upstash/context7-mcp`` advertise +``{\"tools\": {\"listChanged\": true}}`` in their ``initialize`` response — +no ``prompts`` or ``resources`` keys — and they return JSON-RPC +``-32601 Method not found`` for ``prompts/list``, ``prompts/get``, +``resources/list``, ``resources/read``. The model would try the stubs, +get the error, and incorrectly conclude the MCP server was broken. + +The fix captures the ``InitializeResult`` from +``await session.initialize()`` into ``MCPServerTask.initialize_result`` +and gates utility schema registration on the advertised +``capabilities.resources`` / ``capabilities.prompts`` sub-objects. See +#18051 for the reporter's repro (Context7) and analysis. +""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + + +def _make_init_result(*, resources: bool, prompts: bool): + """Build a fake ``InitializeResult`` whose ``capabilities`` sub-object + matches a server that advertises exactly the given capability set. + + MCP spec shape: ``capabilities.resources`` / ``capabilities.prompts`` + are non-None iff the server implements the corresponding request + family. We mirror that with ``SimpleNamespace`` because the real SDK + models are pydantic and we don't want the test to couple to pydantic + versioning. + """ + caps_attrs: dict = {"tools": SimpleNamespace(listChanged=True)} + caps_attrs["resources"] = SimpleNamespace(listChanged=True) if resources else None + caps_attrs["prompts"] = SimpleNamespace(listChanged=True) if prompts else None + return SimpleNamespace(capabilities=SimpleNamespace(**caps_attrs)) + + +def _make_fake_server(*, initialize_result): + """Build a stand-in ``MCPServerTask`` that exposes just the fields + ``_select_utility_schemas`` inspects: ``name``, ``session``, + ``initialize_result``. + + A plain ``MCPServerTask`` uses ``__slots__`` and needs an asyncio + loop for the ``Event``/``Lock`` init — overkill for unit scope. + """ + server = MagicMock() + server.name = "test-server" + # session must satisfy the legacy ``hasattr`` fallback too + server.session = MagicMock( + spec=["list_resources", "read_resource", "list_prompts", "get_prompt"] + ) + server.initialize_result = initialize_result + return server + + +def _handler_keys(selected): + return {entry["handler_key"] for entry in selected} + + +class TestCapabilityGatedRegistration: + def test_tools_only_server_gets_no_utility_schemas(self): + """Context7-shaped server (tools only, no prompts / resources) should + get zero utility stubs registered — this is the exact scenario + from the #18051 bug report.""" + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server( + initialize_result=_make_init_result(resources=False, prompts=False) + ) + selected = _select_utility_schemas("context7", server, {}) + assert _handler_keys(selected) == set(), ( + f"tools-only server should have zero utility stubs, got " + f"{_handler_keys(selected)}" + ) + + def test_resources_only_server_gets_resource_stubs_only(self): + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server( + initialize_result=_make_init_result(resources=True, prompts=False) + ) + selected = _select_utility_schemas("res-only", server, {}) + assert _handler_keys(selected) == {"list_resources", "read_resource"} + + def test_prompts_only_server_gets_prompt_stubs_only(self): + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server( + initialize_result=_make_init_result(resources=False, prompts=True) + ) + selected = _select_utility_schemas("prompt-only", server, {}) + assert _handler_keys(selected) == {"list_prompts", "get_prompt"} + + def test_fully_capable_server_gets_all_four_stubs(self): + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server( + initialize_result=_make_init_result(resources=True, prompts=True) + ) + selected = _select_utility_schemas("full", server, {}) + assert _handler_keys(selected) == { + "list_resources", "read_resource", "list_prompts", "get_prompt", + } + + +class TestConfigFilterStillApplies: + """Per-server config flags ``tools.resources: false`` / ``tools.prompts: false`` + must continue to override even when the server DOES advertise the capability.""" + + def test_config_disables_resources_even_when_advertised(self): + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server( + initialize_result=_make_init_result(resources=True, prompts=True) + ) + selected = _select_utility_schemas( + "full-but-filtered", + server, + {"tools": {"resources": False}}, + ) + assert _handler_keys(selected) == {"list_prompts", "get_prompt"} + + def test_config_disables_prompts_even_when_advertised(self): + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server( + initialize_result=_make_init_result(resources=True, prompts=True) + ) + selected = _select_utility_schemas( + "full-but-filtered", + server, + {"tools": {"prompts": False}}, + ) + assert _handler_keys(selected) == {"list_resources", "read_resource"} + + +class TestLegacyFallback: + """When ``initialize_result`` is missing (older test fixtures or code + paths that haven't captured it yet), fall back to the legacy hasattr + check so pre-existing tests and servers keep working.""" + + def test_no_initialize_result_falls_back_to_hasattr_check(self): + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server(initialize_result=None) + # With the legacy fallback, session.spec includes all four methods, + # so all four stubs should register (old behavior). + selected = _select_utility_schemas("legacy", server, {}) + assert _handler_keys(selected) == { + "list_resources", "read_resource", "list_prompts", "get_prompt", + } + + def test_no_initialize_result_respects_session_spec(self): + """Legacy fallback still filters by ``hasattr(session, method)``, so + a session whose spec lacks a method is correctly skipped.""" + from tools.mcp_tool import _select_utility_schemas + + server = _make_fake_server(initialize_result=None) + # Override session to a spec that only has list_resources + server.session = MagicMock(spec=["list_resources"]) + selected = _select_utility_schemas("legacy-partial", server, {}) + assert _handler_keys(selected) == {"list_resources"} diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index 95ac400fdb..73480ada9f 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -950,6 +950,7 @@ class MCPServerTask: "_tools", "_error", "_config", "_sampling", "_registered_tool_names", "_auth_type", "_refresh_lock", "_rpc_lock", "_pending_refresh_tasks", + "initialize_result", ) def __init__(self, name: str): @@ -980,6 +981,12 @@ class MCPServerTask: # transports for conservative per-server ordering. self._rpc_lock = asyncio.Lock() self._pending_refresh_tasks: set[asyncio.Task] = set() + # Captures the ``InitializeResult`` returned by + # ``await session.initialize()`` so downstream code can inspect the + # server's real advertised capabilities (``.capabilities.resources``, + # ``.capabilities.prompts``) instead of assuming every ``ClientSession`` + # method attribute corresponds to a supported server method. See #18051. + self.initialize_result: Optional[Any] = None def _is_http(self) -> bool: """Check if this server uses HTTP transport.""" @@ -1225,7 +1232,7 @@ class MCPServerTask: async with ClientSession( read_stream, write_stream, **sampling_kwargs ) as session: - await session.initialize() + self.initialize_result = await session.initialize() self.session = session await self._discover_tools() self._ready.set() @@ -1324,7 +1331,7 @@ class MCPServerTask: async with ClientSession( read_stream, write_stream, **sampling_kwargs ) as session: - await session.initialize() + self.initialize_result = await session.initialize() self.session = session await self._discover_tools() self._ready.set() @@ -1371,7 +1378,7 @@ class MCPServerTask: read_stream, write_stream, _get_session_id, ): async with ClientSession(read_stream, write_stream, **sampling_kwargs) as session: - await session.initialize() + self.initialize_result = await session.initialize() self.session = session await self._discover_tools() self._ready.set() @@ -1394,7 +1401,7 @@ class MCPServerTask: read_stream, write_stream, _get_session_id, ): async with ClientSession(read_stream, write_stream, **sampling_kwargs) as session: - await session.initialize() + self.initialize_result = await session.initialize() self.session = session await self._discover_tools() self._ready.set() @@ -2806,6 +2813,23 @@ _UTILITY_CAPABILITY_METHODS = { "get_prompt": "get_prompt", } +# Maps each utility handler to the MCP capability key that must be non-None +# on the server's ``initialize`` response for the handler to be registered. +# Source of truth: MCP spec — capabilities.resources / capabilities.prompts +# are present on the response only when the server actually implements +# those request families. Without this gate, tools-only servers (e.g. +# Context7 @upstash/context7-mcp, which advertises only ``tools``) had +# all four utility stubs registered and every model call to them came +# back with JSON-RPC ``-32601 Method not found``, which made the model +# conclude the server was broken even when the real tools worked. See +# #18051. +_UTILITY_CAPABILITY_ATTRS = { + "list_resources": "resources", + "read_resource": "resources", + "list_prompts": "prompts", + "get_prompt": "prompts", +} + def _select_utility_schemas(server_name: str, server: MCPServerTask, config: dict) -> List[dict]: """Select utility schemas based on config and server capabilities.""" @@ -2813,6 +2837,16 @@ def _select_utility_schemas(server_name: str, server: MCPServerTask, config: dic resources_enabled = _parse_boolish(tools_filter.get("resources"), default=True) prompts_enabled = _parse_boolish(tools_filter.get("prompts"), default=True) + # ``initialize_result.capabilities`` is the source of truth: its sub-objects + # (``resources``, ``prompts``) are non-None iff the server advertises that + # request family. ``hasattr(server.session, ...)`` was the old gate but + # ClientSession always has the four method attributes defined on the class, + # so it never filtered anything. + advertised_caps = None + init_result = getattr(server, "initialize_result", None) + if init_result is not None: + advertised_caps = getattr(init_result, "capabilities", None) + selected: List[dict] = [] for entry in _build_utility_schemas(server_name): handler_key = entry["handler_key"] @@ -2823,15 +2857,33 @@ def _select_utility_schemas(server_name: str, server: MCPServerTask, config: dic logger.debug("MCP server '%s': skipping utility '%s' (prompts disabled)", server_name, handler_key) continue - required_method = _UTILITY_CAPABILITY_METHODS[handler_key] - if not hasattr(server.session, required_method): - logger.debug( - "MCP server '%s': skipping utility '%s' (session lacks %s)", - server_name, - handler_key, - required_method, - ) - continue + # Preferred gate: check the server's advertised capabilities. Skip + # if the capability is explicitly not advertised. + if advertised_caps is not None: + cap_attr = _UTILITY_CAPABILITY_ATTRS[handler_key] + if getattr(advertised_caps, cap_attr, None) is None: + logger.debug( + "MCP server '%s': skipping utility '%s' " + "(server does not advertise '%s' capability)", + server_name, + handler_key, + cap_attr, + ) + continue + else: + # Legacy fallback for test fixtures or older code paths where + # initialize_result wasn't captured. Preserves the old behavior + # of registering every stub in that case rather than regressing + # any server that was working before this fix. + required_method = _UTILITY_CAPABILITY_METHODS[handler_key] + if not hasattr(server.session, required_method): + logger.debug( + "MCP server '%s': skipping utility '%s' (session lacks %s)", + server_name, + handler_key, + required_method, + ) + continue selected.append(entry) return selected