fix(mcp): propagate HERMES_HOME override onto the MCP event loop (#44220)

* fix(mcp): propagate HERMES_HOME override onto the MCP event loop

Closes the known limit documented in #44007: tasks scheduled via
run_coroutine_threadsafe are created INSIDE the MCP loop thread, so they
copy that thread's context — a per-request profile scope (dashboard
?profile= endpoints, e.g. the MCP 'Test server' probe) silently vanished
for anything resolving get_hermes_home() inside the coroutine. Most
visible symptom: OAuth token-store paths (HERMES_HOME/mcp-tokens/)
resolved against the process home instead of the selected profile, so
testing an OAuth MCP cross-profile read the wrong tokens.

_run_on_mcp_loop now wraps scheduled coroutines with the caller's
context-local override (_wrap_with_home_override): set inside the task's
own context on the loop, reset on completion — task-local, so concurrent
calls carrying different scopes don't interfere, and the loop thread's
default context stays untouched. No-op (coroutine passes through
unwrapped) when no override is active, i.e. every non-dashboard caller.

web_server's probe comment updated from 'known limit' to 'covered'.

Tests: override propagation (direct + factory form), OAuth token-path
resolution on the loop, loop-context cleanliness after scoped calls,
no-op passthrough. 225 green across mcp_tool + unification suites.

* test(mcp): concurrent different-scope calls don't interfere
This commit is contained in:
Teknium 2026-06-11 04:37:01 -07:00 committed by GitHub
parent 3edd09a46f
commit 73dd584995
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 188 additions and 4 deletions

View file

@ -6577,9 +6577,10 @@ async def test_mcp_server(name: str, profile: Optional[str] = None):
# selected profile, matching the config the server was saved into.
# (asyncio.to_thread copies contextvars, but entering explicitly
# keeps the lock-protected SKILLS_DIR swap balanced per-thread.)
# Known limit: the dedicated MCP event-loop thread spawned by the
# probe doesn't inherit the contextvar, so OAuth token-store paths
# resolve against the process HERMES_HOME.
# The probe's dedicated MCP event-loop thread is covered too:
# _run_on_mcp_loop wraps scheduled coroutines with the caller's
# HERMES_HOME override (see mcp_tool._wrap_with_home_override), so
# OAuth token stores resolve against the selected profile as well.
with _profile_scope(profile):
return _probe_single_server(name, servers[name])

View file

@ -0,0 +1,139 @@
"""Regression tests for HERMES_HOME override propagation onto the MCP loop.
Tasks scheduled via run_coroutine_threadsafe are created inside the MCP
event-loop thread, so they copy THAT thread's context — not the scheduling
thread's. A per-request profile scope (dashboard ?profile= endpoints, e.g.
the MCP "Test server" probe) would silently vanish for anything resolving
get_hermes_home() inside the coroutine, most visibly OAuth token-store
paths. _run_on_mcp_loop now wraps scheduled coroutines with the caller's
override (mcp_tool._wrap_with_home_override).
"""
import os
import pytest
@pytest.fixture
def mcp_loop():
import tools.mcp_tool as mcp_tool
mcp_tool._ensure_mcp_loop()
yield mcp_tool
mcp_tool._stop_mcp_loop()
def test_override_propagates_to_mcp_loop(tmp_path, monkeypatch, mcp_loop):
from hermes_constants import (
get_hermes_home,
reset_hermes_home_override,
set_hermes_home_override,
)
process_home = tmp_path / "proc-home"
profile_home = tmp_path / "profile-home"
process_home.mkdir()
profile_home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(process_home))
async def read_home():
return str(get_hermes_home())
# Unscoped: the loop task sees the process home.
assert mcp_loop._run_on_mcp_loop(read_home(), timeout=10) == str(process_home)
# Scoped: the caller's override must reach the loop task.
token = set_hermes_home_override(str(profile_home))
try:
assert mcp_loop._run_on_mcp_loop(read_home(), timeout=10) == str(profile_home)
# Factory form must be wrapped too.
assert mcp_loop._run_on_mcp_loop(lambda: read_home(), timeout=10) == str(
profile_home
)
finally:
reset_hermes_home_override(token)
# The loop thread's default context is untouched afterwards.
assert mcp_loop._run_on_mcp_loop(read_home(), timeout=10) == str(process_home)
def test_oauth_token_paths_follow_override(tmp_path, monkeypatch, mcp_loop):
"""The actual symptom path: HermesTokenStorage resolving inside the
probe's MCP-loop coroutine must land in the selected profile's
mcp-tokens dir, not the process home's."""
from hermes_constants import (
reset_hermes_home_override,
set_hermes_home_override,
)
process_home = tmp_path / "proc-home"
profile_home = tmp_path / "profile-home"
process_home.mkdir()
profile_home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(process_home))
async def token_path():
from tools.mcp_oauth import HermesTokenStorage
return str(HermesTokenStorage("probe-srv")._tokens_path())
token = set_hermes_home_override(str(profile_home))
try:
path = mcp_loop._run_on_mcp_loop(token_path(), timeout=10)
finally:
reset_hermes_home_override(token)
assert path.startswith(str(profile_home))
assert os.path.join("mcp-tokens", "probe-srv.json") in path
def test_concurrent_scopes_do_not_interfere(tmp_path, monkeypatch, mcp_loop):
"""Two threads carrying DIFFERENT overrides scheduling onto the same
loop must each see their own home the wrapper is task-local."""
import threading
from hermes_constants import (
get_hermes_home,
reset_hermes_home_override,
set_hermes_home_override,
)
process_home = tmp_path / "proc-home"
home_a = tmp_path / "profile-a"
home_b = tmp_path / "profile-b"
for h in (process_home, home_a, home_b):
h.mkdir()
monkeypatch.setenv("HERMES_HOME", str(process_home))
async def read_home():
return str(get_hermes_home())
results: dict = {}
def scoped_call(key, home):
token = set_hermes_home_override(str(home))
try:
results[key] = mcp_loop._run_on_mcp_loop(read_home(), timeout=10)
finally:
reset_hermes_home_override(token)
threads = [
threading.Thread(target=scoped_call, args=("a", home_a)),
threading.Thread(target=scoped_call, args=("b", home_b)),
]
for t in threads:
t.start()
for t in threads:
t.join(timeout=15)
assert results == {"a": str(home_a), "b": str(home_b)}
def test_wrap_is_noop_without_override(mcp_loop):
"""No active override → the coroutine passes through unwrapped."""
async def trivial():
return 42
coro = trivial()
wrapped = mcp_loop._wrap_with_home_override(coro)
assert wrapped is coro
coro.close()

View file

@ -90,7 +90,7 @@ import sys
import threading
import time
from datetime import datetime
from typing import Any, Dict, List, Optional
from typing import Any, Coroutine, Dict, List, Optional
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
@ -2460,6 +2460,37 @@ def _ensure_mcp_loop():
_mcp_thread.start()
def _wrap_with_home_override(coro: "Coroutine") -> "Coroutine":
"""Carry the caller's context-local HERMES_HOME override into ``coro``.
Returns ``coro`` unchanged when no override is active. Otherwise wraps
it so the override is set inside the coroutine's own (task-local)
context on the MCP loop and reset when it completes concurrent calls
carrying different scopes don't interfere.
"""
try:
from hermes_constants import (
get_hermes_home_override,
reset_hermes_home_override,
set_hermes_home_override,
)
home_override = get_hermes_home_override()
except Exception:
return coro
if not home_override:
return coro
async def _scoped():
token = set_hermes_home_override(home_override)
try:
return await coro
finally:
reset_hermes_home_override(token)
return _scoped()
def _run_on_mcp_loop(coro_or_factory, timeout: float = 30):
"""Schedule a coroutine on the MCP event loop and block until done.
@ -2482,6 +2513,19 @@ def _run_on_mcp_loop(coro_or_factory, timeout: float = 30):
raise RuntimeError("MCP event loop is not running")
coro = coro_or_factory() if callable(coro_or_factory) else coro_or_factory
# Propagate the context-local HERMES_HOME override onto the MCP loop.
# Tasks scheduled via run_coroutine_threadsafe are created INSIDE the
# loop thread, so they copy the loop thread's context — not the
# scheduling thread's. A per-request profile scope (the dashboard's
# ?profile= endpoints, e.g. the MCP "Test server" probe) would silently
# vanish here: OAuth token stores and any other get_hermes_home()
# resolution inside the coroutine would read the process home instead
# of the selected profile's. Re-establish the override inside the
# task's own context (task-local — concurrent calls carrying different
# scopes don't interfere). No-op when no override is active.
coro = _wrap_with_home_override(coro)
future = safe_schedule_threadsafe(
coro, loop,
logger=logger,