mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(gateway): remove process-global HERMES_SESSION_KEY write that misroutes approval prompts across concurrent sessions
GatewayRunner._run_agent's run_sync() wrote the per-turn session key to the process-global os.environ["HERMES_SESSION_KEY"]. Because os.environ is shared across the whole process, concurrent gateway sessions (e.g. two Discord threads) clobbered each other's value. A tool worker thread whose approval contextvar was unset then fell back to os.environ via get_current_session_key() and read whichever session ran run_sync() last — routing "Command Approval Required" prompts to the wrong thread. Session routing is already concurrency-safe via contextvars: - gateway/session_context.py _SESSION_KEY (set in set_session_vars) - tools/approval.py _approval_session_key (set via set_current_session_key right before the agent runs, inherited by tool worker threads) The only non-test readers of HERMES_SESSION_KEY (tools/approval.py, tools/terminal_tool.py, tools/kanban_tools.py) all prefer the contextvar with os.environ as a mere fallback. CLI/cron/TUI set their own os.environ via separate export paths (e.g. the TUI parent exporting it into the agent subprocess), so removing this in-process write does not affect them. Adds regression tests asserting the resolver prefers the contextvar and does not leak a concurrent session's cleared/clobbered os.environ value. Closes #24100 Co-authored-by: Yosapol Jitrak <yosapol@jitrak.dev>
This commit is contained in:
parent
cdb1dfbc49
commit
b0f44d3fad
2 changed files with 241 additions and 4 deletions
|
|
@ -15864,10 +15864,21 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
# `_resolve_turn_agent_config(message, …)`.
|
||||
nonlocal message
|
||||
|
||||
# session_key is now set via contextvars in _set_session_env()
|
||||
# (concurrency-safe). Keep os.environ as fallback for CLI/cron.
|
||||
os.environ["HERMES_SESSION_KEY"] = session_key or ""
|
||||
|
||||
# session_key is propagated via contextvars in _set_session_env()
|
||||
# (_SESSION_KEY) and via set_current_session_key() (_approval_session_key)
|
||||
# below — both concurrency-safe and inherited by tool worker threads.
|
||||
# We deliberately do NOT write os.environ["HERMES_SESSION_KEY"] here:
|
||||
# os.environ is process-global, so concurrent gateway sessions (e.g.
|
||||
# two Discord threads) would clobber each other's value, and a tool
|
||||
# thread whose contextvar is unset would fall back to os.environ and
|
||||
# read the wrong session key — misrouting command-approval prompts to
|
||||
# the wrong thread (#24100). The non-gateway surfaces don't depend on
|
||||
# this write: CLI and cron bind the session via contextvars
|
||||
# (set_current_session_key / session context), and only the TUI
|
||||
# slash-worker *subprocess* exports HERMES_SESSION_KEY (from its own
|
||||
# --session-key argv, a separate process) — so removing this in-process
|
||||
# gateway write does not affect any of them.
|
||||
|
||||
# Map platform enum to the platform hint key the agent understands.
|
||||
# Platform.LOCAL ("local") maps to "cli"; others pass through as-is.
|
||||
platform_key = "cli" if source.platform == Platform.LOCAL else source.platform.value
|
||||
|
|
|
|||
|
|
@ -647,3 +647,229 @@ class TestFallbackNoCallback:
|
|||
assert result["approved"] is False
|
||||
assert result.get("status") == "pending_approval"
|
||||
assert result.get("approval_pending") is True
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Regression: cross-session approval routing isolation (#24100)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCrossSessionApprovalIsolation:
|
||||
"""Regression for #24100.
|
||||
|
||||
The gateway used to write the per-turn session key to the
|
||||
process-global ``os.environ["HERMES_SESSION_KEY"]`` inside
|
||||
``GatewayRunner._run_agent``. Because ``os.environ`` is process-global,
|
||||
a concurrent gateway session (e.g. a second Discord thread) clobbered
|
||||
the value, and a tool worker thread whose approval contextvar was unset
|
||||
fell back to ``os.environ`` and read the *wrong* session key — routing
|
||||
the "Command Approval Required" prompt to the wrong thread.
|
||||
|
||||
The fix removes that ``os.environ`` write; routing is driven solely by
|
||||
the ``_approval_session_key`` contextvar. These tests assert that a
|
||||
worker thread carrying session A's contextvar resolves to session A
|
||||
even when ``os.environ`` has been clobbered to session B.
|
||||
"""
|
||||
|
||||
def setup_method(self):
|
||||
_clear_approval_state()
|
||||
os.environ.pop("HERMES_SESSION_KEY", None)
|
||||
|
||||
def teardown_method(self):
|
||||
os.environ.pop("HERMES_SESSION_KEY", None)
|
||||
|
||||
def test_contextvar_wins_over_clobbered_environ(self):
|
||||
"""get_current_session_key honors the contextvar, not stale env."""
|
||||
from tools.approval import (
|
||||
get_current_session_key,
|
||||
reset_current_session_key,
|
||||
set_current_session_key,
|
||||
)
|
||||
|
||||
# Simulate a concurrent session B having written process-global env
|
||||
# last (the "last writer wins" clobber that caused #24100).
|
||||
os.environ["HERMES_SESSION_KEY"] = "session-B"
|
||||
|
||||
token = set_current_session_key("session-A")
|
||||
try:
|
||||
# The worker running under session A's contextvar must resolve
|
||||
# to session A, NOT the env-pinned session B.
|
||||
assert get_current_session_key() == "session-A"
|
||||
finally:
|
||||
reset_current_session_key(token)
|
||||
|
||||
def test_unset_contextvar_does_not_fall_back_to_clobbered_environ(self):
|
||||
"""The resolver must not leak a concurrent session's clobbered
|
||||
``os.environ`` value once the session-context vars are cleared (#24100).
|
||||
|
||||
This exercises the resolver contract directly (not a separate worker
|
||||
thread): while the session-context var holds a key, resolution returns
|
||||
it; after ``clear_session_vars`` marks the vars explicitly cleared, the
|
||||
``os.environ`` fallback is suppressed and resolution must NOT return the
|
||||
stale, process-global value a concurrent session wrote. Under the buggy
|
||||
gateway that value would be another live session's key; here we assert
|
||||
the resolver never surfaces it.
|
||||
"""
|
||||
from gateway.session_context import clear_session_vars, set_session_vars
|
||||
from tools.approval import get_current_session_key
|
||||
|
||||
# Simulate: concurrent session B was the last to clobber os.environ
|
||||
# under the OLD buggy gateway. With the fix this write never happens,
|
||||
# but we set it here to prove the resolver no longer trusts it once
|
||||
# the session-context contextvars are explicitly cleared (as the
|
||||
# gateway does in its finally block via clear_session_vars()).
|
||||
os.environ["HERMES_SESSION_KEY"] = "session-B-stale"
|
||||
|
||||
# The gateway explicitly clears its session contextvars at turn end;
|
||||
# clear_session_vars sets them to "" to *suppress* the os.environ
|
||||
# fallback. A bare worker thread therefore must NOT see session-B.
|
||||
tokens = set_session_vars(session_key="session-A")
|
||||
try:
|
||||
assert get_current_session_key() == "session-A"
|
||||
finally:
|
||||
clear_session_vars(tokens)
|
||||
|
||||
# After clearing, resolution returns the explicit empty/default —
|
||||
# never the clobbered process-global value from session B.
|
||||
assert get_current_session_key() != "session-B-stale", (
|
||||
"resolver leaked a concurrent session's clobbered os.environ "
|
||||
"value — #24100 regression"
|
||||
)
|
||||
|
||||
def test_approval_prompt_routes_to_originating_session(self):
|
||||
"""A dangerous command in session A's worker thread notifies
|
||||
session A's callback, even though os.environ points at session B."""
|
||||
from tools.approval import (
|
||||
check_all_command_guards,
|
||||
register_gateway_notify,
|
||||
reset_current_session_key,
|
||||
resolve_gateway_approval,
|
||||
set_current_session_key,
|
||||
unregister_gateway_notify,
|
||||
)
|
||||
notified_a = []
|
||||
notified_b = []
|
||||
register_gateway_notify("session-A", lambda d: notified_a.append(d))
|
||||
register_gateway_notify("session-B", lambda d: notified_b.append(d))
|
||||
|
||||
# Concurrent session B clobbered the process-global env var last.
|
||||
os.environ["HERMES_SESSION_KEY"] = "session-B"
|
||||
os.environ["HERMES_GATEWAY_SESSION"] = "1"
|
||||
os.environ["HERMES_EXEC_ASK"] = "1"
|
||||
|
||||
result_holder = [None]
|
||||
|
||||
def worker_a():
|
||||
# This worker belongs to session A — only its contextvar is set;
|
||||
# it deliberately does NOT touch os.environ (mirroring the fixed
|
||||
# gateway, which no longer writes HERMES_SESSION_KEY).
|
||||
token = set_current_session_key("session-A")
|
||||
try:
|
||||
result_holder[0] = check_all_command_guards(
|
||||
"rm -rf /important", "local"
|
||||
)
|
||||
finally:
|
||||
reset_current_session_key(token)
|
||||
|
||||
t = threading.Thread(target=worker_a)
|
||||
t.start()
|
||||
try:
|
||||
for _ in range(50):
|
||||
if notified_a or notified_b:
|
||||
break
|
||||
time.sleep(0.05)
|
||||
|
||||
# The prompt must land in session A (the originator), never B.
|
||||
assert len(notified_a) == 1, "approval prompt did not route to session A"
|
||||
assert len(notified_b) == 0, "approval prompt leaked to session B (#24100)"
|
||||
assert "rm -rf /important" in notified_a[0]["command"]
|
||||
|
||||
resolve_gateway_approval("session-A", "once")
|
||||
t.join(timeout=5)
|
||||
assert result_holder[0] is not None
|
||||
assert result_holder[0]["approved"] is True
|
||||
finally:
|
||||
os.environ.pop("HERMES_GATEWAY_SESSION", None)
|
||||
os.environ.pop("HERMES_EXEC_ASK", None)
|
||||
unregister_gateway_notify("session-A")
|
||||
unregister_gateway_notify("session-B")
|
||||
|
||||
def test_two_concurrent_sessions_route_to_own_queue_contextvar_only(self):
|
||||
"""Cross-session isolation driven by contextvars ALONE (#24100).
|
||||
|
||||
Two concurrent worker threads with DISTINCT session keys each set
|
||||
only ``set_current_session_key()`` — they deliberately never write
|
||||
``os.environ["HERMES_SESSION_KEY"]``. This proves the contextvar is
|
||||
sufficient post-fix, and would FAIL if contextvar routing regressed
|
||||
(the prior 'parallel' tests share one key and dual-set env+contextvar,
|
||||
so they cannot guard this invariant). Each session's dangerous command
|
||||
must land in its OWN gateway queue, and resolving one must not resolve
|
||||
the other.
|
||||
"""
|
||||
from tools.approval import (
|
||||
_gateway_queues,
|
||||
check_all_command_guards,
|
||||
register_gateway_notify,
|
||||
reset_current_session_key,
|
||||
resolve_gateway_approval,
|
||||
set_current_session_key,
|
||||
unregister_gateway_notify,
|
||||
)
|
||||
|
||||
# No HERMES_SESSION_KEY in os.environ at all — pure contextvar routing.
|
||||
os.environ.pop("HERMES_SESSION_KEY", None)
|
||||
os.environ["HERMES_GATEWAY_SESSION"] = "1"
|
||||
os.environ["HERMES_EXEC_ASK"] = "1"
|
||||
|
||||
register_gateway_notify("sess-A", lambda d: None)
|
||||
register_gateway_notify("sess-B", lambda d: None)
|
||||
|
||||
results = {"sess-A": None, "sess-B": None}
|
||||
|
||||
def worker(key, cmd):
|
||||
token = set_current_session_key(key)
|
||||
try:
|
||||
results[key] = check_all_command_guards(cmd, "local")
|
||||
finally:
|
||||
reset_current_session_key(token)
|
||||
|
||||
ta = threading.Thread(target=worker, args=("sess-A", "rm -rf /a-data"))
|
||||
tb = threading.Thread(target=worker, args=("sess-B", "rm -rf /b-data"))
|
||||
ta.start()
|
||||
tb.start()
|
||||
try:
|
||||
# Wait until both sessions have a pending approval in their queue.
|
||||
for _ in range(100):
|
||||
if (len(_gateway_queues.get("sess-A", [])) >= 1
|
||||
and len(_gateway_queues.get("sess-B", [])) >= 1):
|
||||
break
|
||||
time.sleep(0.05)
|
||||
|
||||
# Each command must be parked in its OWN session queue.
|
||||
qa = _gateway_queues.get("sess-A", [])
|
||||
qb = _gateway_queues.get("sess-B", [])
|
||||
assert len(qa) == 1, f"sess-A queue should hold 1, got {len(qa)}"
|
||||
assert len(qb) == 1, f"sess-B queue should hold 1, got {len(qb)}"
|
||||
|
||||
# Resolve ONLY sess-A; sess-B must stay blocked (no cross-leak).
|
||||
resolve_gateway_approval("sess-A", "once")
|
||||
ta.join(timeout=5)
|
||||
assert results["sess-A"] is not None
|
||||
assert results["sess-A"]["approved"] is True
|
||||
assert results["sess-B"] is None, "sess-B resolved by sess-A's approval (#24100)"
|
||||
assert len(_gateway_queues.get("sess-B", [])) == 1
|
||||
|
||||
# Now resolve sess-B independently.
|
||||
resolve_gateway_approval("sess-B", "once")
|
||||
tb.join(timeout=5)
|
||||
assert results["sess-B"] is not None
|
||||
assert results["sess-B"]["approved"] is True
|
||||
finally:
|
||||
resolve_gateway_approval("sess-A", "deny")
|
||||
resolve_gateway_approval("sess-B", "deny")
|
||||
ta.join(timeout=2)
|
||||
tb.join(timeout=2)
|
||||
os.environ.pop("HERMES_GATEWAY_SESSION", None)
|
||||
os.environ.pop("HERMES_EXEC_ASK", None)
|
||||
unregister_gateway_notify("sess-A")
|
||||
unregister_gateway_notify("sess-B")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue