From b0f44d3fad96f449a3389e08610f146c099d19b8 Mon Sep 17 00:00:00 2001 From: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> Date: Sat, 27 Jun 2026 04:00:43 +0530 Subject: [PATCH] fix(gateway): remove process-global HERMES_SESSION_KEY write that misroutes approval prompts across concurrent sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GatewayRunner._run_agent's run_sync() wrote the per-turn session key to the process-global os.environ["HERMES_SESSION_KEY"]. Because os.environ is shared across the whole process, concurrent gateway sessions (e.g. two Discord threads) clobbered each other's value. A tool worker thread whose approval contextvar was unset then fell back to os.environ via get_current_session_key() and read whichever session ran run_sync() last — routing "Command Approval Required" prompts to the wrong thread. Session routing is already concurrency-safe via contextvars: - gateway/session_context.py _SESSION_KEY (set in set_session_vars) - tools/approval.py _approval_session_key (set via set_current_session_key right before the agent runs, inherited by tool worker threads) The only non-test readers of HERMES_SESSION_KEY (tools/approval.py, tools/terminal_tool.py, tools/kanban_tools.py) all prefer the contextvar with os.environ as a mere fallback. CLI/cron/TUI set their own os.environ via separate export paths (e.g. the TUI parent exporting it into the agent subprocess), so removing this in-process write does not affect them. Adds regression tests asserting the resolver prefers the contextvar and does not leak a concurrent session's cleared/clobbered os.environ value. Closes #24100 Co-authored-by: Yosapol Jitrak --- gateway/run.py | 19 +- tests/gateway/test_approve_deny_commands.py | 226 ++++++++++++++++++++ 2 files changed, 241 insertions(+), 4 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index a84d3ca6cf7..441f83752f6 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -15864,10 +15864,21 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew # `_resolve_turn_agent_config(message, …)`. nonlocal message - # session_key is now set via contextvars in _set_session_env() - # (concurrency-safe). Keep os.environ as fallback for CLI/cron. - os.environ["HERMES_SESSION_KEY"] = session_key or "" - + # session_key is propagated via contextvars in _set_session_env() + # (_SESSION_KEY) and via set_current_session_key() (_approval_session_key) + # below — both concurrency-safe and inherited by tool worker threads. + # We deliberately do NOT write os.environ["HERMES_SESSION_KEY"] here: + # os.environ is process-global, so concurrent gateway sessions (e.g. + # two Discord threads) would clobber each other's value, and a tool + # thread whose contextvar is unset would fall back to os.environ and + # read the wrong session key — misrouting command-approval prompts to + # the wrong thread (#24100). The non-gateway surfaces don't depend on + # this write: CLI and cron bind the session via contextvars + # (set_current_session_key / session context), and only the TUI + # slash-worker *subprocess* exports HERMES_SESSION_KEY (from its own + # --session-key argv, a separate process) — so removing this in-process + # gateway write does not affect any of them. + # Map platform enum to the platform hint key the agent understands. # Platform.LOCAL ("local") maps to "cli"; others pass through as-is. platform_key = "cli" if source.platform == Platform.LOCAL else source.platform.value diff --git a/tests/gateway/test_approve_deny_commands.py b/tests/gateway/test_approve_deny_commands.py index 1c996b2baee..6d50a31be2e 100644 --- a/tests/gateway/test_approve_deny_commands.py +++ b/tests/gateway/test_approve_deny_commands.py @@ -647,3 +647,229 @@ class TestFallbackNoCallback: assert result["approved"] is False assert result.get("status") == "pending_approval" assert result.get("approval_pending") is True + + +# ------------------------------------------------------------------ +# Regression: cross-session approval routing isolation (#24100) +# ------------------------------------------------------------------ + + +class TestCrossSessionApprovalIsolation: + """Regression for #24100. + + The gateway used to write the per-turn session key to the + process-global ``os.environ["HERMES_SESSION_KEY"]`` inside + ``GatewayRunner._run_agent``. Because ``os.environ`` is process-global, + a concurrent gateway session (e.g. a second Discord thread) clobbered + the value, and a tool worker thread whose approval contextvar was unset + fell back to ``os.environ`` and read the *wrong* session key — routing + the "Command Approval Required" prompt to the wrong thread. + + The fix removes that ``os.environ`` write; routing is driven solely by + the ``_approval_session_key`` contextvar. These tests assert that a + worker thread carrying session A's contextvar resolves to session A + even when ``os.environ`` has been clobbered to session B. + """ + + def setup_method(self): + _clear_approval_state() + os.environ.pop("HERMES_SESSION_KEY", None) + + def teardown_method(self): + os.environ.pop("HERMES_SESSION_KEY", None) + + def test_contextvar_wins_over_clobbered_environ(self): + """get_current_session_key honors the contextvar, not stale env.""" + from tools.approval import ( + get_current_session_key, + reset_current_session_key, + set_current_session_key, + ) + + # Simulate a concurrent session B having written process-global env + # last (the "last writer wins" clobber that caused #24100). + os.environ["HERMES_SESSION_KEY"] = "session-B" + + token = set_current_session_key("session-A") + try: + # The worker running under session A's contextvar must resolve + # to session A, NOT the env-pinned session B. + assert get_current_session_key() == "session-A" + finally: + reset_current_session_key(token) + + def test_unset_contextvar_does_not_fall_back_to_clobbered_environ(self): + """The resolver must not leak a concurrent session's clobbered + ``os.environ`` value once the session-context vars are cleared (#24100). + + This exercises the resolver contract directly (not a separate worker + thread): while the session-context var holds a key, resolution returns + it; after ``clear_session_vars`` marks the vars explicitly cleared, the + ``os.environ`` fallback is suppressed and resolution must NOT return the + stale, process-global value a concurrent session wrote. Under the buggy + gateway that value would be another live session's key; here we assert + the resolver never surfaces it. + """ + from gateway.session_context import clear_session_vars, set_session_vars + from tools.approval import get_current_session_key + + # Simulate: concurrent session B was the last to clobber os.environ + # under the OLD buggy gateway. With the fix this write never happens, + # but we set it here to prove the resolver no longer trusts it once + # the session-context contextvars are explicitly cleared (as the + # gateway does in its finally block via clear_session_vars()). + os.environ["HERMES_SESSION_KEY"] = "session-B-stale" + + # The gateway explicitly clears its session contextvars at turn end; + # clear_session_vars sets them to "" to *suppress* the os.environ + # fallback. A bare worker thread therefore must NOT see session-B. + tokens = set_session_vars(session_key="session-A") + try: + assert get_current_session_key() == "session-A" + finally: + clear_session_vars(tokens) + + # After clearing, resolution returns the explicit empty/default — + # never the clobbered process-global value from session B. + assert get_current_session_key() != "session-B-stale", ( + "resolver leaked a concurrent session's clobbered os.environ " + "value — #24100 regression" + ) + + def test_approval_prompt_routes_to_originating_session(self): + """A dangerous command in session A's worker thread notifies + session A's callback, even though os.environ points at session B.""" + from tools.approval import ( + check_all_command_guards, + register_gateway_notify, + reset_current_session_key, + resolve_gateway_approval, + set_current_session_key, + unregister_gateway_notify, + ) + notified_a = [] + notified_b = [] + register_gateway_notify("session-A", lambda d: notified_a.append(d)) + register_gateway_notify("session-B", lambda d: notified_b.append(d)) + + # Concurrent session B clobbered the process-global env var last. + os.environ["HERMES_SESSION_KEY"] = "session-B" + os.environ["HERMES_GATEWAY_SESSION"] = "1" + os.environ["HERMES_EXEC_ASK"] = "1" + + result_holder = [None] + + def worker_a(): + # This worker belongs to session A — only its contextvar is set; + # it deliberately does NOT touch os.environ (mirroring the fixed + # gateway, which no longer writes HERMES_SESSION_KEY). + token = set_current_session_key("session-A") + try: + result_holder[0] = check_all_command_guards( + "rm -rf /important", "local" + ) + finally: + reset_current_session_key(token) + + t = threading.Thread(target=worker_a) + t.start() + try: + for _ in range(50): + if notified_a or notified_b: + break + time.sleep(0.05) + + # The prompt must land in session A (the originator), never B. + assert len(notified_a) == 1, "approval prompt did not route to session A" + assert len(notified_b) == 0, "approval prompt leaked to session B (#24100)" + assert "rm -rf /important" in notified_a[0]["command"] + + resolve_gateway_approval("session-A", "once") + t.join(timeout=5) + assert result_holder[0] is not None + assert result_holder[0]["approved"] is True + finally: + os.environ.pop("HERMES_GATEWAY_SESSION", None) + os.environ.pop("HERMES_EXEC_ASK", None) + unregister_gateway_notify("session-A") + unregister_gateway_notify("session-B") + + def test_two_concurrent_sessions_route_to_own_queue_contextvar_only(self): + """Cross-session isolation driven by contextvars ALONE (#24100). + + Two concurrent worker threads with DISTINCT session keys each set + only ``set_current_session_key()`` — they deliberately never write + ``os.environ["HERMES_SESSION_KEY"]``. This proves the contextvar is + sufficient post-fix, and would FAIL if contextvar routing regressed + (the prior 'parallel' tests share one key and dual-set env+contextvar, + so they cannot guard this invariant). Each session's dangerous command + must land in its OWN gateway queue, and resolving one must not resolve + the other. + """ + from tools.approval import ( + _gateway_queues, + check_all_command_guards, + register_gateway_notify, + reset_current_session_key, + resolve_gateway_approval, + set_current_session_key, + unregister_gateway_notify, + ) + + # No HERMES_SESSION_KEY in os.environ at all — pure contextvar routing. + os.environ.pop("HERMES_SESSION_KEY", None) + os.environ["HERMES_GATEWAY_SESSION"] = "1" + os.environ["HERMES_EXEC_ASK"] = "1" + + register_gateway_notify("sess-A", lambda d: None) + register_gateway_notify("sess-B", lambda d: None) + + results = {"sess-A": None, "sess-B": None} + + def worker(key, cmd): + token = set_current_session_key(key) + try: + results[key] = check_all_command_guards(cmd, "local") + finally: + reset_current_session_key(token) + + ta = threading.Thread(target=worker, args=("sess-A", "rm -rf /a-data")) + tb = threading.Thread(target=worker, args=("sess-B", "rm -rf /b-data")) + ta.start() + tb.start() + try: + # Wait until both sessions have a pending approval in their queue. + for _ in range(100): + if (len(_gateway_queues.get("sess-A", [])) >= 1 + and len(_gateway_queues.get("sess-B", [])) >= 1): + break + time.sleep(0.05) + + # Each command must be parked in its OWN session queue. + qa = _gateway_queues.get("sess-A", []) + qb = _gateway_queues.get("sess-B", []) + assert len(qa) == 1, f"sess-A queue should hold 1, got {len(qa)}" + assert len(qb) == 1, f"sess-B queue should hold 1, got {len(qb)}" + + # Resolve ONLY sess-A; sess-B must stay blocked (no cross-leak). + resolve_gateway_approval("sess-A", "once") + ta.join(timeout=5) + assert results["sess-A"] is not None + assert results["sess-A"]["approved"] is True + assert results["sess-B"] is None, "sess-B resolved by sess-A's approval (#24100)" + assert len(_gateway_queues.get("sess-B", [])) == 1 + + # Now resolve sess-B independently. + resolve_gateway_approval("sess-B", "once") + tb.join(timeout=5) + assert results["sess-B"] is not None + assert results["sess-B"]["approved"] is True + finally: + resolve_gateway_approval("sess-A", "deny") + resolve_gateway_approval("sess-B", "deny") + ta.join(timeout=2) + tb.join(timeout=2) + os.environ.pop("HERMES_GATEWAY_SESSION", None) + os.environ.pop("HERMES_EXEC_ASK", None) + unregister_gateway_notify("sess-A") + unregister_gateway_notify("sess-B")