diff --git a/tests/tools/test_shared_container_task_id.py b/tests/tools/test_shared_container_task_id.py new file mode 100644 index 0000000000..ab599fa855 --- /dev/null +++ b/tests/tools/test_shared_container_task_id.py @@ -0,0 +1,107 @@ +""" +Regression tests for the shared-container task_id mapping. + +The top-level agent and all delegate_task subagents share a single +terminal sandbox keyed by ``"default"``. ``_resolve_container_task_id`` +is the sole gatekeeper for which tool-call task_ids go to the shared +container vs. get their own isolated sandbox. RL / benchmark +environments opt in to isolation by calling +``register_task_env_overrides(task_id, {...})`` before the agent loop; +every other task_id collapses back to ``"default"``. + +If you change the collapse logic, update both the helper and these +tests -- see `hermes-agent-dev` skill, "Why do subagents get their own +containers?" section, and the Container lifecycle paragraph under +Docker Backend in ``website/docs/user-guide/configuration.md``. +""" + +import pytest + +from tools import terminal_tool + + +@pytest.fixture(autouse=True) +def _clean_overrides(): + """Ensure no stray overrides from other tests leak in.""" + before = dict(terminal_tool._task_env_overrides) + terminal_tool._task_env_overrides.clear() + yield + terminal_tool._task_env_overrides.clear() + terminal_tool._task_env_overrides.update(before) + + +def test_none_task_id_maps_to_default(): + assert terminal_tool._resolve_container_task_id(None) == "default" + + +def test_empty_task_id_maps_to_default(): + assert terminal_tool._resolve_container_task_id("") == "default" + + +def test_literal_default_stays_default(): + assert terminal_tool._resolve_container_task_id("default") == "default" + + +def test_subagent_task_id_collapses_to_default(): + # delegate_task constructs IDs like "subagent--"; these + # should share the parent's container, not spin up their own. + assert terminal_tool._resolve_container_task_id("subagent-0-deadbeef") == "default" + assert terminal_tool._resolve_container_task_id("subagent-42-cafef00d") == "default" + + +def test_arbitrary_session_id_collapses_to_default(): + # Session UUIDs or anything else without an override still collapse. + assert terminal_tool._resolve_container_task_id("sess-123e4567-e89b-12d3") == "default" + + +def test_rl_task_with_override_keeps_its_own_id(): + # RL / benchmark pattern: register a per-task image, then the task_id + # must survive ``_resolve_container_task_id`` so the rollout lands in + # its own sandbox. + terminal_tool.register_task_env_overrides( + "tb2-task-fix-git", {"docker_image": "tb2:fix-git", "cwd": "/app"} + ) + try: + assert ( + terminal_tool._resolve_container_task_id("tb2-task-fix-git") + == "tb2-task-fix-git" + ) + finally: + terminal_tool.clear_task_env_overrides("tb2-task-fix-git") + + +def test_cleared_override_collapses_again(): + terminal_tool.register_task_env_overrides("tb2-x", {"docker_image": "x:y"}) + assert terminal_tool._resolve_container_task_id("tb2-x") == "tb2-x" + terminal_tool.clear_task_env_overrides("tb2-x") + assert terminal_tool._resolve_container_task_id("tb2-x") == "default" + + +def test_get_active_env_reads_shared_container_from_subagent_id(): + """``get_active_env`` must see the shared ``"default"`` sandbox when + called with a subagent's task_id, so the agent loop's turn-budget + enforcement reads the real env (not None) during delegation.""" + sentinel = object() + terminal_tool._active_environments["default"] = sentinel + try: + assert terminal_tool.get_active_env("subagent-7-cafe") is sentinel + assert terminal_tool.get_active_env(None) is sentinel + assert terminal_tool.get_active_env("default") is sentinel + finally: + terminal_tool._active_environments.pop("default", None) + + +def test_get_active_env_honours_rl_override(): + rl_env = object() + default_env = object() + terminal_tool._active_environments["default"] = default_env + terminal_tool._active_environments["rl-42"] = rl_env + terminal_tool.register_task_env_overrides("rl-42", {"docker_image": "x"}) + try: + # With an override registered, lookup returns the task's own env, + # not the shared "default" one. + assert terminal_tool.get_active_env("rl-42") is rl_env + finally: + terminal_tool.clear_task_env_overrides("rl-42") + terminal_tool._active_environments.pop("default", None) + terminal_tool._active_environments.pop("rl-42", None) diff --git a/tools/code_execution_tool.py b/tools/code_execution_tool.py index 96e21d0cb1..db706e6a4c 100644 --- a/tools/code_execution_tool.py +++ b/tools/code_execution_tool.py @@ -440,9 +440,10 @@ def _get_or_create_env(task_id: str): _active_environments, _env_lock, _create_environment, _get_env_config, _last_activity, _start_cleanup_thread, _creation_locks, _creation_locks_lock, _task_env_overrides, + _resolve_container_task_id, ) - effective_task_id = task_id or "default" + effective_task_id = _resolve_container_task_id(task_id) # Fast path: environment already exists with _env_lock: diff --git a/tools/file_tools.py b/tools/file_tools.py index 609506c05e..2e1d3875c2 100644 --- a/tools/file_tools.py +++ b/tools/file_tools.py @@ -88,8 +88,14 @@ def _resolve_path(filepath: str, task_id: str = "default") -> Path: def _get_live_tracking_cwd(task_id: str = "default") -> str | None: """Return the task's live terminal cwd for bookkeeping when available.""" + try: + from tools.terminal_tool import _resolve_container_task_id + container_key = _resolve_container_task_id(task_id) + except Exception: + container_key = task_id + with _file_ops_lock: - cached = _file_ops_cache.get(task_id) + cached = _file_ops_cache.get(container_key) or _file_ops_cache.get(task_id) if cached is not None: live_cwd = getattr(getattr(cached, "env", None), "cwd", None) or getattr( cached, "cwd", None @@ -101,7 +107,7 @@ def _get_live_tracking_cwd(task_id: str = "default") -> str | None: from tools.terminal_tool import _active_environments, _env_lock with _env_lock: - env = _active_environments.get(task_id) + env = _active_environments.get(container_key) or _active_environments.get(task_id) live_cwd = getattr(env, "cwd", None) if env is not None else None if live_cwd: return live_cwd @@ -261,15 +267,23 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations: Thread-safe: uses the same per-task creation locks as terminal_tool to prevent duplicate sandbox creation from concurrent tool calls. + + Note: subagent task_ids are collapsed to "default" via + ``_resolve_container_task_id`` so delegate_task children share the + parent's container and its cached file_ops. RL/benchmark task_ids with + a registered env override keep their isolation. """ from tools.terminal_tool import ( _active_environments, _env_lock, _create_environment, _get_env_config, _last_activity, _start_cleanup_thread, _creation_locks, _creation_locks_lock, + _resolve_container_task_id, ) import time + task_id = _resolve_container_task_id(task_id) + # Fast path: check cache -- but also verify the underlying environment # is still alive (it may have been killed by the cleanup thread). with _file_ops_lock: diff --git a/tools/terminal_tool.py b/tools/terminal_tool.py index b0f81b8868..a2e8a21898 100644 --- a/tools/terminal_tool.py +++ b/tools/terminal_tool.py @@ -803,6 +803,31 @@ def clear_task_env_overrides(task_id: str): """ _task_env_overrides.pop(task_id, None) + +def _resolve_container_task_id(task_id: Optional[str]) -> str: + """ + Map a tool-call ``task_id`` to the container/sandbox key used by + ``_active_environments``. + + The top-level agent passes ``task_id=None`` and lands on ``"default"``. + ``delegate_task`` children pass their own subagent ID so that + file-state tracking, the active-subagents registry, and TUI events stay + distinct per child -- but we deliberately collapse that ID back to + ``"default"`` here so subagents share the parent's long-lived container + (one bash, one /workspace, one set of installed packages). + + Exception: RL / benchmark environments (TerminalBench2, HermesSweEnv, ...) + call ``register_task_env_overrides(task_id, {...})`` to request a + per-task Docker/Modal image. When an override is registered for a + task_id, we honour it by returning the task_id unchanged -- those + rollouts need their own isolated sandbox, which is the whole point of + the override. + """ + if task_id and task_id in _task_env_overrides: + return task_id + return "default" + + # Configuration from environment variables def _parse_env_var(name: str, default: str, converter=int, type_label: str = "integer"): @@ -1139,8 +1164,9 @@ def _stop_cleanup_thread(): def get_active_env(task_id: str): """Return the active BaseEnvironment for *task_id*, or None.""" + lookup = _resolve_container_task_id(task_id) with _env_lock: - return _active_environments.get(task_id) + return _active_environments.get(lookup) or _active_environments.get(task_id) def is_persistent_env(task_id: str) -> bool: @@ -1473,8 +1499,11 @@ def terminal_tool( config = _get_env_config() env_type = config["env_type"] - # Use task_id for environment isolation - effective_task_id = task_id or "default" + # Use task_id for environment isolation. By default all subagent + # task_ids collapse back to "default" so the top-level agent and + # every delegate_task child share one container; only task_ids with + # a registered env override (RL benchmarks) get isolated sandboxes. + effective_task_id = _resolve_container_task_id(task_id) # Check per-task overrides (set by environments like TerminalBench2Env) # before falling back to global env var config diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index ac48e9f884..61eed114e0 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -146,9 +146,9 @@ terminal: **Requirements:** Docker Desktop or Docker Engine installed and running. Hermes probes `$PATH` plus common macOS install locations (`/usr/local/bin/docker`, `/opt/homebrew/bin/docker`, Docker Desktop app bundle). -**Container lifecycle:** Hermes reuses a single long-lived container (`docker run -d ... sleep 2h`) for every terminal and file-tool call made by the top-level agent, across sessions, `/new`, and `/reset`, for the lifetime of the Hermes process. Commands run via `docker exec` with a login shell, so working-directory changes, installed packages, and files in `/workspace` all persist from one tool call to the next. The container is stopped and removed on Hermes shutdown (or when the idle-sweep reclaims it). +**Container lifecycle:** Hermes reuses a single long-lived container (`docker run -d ... sleep 2h`) for every terminal and file-tool call, across sessions, `/new`, `/reset`, and `delegate_task` subagents, for the lifetime of the Hermes process. Commands run via `docker exec` with a login shell, so working-directory changes, installed packages, and files in `/workspace` all persist from one tool call to the next. The container is stopped and removed on Hermes shutdown (or when the idle-sweep reclaims it). -Subagents (`delegate_task`) and RL rollouts get their own isolated containers keyed by `task_id` — only the top-level agent shares the `default` container. +Parallel subagents spawned via `delegate_task(tasks=[...])` share this one container — concurrent `cd`, env mutations, and writes to the same path will collide. If a subagent needs an isolated sandbox, it must register a per-task image override via `register_task_env_overrides()`, which RL and benchmark environments (TerminalBench2, HermesSweEnv, etc.) do automatically for their per-task Docker images. **Security hardening:** - `--cap-drop ALL` with only `DAC_OVERRIDE`, `CHOWN`, `FOWNER` added back