feat(terminal): collapse subagent task_ids to shared container (#16177)

Before: delegate_task children each allocated their own terminal sandbox keyed by child task_id. Starting extra containers (or Modal sandboxes / Daytona workspaces) is expensive, and the subagent's work is invisible to the parent — files written by the child in its container don't exist in the parent's when the subagent returns. After: a single `_resolve_container_task_id` helper maps any tool-call task_id to "default" UNLESS an env override is registered for it. The parent agent and all delegate_task children therefore share one long-lived sandbox — installed packages, cwd, /workspace files, and /tmp scratch carry over freely between them. RL and benchmark environments (TerminalBench2, HermesSweEnv, ...) opt in to isolation via `register_task_env_overrides(task_id, {...})`; those task_ids survive the collapse and get their own sandbox, preserving the per-task Docker image behavior these benchmarks rely on. file_state / active-subagents registry / TUI events still key off the original child task_id, so the 'subagent wrote a file the parent read' warning and UI per-subagent panels keep working. Tradeoff: parallel delegate_task children (tasks=[...]) now share one bash/container. Concurrent cd, env-var mutations, and writes to the same path will collide. If that bites a specific workflow, the subagent can opt back into isolation via register_task_env_overrides. Applied at four lookup sites: - tools/terminal_tool.py terminal_tool() and get_active_env() - tools/file_tools.py _get_file_ops() and _get_live_tracking_cwd() - tools/code_execution_tool.py _get_or_create_environment() Docs: website/docs/user-guide/configuration.md updated to reflect the shared-container reality and document the RL/benchmark carve-out. Tests: tests/tools/test_shared_container_task_id.py (9 cases).
2026-04-29 01:31:41 +00:00 · 2026-04-26 11:55:02 -07:00 · 2026-04-26 11:55:02 -07:00 · 5b2c59559a
commit 5b2c59559a
parent 087e74d4d7
5 changed files with 159 additions and 8 deletions
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@ -803,6 +803,31 @@ def clear_task_env_overrides(task_id: str):
    """
    _task_env_overrides.pop(task_id, None)

+
+def _resolve_container_task_id(task_id: Optional[str]) -> str:
+    """
+    Map a tool-call ``task_id`` to the container/sandbox key used by
+    ``_active_environments``.
+
+    The top-level agent passes ``task_id=None`` and lands on ``"default"``.
+    ``delegate_task`` children pass their own subagent ID so that
+    file-state tracking, the active-subagents registry, and TUI events stay
+    distinct per child -- but we deliberately collapse that ID back to
+    ``"default"`` here so subagents share the parent's long-lived container
+    (one bash, one /workspace, one set of installed packages).
+
+    Exception: RL / benchmark environments (TerminalBench2, HermesSweEnv, ...)
+    call ``register_task_env_overrides(task_id, {...})`` to request a
+    per-task Docker/Modal image. When an override is registered for a
+    task_id, we honour it by returning the task_id unchanged -- those
+    rollouts need their own isolated sandbox, which is the whole point of
+    the override.
+    """
+    if task_id and task_id in _task_env_overrides:
+        return task_id
+    return "default"
+
+
 # Configuration from environment variables

 def _parse_env_var(name: str, default: str, converter=int, type_label: str = "integer"):
@ -1139,8 +1164,9 @@ def _stop_cleanup_thread():

 def get_active_env(task_id: str):
    """Return the active BaseEnvironment for *task_id*, or None."""
+    lookup = _resolve_container_task_id(task_id)
    with _env_lock:
-        return _active_environments.get(task_id)
+        return _active_environments.get(lookup) or _active_environments.get(task_id)


 def is_persistent_env(task_id: str) -> bool:
@ -1473,8 +1499,11 @@ def terminal_tool(
        config = _get_env_config()
        env_type = config["env_type"]

-        # Use task_id for environment isolation
-        effective_task_id = task_id or "default"
+        # Use task_id for environment isolation. By default all subagent
+        # task_ids collapse back to "default" so the top-level agent and
+        # every delegate_task child share one container; only task_ids with
+        # a registered env override (RL benchmarks) get isolated sandboxes.
+        effective_task_id = _resolve_container_task_id(task_id)

        # Check per-task overrides (set by environments like TerminalBench2Env)
        # before falling back to global env var config