fix(terminal): read cwd overrides under raw task_id after container collapse

PR #41822 collapsed CWD-only overrides to the shared 'default' container via _resolve_container_task_id, but three call sites kept routing the *env/override lookup* through that collapsed id: - the foreground exec path read _task_env_overrides[effective_task_id], yet register_task_env_overrides writes under the raw task_id, so a CWD-only override's cwd was silently dropped (env spun up at the wrong root, exit 126); - the get-or-create env lookup keyed solely on effective_task_id, so an env cached under the raw task_id was missed and duplicated; - register_task_env_overrides synced the new cwd onto the env under the collapsed id, missing a live env cached under the raw task_id. Container *identity* still collapses to 'default' (sharing preserved); only the per-session env/override *lookup* now prefers the raw task_id and falls back to the collapsed id. Fixes the 3 regressions in test_terminal_task_cwd.py left red by #41822.
2026-06-14 09:11:54 +00:00 · 2026-06-07 23:36:16 -07:00 · 2026-06-07 23:36:16 -07:00 · 329c33dac3
commit 329c33dac3
parent d759c13c09
1 changed files with 39 additions and 9 deletions
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@ -972,9 +972,14 @@ def register_task_env_overrides(task_id: str, overrides: Dict[str, Any]):
    # while letting an explicit ACP cwd change win, as the client expects.
    new_cwd = overrides.get("cwd")
    if isinstance(new_cwd, str) and new_cwd.strip():
+        # The live env is cached under the raw task_id for per-session surfaces
+        # (ACP/gateway/dashboard) and under the collapsed container id for
+        # isolation-keyed rollouts. Try the raw id first, then the container id,
+        # so a CWD-only override (which collapses to "default") still finds and
+        # updates the originating session's env.
        container_id = _resolve_container_task_id(task_id)
        with _env_lock:
-            env = _active_environments.get(container_id)
+            env = _active_environments.get(task_id) or _active_environments.get(container_id)
        if env is not None and getattr(env, "cwd", None) is not None:
            env.cwd = new_cwd

@ -1848,8 +1853,20 @@ def terminal_tool(
        effective_task_id = _resolve_container_task_id(task_id)

        # Check per-task overrides (set by environments like TerminalBench2Env)
-        # before falling back to global env var config
-        overrides = _task_env_overrides.get(effective_task_id, {})
+        # before falling back to global env var config.
+        #
+        # Overrides are keyed by the *raw* task_id (that's the key
+        # ``register_task_env_overrides`` writes under), NOT by the collapsed
+        # container id. A CWD-only override collapses ``effective_task_id`` to
+        # ``"default"`` for container sharing, but its cwd must still be read
+        # back here under the originating task_id, or the override is silently
+        # dropped. Fall back to the collapsed id so isolation-keyed RL/benchmark
+        # overrides (registered under an id that equals their container id) keep
+        # resolving as before.
+        overrides = (
+            (_task_env_overrides.get(task_id) if task_id else None)
+            or _task_env_overrides.get(effective_task_id, {})
+        )
        
        # Select image based on env type, with per-task override support
        if env_type == "docker":
@ -1898,9 +1915,18 @@ def terminal_tool(
        # task_id wait for the first one to finish creating the sandbox,
        # instead of each creating their own (wasting Modal resources).
        with _env_lock:
-            if effective_task_id in _active_environments:
-                _last_activity[effective_task_id] = time.time()
-                env = _active_environments[effective_task_id]
+            # Prefer the collapsed container id, but fall back to an env cached
+            # under the raw task_id. Per-session surfaces (ACP/gateway/dashboard)
+            # with a CWD-only override collapse to "default" for container
+            # sharing, yet an env may already be cached under the originating
+            # task_id; honor it instead of spawning a duplicate.
+            _existing_key = (
+                effective_task_id if effective_task_id in _active_environments
+                else (task_id if task_id and task_id in _active_environments else None)
+            )
+            if _existing_key is not None:
+                _last_activity[_existing_key] = time.time()
+                env = _active_environments[_existing_key]
                needs_creation = False
            else:
                needs_creation = True
@ -1915,9 +1941,13 @@ def terminal_tool(
            with task_lock:
                # Double-check after acquiring the per-task lock
                with _env_lock:
-                    if effective_task_id in _active_environments:
-                        _last_activity[effective_task_id] = time.time()
-                        env = _active_environments[effective_task_id]
+                    _existing_key = (
+                        effective_task_id if effective_task_id in _active_environments
+                        else (task_id if task_id and task_id in _active_environments else None)
+                    )
+                    if _existing_key is not None:
+                        _last_activity[_existing_key] = time.time()
+                        env = _active_environments[_existing_key]
                        needs_creation = False

                if needs_creation: