mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-28 01:21:43 +00:00
feat(terminal): collapse subagent task_ids to shared container (#16177)
Before: delegate_task children each allocated their own terminal
sandbox keyed by child task_id. Starting extra containers (or Modal
sandboxes / Daytona workspaces) is expensive, and the subagent's work
is invisible to the parent — files written by the child in its
container don't exist in the parent's when the subagent returns.
After: a single `_resolve_container_task_id` helper maps any
tool-call task_id to "default" UNLESS an env override is registered
for it. The parent agent and all delegate_task children therefore
share one long-lived sandbox — installed packages, cwd, /workspace
files, and /tmp scratch carry over freely between them.
RL and benchmark environments (TerminalBench2, HermesSweEnv, ...)
opt in to isolation via `register_task_env_overrides(task_id, {...})`;
those task_ids survive the collapse and get their own sandbox,
preserving the per-task Docker image behavior these benchmarks rely on.
file_state / active-subagents registry / TUI events still key off the
original child task_id, so the 'subagent wrote a file the parent read'
warning and UI per-subagent panels keep working.
Tradeoff: parallel delegate_task children (tasks=[...]) now share one
bash/container. Concurrent cd, env-var mutations, and writes to the
same path will collide. If that bites a specific workflow, the
subagent can opt back into isolation via register_task_env_overrides.
Applied at five lookup sites across three files:
- tools/terminal_tool.py terminal_tool() and get_active_env()
- tools/file_tools.py _get_file_ops() and _get_live_tracking_cwd()
- tools/code_execution_tool.py _get_or_create_environment()
Docs: website/docs/user-guide/configuration.md updated to reflect the
shared-container reality and document the RL/benchmark carve-out.
Tests: tests/tools/test_shared_container_task_id.py (9 cases).
This commit is contained in:
parent
087e74d4d7
commit
5b2c59559a
5 changed files with 159 additions and 8 deletions
107
tests/tools/test_shared_container_task_id.py
Normal file
107
tests/tools/test_shared_container_task_id.py
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
"""
|
||||
Regression tests for the shared-container task_id mapping.
|
||||
|
||||
The top-level agent and all delegate_task subagents share a single
|
||||
terminal sandbox keyed by ``"default"``. ``_resolve_container_task_id``
|
||||
is the sole gatekeeper for which tool-call task_ids go to the shared
|
||||
container vs. get their own isolated sandbox. RL / benchmark
|
||||
environments opt in to isolation by calling
|
||||
``register_task_env_overrides(task_id, {...})`` before the agent loop;
|
||||
every other task_id collapses back to ``"default"``.
|
||||
|
||||
If you change the collapse logic, update both the helper and these
|
||||
tests -- see `hermes-agent-dev` skill, "Why do subagents get their own
|
||||
containers?" section, and the Container lifecycle paragraph under
|
||||
Docker Backend in ``website/docs/user-guide/configuration.md``.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from tools import terminal_tool
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _clean_overrides():
    """Run each test against an empty task-env override registry.

    The contents of ``terminal_tool._task_env_overrides`` are snapshotted
    and emptied before the test body runs. Afterwards the registry is
    emptied again and refilled from the snapshot, so overrides registered
    elsewhere neither leak into these tests nor get lost because of them.
    """
    # Shallow copy: only the key -> overrides mapping needs to survive.
    snapshot = {**terminal_tool._task_env_overrides}
    terminal_tool._task_env_overrides.clear()

    yield

    # Drop whatever the test registered, then reinstate the prior state.
    terminal_tool._task_env_overrides.clear()
    terminal_tool._task_env_overrides.update(snapshot)
|
||||
|
||||
|
||||
def test_none_task_id_maps_to_default():
    """A missing task_id (``None``) resolves to the shared ``"default"`` key."""
    resolved = terminal_tool._resolve_container_task_id(None)
    assert resolved == "default"
|
||||
|
||||
|
||||
def test_empty_task_id_maps_to_default():
    """An empty-string task_id resolves to the shared ``"default"`` key."""
    resolved = terminal_tool._resolve_container_task_id("")
    assert resolved == "default"
|
||||
|
||||
|
||||
def test_literal_default_stays_default():
    """Passing ``"default"`` explicitly is returned unchanged."""
    resolved = terminal_tool._resolve_container_task_id("default")
    assert resolved == "default"
|
||||
|
||||
|
||||
def test_subagent_task_id_collapses_to_default():
    """Subagent task_ids without an override resolve to ``"default"``.

    delegate_task builds IDs shaped like ``subagent-<N>-<uuid_hex>``; such
    children are expected to reuse the parent's container rather than
    start one of their own.
    """
    for child_id in ("subagent-0-deadbeef", "subagent-42-cafef00d"):
        assert terminal_tool._resolve_container_task_id(child_id) == "default"
|
||||
|
||||
|
||||
def test_arbitrary_session_id_collapses_to_default():
    """Any other ID with no registered override also resolves to ``"default"``."""
    # A session-UUID-looking value stands in for "anything else".
    session_id = "sess-123e4567-e89b-12d3"
    assert terminal_tool._resolve_container_task_id(session_id) == "default"
|
||||
|
||||
|
||||
def test_rl_task_with_override_keeps_its_own_id():
    """A task_id with a registered env override is returned unchanged.

    This mirrors the RL / benchmark pattern: a per-task image is registered
    first, and the task_id must then survive ``_resolve_container_task_id``
    so the rollout lands in its own sandbox.
    """
    task_id = "tb2-task-fix-git"
    overrides = {"docker_image": "tb2:fix-git", "cwd": "/app"}

    terminal_tool.register_task_env_overrides(task_id, overrides)
    try:
        resolved = terminal_tool._resolve_container_task_id(task_id)
        assert resolved == task_id
    finally:
        # Unregister explicitly, whether or not the assertion held.
        terminal_tool.clear_task_env_overrides(task_id)
|
||||
|
||||
|
||||
def test_cleared_override_collapses_again():
    """Clearing an override sends the task_id back to ``"default"``."""
    task_id = "tb2-x"
    resolve = terminal_tool._resolve_container_task_id

    # While the override is registered the ID is returned unchanged...
    terminal_tool.register_task_env_overrides(task_id, {"docker_image": "x:y"})
    assert resolve(task_id) == "tb2-x"

    # ...and once it is cleared the ID resolves to the shared key again.
    terminal_tool.clear_task_env_overrides(task_id)
    assert resolve(task_id) == "default"
|
||||
|
||||
|
||||
def test_get_active_env_reads_shared_container_from_subagent_id():
    """``get_active_env`` must see the shared ``"default"`` sandbox when
    called with a subagent's task_id, so the agent loop's turn-budget
    enforcement reads the real env (not None) during delegation.

    A placeholder object is installed under ``"default"`` in
    ``terminal_tool._active_environments`` for the duration of the test.
    Any entry that was already stored under that key is put back
    afterwards instead of being discarded.
    """
    sentinel = object()
    absent = object()  # marks "no entry existed before the test"

    # pop-with-default both snapshots and removes the prior entry.
    previous = terminal_tool._active_environments.pop("default", absent)
    terminal_tool._active_environments["default"] = sentinel
    try:
        assert terminal_tool.get_active_env("subagent-7-cafe") is sentinel
        assert terminal_tool.get_active_env(None) is sentinel
        assert terminal_tool.get_active_env("default") is sentinel
    finally:
        terminal_tool._active_environments.pop("default", None)
        if previous is not absent:
            # Reinstate the pre-existing environment rather than dropping it.
            terminal_tool._active_environments["default"] = previous
|
||||
|
||||
|
||||
def test_get_active_env_honours_rl_override():
    """With an override registered, ``get_active_env`` returns the task's
    own environment rather than the shared ``"default"`` one.

    Placeholder objects are installed under ``"default"`` and ``"rl-42"``
    in ``terminal_tool._active_environments``. Entries that already
    existed under those keys are put back afterwards, and the override is
    cleared even when the assertion fails.
    """
    rl_env = object()
    default_env = object()
    absent = object()  # marks "no entry existed before the test"

    # pop-with-default both snapshots and removes each prior entry.
    saved = {
        key: terminal_tool._active_environments.pop(key, absent)
        for key in ("default", "rl-42")
    }
    try:
        terminal_tool._active_environments["default"] = default_env
        terminal_tool._active_environments["rl-42"] = rl_env
        terminal_tool.register_task_env_overrides("rl-42", {"docker_image": "x"})
        try:
            # With an override registered, lookup returns the task's own
            # env, not the shared "default" one.
            assert terminal_tool.get_active_env("rl-42") is rl_env
        finally:
            terminal_tool.clear_task_env_overrides("rl-42")
    finally:
        # Runs even if registration itself raised, so nothing leaks.
        for key, previous in saved.items():
            terminal_tool._active_environments.pop(key, None)
            if previous is not absent:
                terminal_tool._active_environments[key] = previous
|
||||
Loading…
Add table
Add a link
Reference in a new issue