From 98d945f6de1815ca0d7cc9772c0986291c617366 Mon Sep 17 00:00:00 2001 From: Shannon Sands Date: Tue, 10 Feb 2026 02:26:31 +0000 Subject: [PATCH] Add sandbox pool support to HermesAgentBaseEnv Added directly to HermesAgentBaseEnv (no subclass needed): Config fields: - tool_pool_mode: 'default' (terminal tool), 'nomad', or 'modal' - Full Nomad settings: nomad_address, sandbox_job_id, slots_per_container, etc. - Full Modal settings: modal_image, modal_gpu, modal_slots_per_sandbox, etc. - Shared: allow_network, require_sandbox, purge_job_on_start/shutdown Methods: - _start_sandbox_backend() / _stop_sandbox_backend() - lifecycle - setup_trajectory_workspace() - optional hook for workspace prep - verify_and_score_trajectory() - optional hook for in-sandbox verification - env_manager() / process_manager() - lifecycle cleanup When tool_pool_mode='default': everything works as before (terminal tool) When tool_pool_mode='nomad'/'modal': activates sandbox pool from atropos/backends/ --- environments/hermes_base_env.py | 153 ++++++++++++++++++++++++++++++++ memory-bank/activeContext.md | 132 +++++++++------------------ 2 files changed, 197 insertions(+), 88 deletions(-) diff --git a/environments/hermes_base_env.py b/environments/hermes_base_env.py index 861d88af918..3aeb7ef7b15 100644 --- a/environments/hermes_base_env.py +++ b/environments/hermes_base_env.py @@ -140,6 +140,48 @@ class HermesAgentEnvConfig(BaseEnvConfig): "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.", ) + # --- Sandbox pool mode (optional, for scaled environments) --- + tool_pool_mode: str = Field( + default="default", + description="Tool execution mode: 'default' (terminal tool per task_id), " + "'nomad' (slot pool via Nomad/Docker/Singularity), or 'modal' (Modal sandbox pool).", + ) + + # Sandbox pool: shared settings + allow_network: bool = Field(default=True, description="Whether sandbox bash commands may access the network.") + require_sandbox: bool = Field(default=False, 
description="Fail closed if bubblewrap is unavailable.") + purge_job_on_start: bool = Field(default=False, description="Purge existing sandbox job on startup.") + purge_job_on_shutdown: bool = Field(default=True, description="Purge sandbox job on shutdown.") + acquire_timeout_s: float = Field(default=30.0, description="Slot acquisition timeout (seconds).") + + # Sandbox pool: Nomad settings + nomad_address: str = Field(default="http://localhost:4646", description="Nomad API address.") + sandbox_job_id: str = Field(default="atropos-sandbox", description="Nomad job id for sandbox containers.") + sandbox_image: str = Field(default="atropos-sandbox:local", description="Docker image for sandbox containers.") + slots_per_container: int = Field(default=10, description="Nomad: slots per container.") + min_containers: int = Field(default=1, description="Nomad: minimum containers.") + max_containers: int = Field(default=10, description="Nomad: maximum containers.") + privileged: bool = Field(default=False, description="Nomad: run container privileged.") + driver: str = Field(default="docker", description="Nomad task driver: 'docker' or 'singularity'.") + singularity_image: Optional[str] = Field(default=None, description="Path to .sif file for Singularity driver.") + + # Sandbox pool: Modal settings + modal_app_name: str = Field(default="atropos-sandbox", description="Modal app name prefix.") + modal_image: str = Field(default="python:3.11", description="Modal: container image.") + modal_gpu: Optional[str] = Field(default=None, description="Modal: GPU type (None, 'T4', 'A10G', 'A100', 'H100').") + modal_cpu: float = Field(default=1.0, description="Modal: CPU cores.") + modal_memory: int = Field(default=2048, description="Modal: memory in MB.") + modal_slots_per_sandbox: int = Field(default=10, description="Modal: slots per sandbox.") + modal_min_sandboxes: int = Field(default=1, description="Modal: minimum sandboxes.") + modal_max_sandboxes: int = Field(default=5, 
description="Modal: maximum sandboxes.") + modal_idle_timeout: int = Field(default=120, description="Modal: idle timeout (seconds).") + modal_max_lifetime: int = Field(default=3600, description="Modal: max sandbox lifetime (seconds).") + modal_acquire_timeout: float = Field(default=60.0, description="Modal: slot acquisition timeout (seconds).") + modal_execution_timeout: float = Field(default=30.0, description="Modal: command execution timeout (seconds).") + modal_secrets: str = Field(default="", description="Modal: comma-separated Modal Secret names.") + modal_env_vars: str = Field(default="", description="Modal: semicolon-separated KEY=VALUE pairs.") + modal_workspace_base: str = Field(default="/data", description="Modal: workspace base directory.") + class HermesAgentBaseEnv(BaseEnv): """ @@ -186,6 +228,9 @@ class HermesAgentBaseEnv(BaseEnv): # Tool error tracking for wandb logging self._tool_error_buffer: List[Dict[str, Any]] = [] + # Sandbox pool backend (only used when tool_pool_mode != "default") + self._sandbox_backend = None + # ========================================================================= # Toolset resolution (per-group) # ========================================================================= @@ -242,6 +287,114 @@ class HermesAgentBaseEnv(BaseEnv): from atroposlib.envs.server_handling.openai_server import OpenAIServer return not isinstance(server, OpenAIServer) + # ========================================================================= + # Sandbox pool backend (tool_pool_mode != "default") + # ========================================================================= + + async def _start_sandbox_backend(self) -> None: + """Start the sandbox pool backend if tool_pool_mode is not 'default'.""" + if self.config.tool_pool_mode == "default": + return + + from atropos.backends import create_tool_backend + logger.info("Starting sandbox backend (mode=%s)", self.config.tool_pool_mode) + self._sandbox_backend = create_tool_backend(self.config) + 
await self._sandbox_backend.start() + logger.info("Sandbox backend started") + + async def _stop_sandbox_backend(self) -> None: + """Stop the sandbox pool backend.""" + if self._sandbox_backend is not None: + logger.info("Stopping sandbox backend") + await self._sandbox_backend.stop( + purge=bool(self.config.purge_job_on_shutdown) + ) + self._sandbox_backend = None + + # ========================================================================= + # Optional hooks for sandbox environments + # ========================================================================= + + async def setup_trajectory_workspace( + self, + item: Item, + *, + trajectory_id: str, + exec_tool, + ) -> Dict[str, Any]: + """ + Optional hook: prepare the sandbox workspace before the agent starts. + + Override in subclasses for environments that need workspace setup + (e.g., git clone, worktree creation, dependency installation). + + Args: + item: The dataset item being rolled out + trajectory_id: Unique ID for this trajectory + exec_tool: Callable to execute tool calls in the sandbox + + Returns: + Dict of workspace metadata (passed to verify_and_score_trajectory) + """ + return {} + + async def verify_and_score_trajectory( + self, + item: Item, + result: AgentResult, + *, + trajectory_id: str, + exec_tool, + workspace_meta: Optional[Dict[str, Any]] = None, + ) -> Tuple[float, Dict[str, Any]]: + """ + Optional hook: run in-sandbox verification before scoring. + + Override in subclasses for environments that need to verify results + inside the sandbox (e.g., run pytest, check file contents). + + Default: calls compute_reward() with a ToolContext; exec_tool and workspace_meta are unused. 
+ + Args: + item: The dataset item + result: The agent's rollout result + trajectory_id: Unique ID for this trajectory + exec_tool: Callable to execute tool calls in the sandbox + workspace_meta: Metadata from setup_trajectory_workspace + + Returns: + Tuple of (reward, metadata_dict); reward is 0.0 if compute_reward raises + """ + ctx = ToolContext(trajectory_id) + try: + reward = await self.compute_reward(item, result, ctx) + except Exception as e: + logger.exception("compute_reward failed: %s", e) + reward = 0.0 + finally: + ctx.cleanup() + return reward, {} + + # ========================================================================= + # Lifecycle hooks for env_manager/process_manager cleanup + # ========================================================================= + + async def env_manager(self): + """Start sandbox backend, run env, then clean up (also when startup fails).""" + try: + await self._start_sandbox_backend() + return await super().env_manager() + finally: + await self._stop_sandbox_backend() + + async def process_manager(self): + """Start sandbox backend, run process, then clean up (also when startup fails).""" + try: + await self._start_sandbox_backend() + return await super().process_manager() + finally: + await self._stop_sandbox_backend() + # ========================================================================= # Core Atropos integration # ========================================================================= diff --git a/memory-bank/activeContext.md b/memory-bank/activeContext.md index 7a6d9b24ed9..ab1e2f16700 100644 --- a/memory-bank/activeContext.md +++ b/memory-bank/activeContext.md @@ -1,99 +1,55 @@ # Active Context ## Current Focus -Consolidating the two Atropos environment systems and fixing tool calling to use proper OpenAI-spec approach instead of ICL. +Adding sandbox pool support directly to `HermesAgentBaseEnv` so that `tool_pool_mode=modal/nomad` works alongside the default terminal-tool approach. 
-## PR Feedback from Lead Dev (Feb 10, 2026) +## Implementation Plan (Feb 10, 2026) -The PR was rejected because our approach has three fundamental issues: - -### Issue 1: ManagedServer doesn't pass `tools={}` to `apply_chat_template()` -- When using Phase 2 (VLLM/SGLang for RL training), `ManagedServer` needs to pass tools to `tokenizer.apply_chat_template(tools=...)` -- This makes the system prompt include tool definitions the way models were trained to expect -- **Fix**: Atropos PR #366 adds `tool_call_parser` support to ManagedServer (branch: `tool_call_support`) - -### Issue 2: ICL prompt vs proper tool calling -- Our code embeds tools as XML in the system prompt (`...`) -- Proper approach: pass `tools=` parameter in `chat_completion()` calls and let the tokenizer's chat template handle formatting -- All Hermes datasets train on the proper format, not ICL - -### Issue 3: Only Hermes `` parser, no multi-model support -- Our code only handles Hermes-style `` XML parsing -- Proper approach: parser registry supporting 11+ model families (hermes, qwen, deepseek, llama, mistral, etc.) - -## Architecture: What Exists Now (Two Parallel Systems) - -### `environments/` (Teknium's proper approach) ✅ CORRECT -``` -environments/ -├── agent_loop.py ← Uses tools= in chat_completion() (OpenAI spec) -├── hermes_base_env.py ← Phase 1 (OpenAI) + Phase 2 (ManagedServer + parser) -├── tool_context.py ← ToolContext for reward functions -├── tool_call_parsers/ ← 11 model parsers (hermes, qwen, deepseek, llama, etc.) -│ ├── __init__.py ← Registry with get_parser(), register_parser() -│ ├── hermes_parser.py -│ ├── qwen_parser.py -│ ├── deepseek_v3_parser.py -│ ├── llama_parser.py -│ ├── mistral_parser.py -│ └── ... 
(11 total) -├── terminal_test_env.py ← Working example: file creation tasks -├── hermes_swe_env.py ← SWE environment -└── patches.py ← Async-safe monkey patches +### Goal +The command should work: +```bash +python environments/swe_smith_oracle_env.py process \ + --env.tool_pool_mode modal \ + --env.modal_image python:3.11 ``` -**How it works correctly:** -1. `HermesAgentLoop.run()` passes `tools=self.tool_schemas` to `chat_completion()` -2. ManagedServer passes tools to `tokenizer.apply_chat_template(tools=...)` -3. Parser registry reconstructs `tool_calls` from raw model output -4. Tool execution uses hermes-agent's `handle_function_call()` from `model_tools.py` +### Changes to `environments/hermes_base_env.py`: -### `atropos/` (Our sandbox-optimized code) - PARTIALLY REDUNDANT -``` -atropos/ -├── agent/atropos_agent.py ← ICL-based agent (REDUNDANT with agent_loop.py) -├── envs/agent_env.py ← Environment with sandbox backends (PARTIALLY REDUNDANT) -├── envs/swe_smith_oracle_env.py ← SWE env using sandbox (KEEP - port to new base) -├── backends/ ← Sandbox backends (KEEP - valuable infrastructure) -│ ├── modal_backend.py ← Modal sandbox pool -│ ├── nomad_backend.py ← Nomad/Docker/Singularity -│ └── base.py ← ToolBackend protocol -├── slots/ ← Slot multiplexing (KEEP) -├── nomad/ ← Nomad client (KEEP) -├── tools/ ← Sandbox tool registry (PARTIALLY REDUNDANT) -└── sandbox_server.py ← HTTP server in containers (KEEP) +**1. Add config fields to `HermesAgentEnvConfig`:** +- `tool_pool_mode: str = "default"` — "default" (terminal tool), "nomad", or "modal" +- Nomad fields: `nomad_address`, `sandbox_job_id`, `sandbox_image`, `slots_per_container`, etc. +- Modal fields: `modal_app_name`, `modal_image`, `modal_gpu`, `modal_slots_per_sandbox`, etc. +- Shared: `allow_network`, `require_sandbox`, `purge_job_on_start`, `purge_job_on_shutdown` + +**2. 
Add methods to `HermesAgentBaseEnv`:** +- `_start_sandbox_backend()` / `_stop_sandbox_backend()` — lifecycle management +- `setup_trajectory_workspace(item, *, trajectory_id, exec_tool)` → optional hook (no-op default, returns `{}`) +- `verify_and_score_trajectory(item, result, *, trajectory_id, exec_tool, workspace_meta=None)` → optional hook (calls compute_reward by default) + +**3. Modify `collect_trajectory()`:** +- When `tool_pool_mode == "default"`: existing behavior (terminal tool handles isolation) +- When `tool_pool_mode in ("nomad", "modal")`: acquire slot → run agent with sandbox-backed tools → verify → release + +**4. Port SWE env to `environments/`:** +- Move/rewrite `swe_smith_oracle_env.py` to subclass `HermesAgentBaseEnv` +- Override `setup_trajectory_workspace()` (git clone/worktree) +- Override `verify_and_score_trajectory()` (pytest verification) + +### Key Imports +```python +from atropos.backends import create_tool_backend # Nomad/Modal backends +from atropos.backends.base import ToolBackend +from atropos.slots.executor import ExecutionResult ``` -## Plan: Consolidate into `environments/` +### What's Already Working +- ✅ atroposlib with tool_call_support (ManagedServer has tool_call_parser) +- ✅ GSM8k agent env with HermesAgentBaseEnv (Phase 1 tested, process mode) +- ✅ mini-swe-agent installed (terminal tool available) +- ✅ Modal backend (tested, working with sandboxes) +- ✅ Nomad/Singularity backends (tested, working) +- ✅ Tool call parsers (11+ models) -### What to KEEP from `atropos/`: -- `backends/` - Modal, Nomad, Singularity backends (valuable infrastructure for scale) -- `slots/` - Slot multiplexing -- `nomad/` - Nomad client -- `sandbox_server.py` - Container HTTP server -- `Dockerfile` - Sandbox container image - -### What to REMOVE/REPLACE: -- `atropos/agent/atropos_agent.py` → replaced by `environments/agent_loop.py` -- `atropos/envs/agent_env.py` → functionality merged into `environments/hermes_base_env.py` -- `atropos/tools/` → replaced by `model_tools.py` + `tools/` (hermes-agent's 
standard tools) - -### What to CREATE: -- `environments/gsm8k_agent_env.py` → GSM8k with tool calling, subclasses `HermesAgentBaseEnv` -- Update `environments/hermes_base_env.py` to optionally use sandbox backends (Nomad/Modal) for terminal isolation when needed for scale - -### Steps: -1. Install atropos `tool_call_support` branch (PR #366) -2. Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv` -3. Port `swe_smith_oracle_env.py` to use `HermesAgentBaseEnv` -4. Make sandbox backends accessible from `HermesAgentBaseEnv` (terminal_backend config) -5. Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py` -6. Clean up `atropos/tools/` (keep only sandbox-specific tools) -7. Update tinker-atropos gsm8k env to use proper base class -8. Test everything end-to-end - -## Previous Completed Work -- Modal backend integration (Feb 8) - KEEP backends, update integration point -- Main branch merge (Feb 9) - completed -- Singularity/Apptainer (Feb 6) - KEEP -- Memory Bank initialized (Feb 5) +### Current Blockers +- Tinker billing (402 error) — can't test Phase 2 training yet +- No vLLM on this machine — can't test ManagedServer locally