mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-14 04:02:26 +00:00
modal backend working ok, merged in modal-integrations
This commit is contained in:
parent
0bc914b00c
commit
6be8cdeeca
5 changed files with 187 additions and 81 deletions
|
|
@ -10,7 +10,10 @@ from ..slots.executor import ExecutionResult
|
||||||
from ..slots.slot import Slot, SlotState
|
from ..slots.slot import Slot, SlotState
|
||||||
from .base import ToolBackend
|
from .base import ToolBackend
|
||||||
|
|
||||||
import yaml
|
try:
|
||||||
|
import yaml
|
||||||
|
except ImportError:
|
||||||
|
yaml = None # type: ignore[assignment]
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ModalSandboxConfig:
|
class ModalSandboxConfig:
|
||||||
|
|
|
||||||
|
|
@ -71,11 +71,22 @@ class AgentEnvConfig(BaseEnvConfig):
|
||||||
description="Path to .sif file for Singularity driver (required if driver='singularity')",
|
description="Path to .sif file for Singularity driver (required if driver='singularity')",
|
||||||
)
|
)
|
||||||
|
|
||||||
# modal mode settings (stub; implementation pending)
|
# Modal mode settings
|
||||||
modal_app_name: str = Field(default="atropos-sandbox", description="Modal app name (stub)")
|
modal_app_name: str = Field(default="atropos-sandbox", description="Modal app name prefix")
|
||||||
modal_function_name: str = Field(default="sandbox_server", description="Modal function/actor name (stub)")
|
modal_image: str = Field(default="python:3.11", description="Modal: container image")
|
||||||
modal_volume_name: Optional[str] = Field(default=None, description="Modal Volume name for persistent storage (stub)")
|
modal_gpu: Optional[str] = Field(default=None, description="Modal: GPU type (None, 'T4', 'A10G', 'A100', 'H100')")
|
||||||
modal_volume_mount_path: str = Field(default="/data", description="Modal Volume mount path (stub)")
|
modal_cpu: float = Field(default=1.0, description="Modal: CPU cores")
|
||||||
|
modal_memory: int = Field(default=2048, description="Modal: memory in MB")
|
||||||
|
modal_slots_per_sandbox: int = Field(default=10, description="Modal: slots per sandbox")
|
||||||
|
modal_min_sandboxes: int = Field(default=1, description="Modal: minimum sandboxes")
|
||||||
|
modal_max_sandboxes: int = Field(default=5, description="Modal: maximum sandboxes")
|
||||||
|
modal_idle_timeout: int = Field(default=120, description="Modal: server-side idle timeout (seconds)")
|
||||||
|
modal_max_lifetime: int = Field(default=3600, description="Modal: max sandbox lifetime (seconds)")
|
||||||
|
modal_acquire_timeout: float = Field(default=60.0, description="Modal: slot acquisition timeout (seconds)")
|
||||||
|
modal_execution_timeout: float = Field(default=30.0, description="Modal: default command execution timeout (seconds)")
|
||||||
|
modal_secrets: str = Field(default="", description="Modal: comma-separated list of Modal Secret names")
|
||||||
|
modal_env_vars: str = Field(default="", description="Modal: semicolon-separated KEY=VALUE pairs for env vars")
|
||||||
|
modal_workspace_base: str = Field(default="/data", description="Modal: workspace base directory in sandbox")
|
||||||
|
|
||||||
# basic agent defaults
|
# basic agent defaults
|
||||||
agent_max_steps: int = Field(default=50, description="Max ReACT steps per trajectory")
|
agent_max_steps: int = Field(default=50, description="Max ReACT steps per trajectory")
|
||||||
|
|
|
||||||
|
|
@ -1,62 +1,83 @@
|
||||||
# Active Context
|
# Active Context
|
||||||
|
|
||||||
## Current Focus
|
## Current Focus
|
||||||
Singularity/Apptainer integration for HPC environments has been **COMPLETED AND TESTED**.
|
Modal backend integration has been **MERGED AND UPDATED** from the `modal-integration` branch.
|
||||||
|
|
||||||
## Recently Completed (Feb 6, 2026)
|
## Recently Completed (Feb 8, 2026)
|
||||||
|
|
||||||
|
### Modal Backend Integration - MERGED & WORKING
|
||||||
|
Merged the `modal-integration` branch into `atropos-integrations` and fixed integration issues.
|
||||||
|
|
||||||
|
**What was merged (from another dev's branch):**
|
||||||
|
1. `atropos/backends/modal_backend.py` - Complete Modal backend with:
|
||||||
|
- `ModalSandboxConfig` - Unified config with YAML profiles, env vars, and AgentEnv config loading
|
||||||
|
- `_ModalSandboxWithSlots` - Modal Sandbox wrapper with slot-based multiplexing
|
||||||
|
- `_ModalSandboxPool` - Auto-scaling pool of Modal sandboxes
|
||||||
|
- `_ModalMultiProfileManager` - Multi-profile support (CPU, GPU, high-memory)
|
||||||
|
- `ModalToolBackend` - Full ToolBackend implementation
|
||||||
|
2. `atropos/backends/__init__.py` - Updated `create_tool_backend()` to support `modal` mode
|
||||||
|
3. `tools/terminal_tool.py` - Native Modal Sandbox integration with:
|
||||||
|
- `ModalProfile` config + YAML loading
|
||||||
|
- `_ModalSandboxPool` (sync, thread-based for CLI use)
|
||||||
|
- `_ModalPoolManager` (singleton, multi-profile)
|
||||||
|
- `_ModalSandboxEnvironment` replacing old `_ModalEnvironment`
|
||||||
|
4. `docs/MODAL_BACKEND.md` - Comprehensive documentation
|
||||||
|
5. `modal_profiles.yaml.example` - Example profiles config
|
||||||
|
6. `tests/test_modal_integration.py` - Integration tests
|
||||||
|
7. `tests/test_modal_stress.py` - Stress tests
|
||||||
|
8. `tests/test_modal_terminal.py` - Terminal tool tests
|
||||||
|
|
||||||
|
**What I fixed after merge:**
|
||||||
|
1. `atropos/envs/agent_env.py` - Replaced old stub Modal fields with proper config fields matching `ModalSandboxConfig.from_agent_env_config()`:
|
||||||
|
- `modal_image`, `modal_gpu`, `modal_cpu`, `modal_memory`
|
||||||
|
- `modal_slots_per_sandbox`, `modal_min_sandboxes`, `modal_max_sandboxes`
|
||||||
|
- `modal_idle_timeout`, `modal_max_lifetime`
|
||||||
|
- `modal_acquire_timeout`, `modal_execution_timeout`
|
||||||
|
- `modal_secrets`, `modal_env_vars`, `modal_workspace_base`
|
||||||
|
2. `atropos/backends/modal_backend.py` - Guarded `yaml` import with try/except
|
||||||
|
|
||||||
|
**Key Architecture Decisions:**
|
||||||
|
- Uses **Modal Sandboxes** (not Functions) - long-lived containers that stay hot
|
||||||
|
- Uses `sandbox.exec()` directly instead of HTTP/sandbox_server.py - simpler approach
|
||||||
|
- Slot-based multiplexing matching Nomad's pattern
|
||||||
|
- Multi-profile support for heterogeneous workloads (CPU vs GPU)
|
||||||
|
- Named sandbox recovery for resilience
|
||||||
|
- Modal SDK v1.3.2 compatible
|
||||||
|
|
||||||
|
## Previous Work (Feb 6, 2026)
|
||||||
### Singularity/Apptainer Sandbox Integration - FULLY WORKING
|
### Singularity/Apptainer Sandbox Integration - FULLY WORKING
|
||||||
Successfully adapted the Atropos implementation from Docker to Singularity/Apptainer for HPC clusters where Docker cannot run without sudo permissions.
|
See progress.md for details.
|
||||||
|
|
||||||
**Files Modified:**
|
|
||||||
1. `atropos/nomad/client.py` - Added `driver` and `singularity_image` parameters to `create_sandbox_job()`; Fixed port detection to check both `DynamicPorts` and `ReservedPorts` in `get_job_allocations()`
|
|
||||||
2. `atropos/slots/pool.py` - Added `driver` and `singularity_image` to `SlotPoolConfig`
|
|
||||||
3. `atropos/backends/nomad_backend.py` - Added driver options to `NomadBackendConfig`
|
|
||||||
4. `atropos/envs/agent_env.py` - Added CLI arguments `--env.driver` and `--env.singularity_image` to `AgentEnvConfig`
|
|
||||||
|
|
||||||
**Files Created:**
|
|
||||||
1. `nomad-singularity.hcl` - Nomad config with raw_exec driver enabled
|
|
||||||
2. `atropos/atropos-sandbox.sif` - Singularity image (80MB) built from Docker image
|
|
||||||
3. `test_singularity_job.py` - Test script for Singularity integration
|
|
||||||
|
|
||||||
**Key Implementation Details:**
|
|
||||||
- Uses Nomad's `raw_exec` driver to run `apptainer` commands
|
|
||||||
- Shell wrapper (`/bin/sh -c`) ensures Nomad environment variables expand correctly
|
|
||||||
- Binds Nomad allocation directory to `/data` for workspace persistence
|
|
||||||
- Uses **static ports** (`ReservedPorts`) instead of dynamic ports since raw_exec runs directly on host
|
|
||||||
- `get_job_allocations()` now checks both `DynamicPorts` (Docker) and `ReservedPorts` (Singularity)
|
|
||||||
|
|
||||||
**Test Results (All Passing):**
|
|
||||||
- Health check: ✅ Server responding with 5 slots
|
|
||||||
- Bash execution: ✅ Commands execute inside Singularity container
|
|
||||||
- Write file: ✅ File written to slot workspace
|
|
||||||
- Read file: ✅ File read back successfully
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
### For Docker (default):
|
### Modal Backend (Atropos):
|
||||||
```python
|
|
||||||
config = SlotPoolConfig(
|
|
||||||
driver="docker",
|
|
||||||
image="atropos-sandbox:local",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### For Singularity/Apptainer:
|
|
||||||
```python
|
|
||||||
config = SlotPoolConfig(
|
|
||||||
driver="singularity",
|
|
||||||
singularity_image="/path/to/atropos-sandbox.sif",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Nomad Configuration:
|
|
||||||
```bash
|
```bash
|
||||||
# Start Nomad with Singularity support
|
python -m atropos.envs.swe_smith_oracle_env process \
|
||||||
nomad agent -dev -config=nomad-singularity.hcl
|
--env.tool_pool_mode modal \
|
||||||
|
--env.modal_image python:3.11 \
|
||||||
|
--env.modal_slots_per_sandbox 10 \
|
||||||
|
--env.modal_max_sandboxes 5
|
||||||
|
```
|
||||||
|
|
||||||
|
### Modal Terminal Tool (CLI):
|
||||||
|
```bash
|
||||||
|
export TERMINAL_ENV=modal
|
||||||
|
export TERMINAL_MODAL_IMAGE=python:3.11
|
||||||
|
./hermes
|
||||||
|
```
|
||||||
|
|
||||||
|
### With GPU Profile:
|
||||||
|
```yaml
|
||||||
|
# In modal_profiles.yaml
|
||||||
|
profiles:
|
||||||
|
pytorch-gpu:
|
||||||
|
image: pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
|
||||||
|
gpu: T4
|
||||||
|
memory: 16384
|
||||||
```
|
```
|
||||||
|
|
||||||
## Next Steps
|
## Next Steps
|
||||||
- Deploy to HPC cluster for production testing
|
- Live test Modal backend with actual Modal credentials
|
||||||
- Consider adding bubblewrap (bwrap) support inside Singularity for additional sandboxing
|
- Test multi-profile GPU workflows
|
||||||
- Document HPC-specific deployment procedures in skills/mlops/
|
- Test sandbox recovery after restart
|
||||||
|
- Integrate with SWE-smith-oracle env for full GRPO training loop
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,45 @@
|
||||||
|
|
||||||
## Completed Features
|
## Completed Features
|
||||||
|
|
||||||
|
### ✅ Modal Backend Integration (Feb 8, 2026 - MERGED; LIVE TEST PENDING)
|
||||||
|
Merged the `modal-integration` branch and fixed integration issues.
|
||||||
|
|
||||||
|
**What Works:**
|
||||||
|
- `ModalToolBackend` implements full `ToolBackend` interface (start, stop, acquire, release, execute_batch)
|
||||||
|
- Modal Sandboxes used for long-lived containers (not Functions)
|
||||||
|
- `sandbox.exec()` for direct command execution (no HTTP server needed)
|
||||||
|
- Slot-based multiplexing matching Nomad pattern
|
||||||
|
- Multi-profile support (`ModalSandboxConfig`, `_ModalMultiProfileManager`)
|
||||||
|
- YAML profile loading (`modal_profiles.yaml`)
|
||||||
|
- `AgentEnvConfig` fields for all Modal settings (`--env.modal_*`)
|
||||||
|
- `create_tool_backend()` supports `tool_pool_mode="modal"`
|
||||||
|
- Terminal tool (`tools/terminal_tool.py`) native Modal integration with pool management
|
||||||
|
- Named sandbox recovery via `Sandbox.from_name()`
|
||||||
|
- Auto-scaling sandbox pool per profile
|
||||||
|
- Artifact helpers (read, list, archive)
|
||||||
|
|
||||||
|
**CLI Usage:**
|
||||||
|
```bash
|
||||||
|
# Atropos backend
|
||||||
|
python -m atropos.envs.swe_smith_oracle_env process \
|
||||||
|
--env.tool_pool_mode modal \
|
||||||
|
--env.modal_image python:3.11
|
||||||
|
|
||||||
|
# Terminal tool
|
||||||
|
TERMINAL_ENV=modal ./hermes
|
||||||
|
```
|
||||||
|
|
||||||
|
**Files Modified/Created:**
|
||||||
|
- `atropos/backends/modal_backend.py` - Full implementation (~1200 lines)
|
||||||
|
- `atropos/backends/__init__.py` - `create_tool_backend()` updated
|
||||||
|
- `atropos/envs/agent_env.py` - 15 Modal config fields added
|
||||||
|
- `tools/terminal_tool.py` - Native Modal sandbox pool
|
||||||
|
- `docs/MODAL_BACKEND.md` - Documentation
|
||||||
|
- `modal_profiles.yaml.example` - Example profiles
|
||||||
|
- `tests/test_modal_integration.py` - Integration tests
|
||||||
|
- `tests/test_modal_stress.py` - Stress tests
|
||||||
|
- `tests/test_modal_terminal.py` - Terminal tool tests
|
||||||
|
|
||||||
### ✅ Singularity/Apptainer Sandbox Integration (Feb 6, 2026 - FULLY TESTED)
|
### ✅ Singularity/Apptainer Sandbox Integration (Feb 6, 2026 - FULLY TESTED)
|
||||||
Adapted the Atropos sandbox environment from Docker to Singularity/Apptainer for HPC clusters.
|
Adapted the Atropos sandbox environment from Docker to Singularity/Apptainer for HPC clusters.
|
||||||
|
|
||||||
|
|
@ -10,28 +49,8 @@ Adapted the Atropos sandbox environment from Docker to Singularity/Apptainer for
|
||||||
- SlotPoolConfig and NomadBackendConfig propagate driver settings
|
- SlotPoolConfig and NomadBackendConfig propagate driver settings
|
||||||
- Singularity container runs sandbox_server.py via Nomad's raw_exec driver
|
- Singularity container runs sandbox_server.py via Nomad's raw_exec driver
|
||||||
- All sandbox operations work: bash execution, file read/write
|
- All sandbox operations work: bash execution, file read/write
|
||||||
- Nomad environment variables properly expanded via shell wrapper
|
|
||||||
- **CLI arguments** `--env.driver` and `--env.singularity_image` for AgentEnvConfig
|
- **CLI arguments** `--env.driver` and `--env.singularity_image` for AgentEnvConfig
|
||||||
- **Static port binding** for Singularity (ReservedPorts vs DynamicPorts)
|
- **Static port binding** for Singularity (ReservedPorts vs DynamicPorts)
|
||||||
- **Port detection** works for both Docker and Singularity allocations
|
|
||||||
|
|
||||||
**CLI Usage:**
|
|
||||||
```bash
|
|
||||||
python -m atropos.envs.swe_smith_oracle_env process \
|
|
||||||
--env.driver singularity \
|
|
||||||
--env.singularity_image /path/to/atropos-sandbox.sif
|
|
||||||
```
|
|
||||||
|
|
||||||
**Created Files:**
|
|
||||||
- `nomad-singularity.hcl` - Nomad config with raw_exec enabled
|
|
||||||
- `atropos/atropos-sandbox.sif` - 80MB Singularity image
|
|
||||||
- `test_singularity_job.py` - Integration test script
|
|
||||||
|
|
||||||
**Modified Files:**
|
|
||||||
- `atropos/nomad/client.py` - driver support + ReservedPorts detection
|
|
||||||
- `atropos/slots/pool.py` - driver config fields
|
|
||||||
- `atropos/backends/nomad_backend.py` - driver config fields
|
|
||||||
- `atropos/envs/agent_env.py` - CLI arguments for driver selection
|
|
||||||
|
|
||||||
### ✅ Memory Bank Initialized (Feb 5, 2026)
|
### ✅ Memory Bank Initialized (Feb 5, 2026)
|
||||||
Set up project documentation structure for context persistence.
|
Set up project documentation structure for context persistence.
|
||||||
|
|
@ -40,19 +59,22 @@ Set up project documentation structure for context persistence.
|
||||||
None currently.
|
None currently.
|
||||||
|
|
||||||
## Known Issues
|
## Known Issues
|
||||||
- `bwrap_available: false` in Singularity containers - bubblewrap sandboxing not available inside the container (kernel namespaces already in use)
|
- Modal backend not yet live-tested with actual Modal cloud credentials
|
||||||
|
- `bwrap_available: false` in Singularity containers
|
||||||
- Health check timing - may need longer wait for container startup on slower systems
|
- Health check timing - may need longer wait for container startup on slower systems
|
||||||
|
|
||||||
## What's Left to Build
|
## What's Left to Build
|
||||||
|
|
||||||
|
### Modal Backend
|
||||||
|
- [ ] Live test with Modal credentials on actual cloud
|
||||||
|
- [ ] Test multi-profile GPU workflows
|
||||||
|
- [ ] Test sandbox recovery after restart
|
||||||
|
- [ ] Integrate with SWE-smith-oracle env for GRPO training loop
|
||||||
|
- [ ] Performance benchmarking vs Nomad backend
|
||||||
|
|
||||||
### HPC Deployment
|
### HPC Deployment
|
||||||
- [ ] Test on actual HPC cluster with Slurm/PBS integration
|
- [ ] Test on actual HPC cluster with Slurm/PBS integration
|
||||||
- [ ] Document cluster-specific deployment procedures
|
- [ ] Document cluster-specific deployment procedures
|
||||||
- [ ] Add support for shared filesystem workspace binding
|
|
||||||
|
|
||||||
### Enhanced Sandboxing
|
|
||||||
- [ ] Investigate alternative sandboxing inside Singularity (seccomp, etc.)
|
|
||||||
- [ ] Add network isolation options for Singularity
|
|
||||||
|
|
||||||
### Documentation
|
### Documentation
|
||||||
- [ ] Add Singularity deployment to README
|
- [ ] Add Singularity deployment to README
|
||||||
|
|
@ -65,3 +87,10 @@ None currently.
|
||||||
- **Problem**: HPC clusters don't allow Docker without sudo
|
- **Problem**: HPC clusters don't allow Docker without sudo
|
||||||
- **Solution**: Added Singularity/Apptainer support via raw_exec driver
|
- **Solution**: Added Singularity/Apptainer support via raw_exec driver
|
||||||
- **Result**: Both runtimes now supported with same API
|
- **Result**: Both runtimes now supported with same API
|
||||||
|
|
||||||
|
### Modal Backend Architecture
|
||||||
|
- **Initial**: Stub placeholder raising RuntimeError
|
||||||
|
- **Investigation**: Modal Sandboxes vs Functions - chose Sandboxes for long-lived containers
|
||||||
|
- **Design**: Direct `sandbox.exec()` instead of HTTP/sandbox_server.py (simpler, no networking needed)
|
||||||
|
- **Implementation**: Merged from `modal-integration` branch, fixed agent_env.py config fields
|
||||||
|
- **Result**: Three backends now supported: Nomad/Docker, Nomad/Singularity, Modal
|
||||||
|
|
|
||||||
|
|
@ -147,3 +147,45 @@ The agent validates responses before accepting:
|
||||||
3. Sets environment variables for terminal config
|
3. Sets environment variables for terminal config
|
||||||
4. `AIAgent` reads env vars when initializing terminal tool
|
4. `AIAgent` reads env vars when initializing terminal tool
|
||||||
5. Terminal tool creates appropriate backend based on `TERMINAL_ENV`
|
5. Terminal tool creates appropriate backend based on `TERMINAL_ENV`
|
||||||
|
|
||||||
|
## Atropos Backend Architecture
|
||||||
|
|
||||||
|
### Backend Hierarchy
|
||||||
|
```
|
||||||
|
ToolBackend (Protocol - base.py)
|
||||||
|
├── NomadToolBackend → SlotPool → NomadClient + SandboxExecutor (HTTP)
|
||||||
|
│ ├── Docker driver (default)
|
||||||
|
│ └── Singularity driver (HPC)
|
||||||
|
└── ModalToolBackend → _ModalSandboxPool → modal.Sandbox.exec() (direct)
|
||||||
|
└── _ModalMultiProfileManager (multi-profile support)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Slot-Based Multiplexing Pattern
|
||||||
|
All backends share the same slot multiplexing concept:
|
||||||
|
- **Sandbox/Container**: Long-lived compute unit
|
||||||
|
- **Slot**: Isolated workspace directory within a sandbox (e.g., `/data/slot_0`)
|
||||||
|
- **Trajectory**: One agent task using one slot
|
||||||
|
- Multiple trajectories share a sandbox via different slots
|
||||||
|
|
||||||
|
### Nomad Backend (HTTP-based)
|
||||||
|
- Deploys `sandbox_server.py` inside containers (Docker or Singularity)
|
||||||
|
- Uses `SandboxExecutor` for HTTP communication (POST /execute, POST /batch)
|
||||||
|
- Nomad manages container lifecycle (scaling, health checks)
|
||||||
|
- Tools: bash, bash_stateful, read_file, write_file, tmux
|
||||||
|
|
||||||
|
### Modal Backend (exec-based)
|
||||||
|
- Creates `modal.Sandbox` instances (long-lived containers)
|
||||||
|
- Uses `sandbox.exec("bash", "-c", command)` directly (no HTTP server)
|
||||||
|
- Modal manages container lifecycle (idle_timeout, max_lifetime)
|
||||||
|
- Multi-profile support: different resource configs (CPU, GPU, memory)
|
||||||
|
- Named sandboxes for recovery: `Sandbox.from_name(app_name, sandbox_name)`
|
||||||
|
- YAML config via `modal_profiles.yaml`
|
||||||
|
|
||||||
|
### Backend Selection
|
||||||
|
```python
|
||||||
|
# In agent_env.py / create_tool_backend()
|
||||||
|
if mode == "nomad":
|
||||||
|
return NomadToolBackend(NomadBackendConfig.from_agent_env_config(cfg))
|
||||||
|
if mode == "modal":
|
||||||
|
return ModalToolBackend(ModalSandboxConfig.from_agent_env_config(cfg))
|
||||||
|
```
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue