From 975c8493087350573933891d16d889d43385a818 Mon Sep 17 00:00:00 2001 From: Shannon Sands Date: Tue, 10 Feb 2026 01:45:07 +0000 Subject: [PATCH] Add GSM8k agent env using proper HermesAgentBaseEnv (not ICL) - environments/gsm8k_agent_env.py: Math reasoning with Python REPL tool - Subclasses HermesAgentBaseEnv (proper tools= parameter, not ICL) - Uses ATROPOS_SERVER_* env vars from .env - Hermes tool call parser, configurable per model - Math verification via math_verify with string fallback - Tested: process mode works, both trajectories scored 1.0 - Updated memory bank with consolidation plan: - environments/ is the canonical env system (proper tool calling) - atropos/backends/ kept as sandbox infrastructure - atropos/agent/ and atropos/envs/agent_env.py marked for removal --- environments/gsm8k_agent_env.py | 350 ++++++++++++++++++++++++++++++++ memory-bank/activeContext.md | 132 +++++++----- memory-bank/progress.md | 145 ++++++------- memory-bank/systemPatterns.md | 83 +++++--- 4 files changed, 555 insertions(+), 155 deletions(-) create mode 100644 environments/gsm8k_agent_env.py diff --git a/environments/gsm8k_agent_env.py b/environments/gsm8k_agent_env.py new file mode 100644 index 00000000000..f6bbab89642 --- /dev/null +++ b/environments/gsm8k_agent_env.py @@ -0,0 +1,350 @@ +""" +GSM8kAgentEnv -- Math Reasoning with Tool Use (Python REPL) + +An agentic RL environment where models solve GSM8k math problems using +a Python interpreter tool. Uses proper OpenAI-spec tool calling via +HermesAgentBaseEnv (not ICL). + +The model: +1. Receives a math problem +2. Can call the `terminal` tool to run Python code (`python3 -c "..."`) +3. Provides a final answer in \\boxed{} format +4. 
Gets reward: 1.0 if correct, 0.0 if wrong + +Usage: + # Phase 1 (OpenRouter, no training): + python environments/gsm8k_agent_env.py process \\ + --env.data_path_to_save_groups gsm8k_agent_output.jsonl + + # Phase 2 (VLLM + Tinker training): + run-api + python launch_training.py --config configs/gsm8k_agent.yaml + python environments/gsm8k_agent_env.py serve +""" + +import logging +import os +import sys +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +# Ensure repo root is on sys.path +_repo_root = Path(__file__).resolve().parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from atroposlib.envs.base import ScoredDataGroup +from atroposlib.envs.server_handling.server_manager import APIServerConfig +from atroposlib.type_definitions import Item + +from environments.agent_loop import AgentResult +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig +from environments.tool_context import ToolContext + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Math verification helpers +# ============================================================================= + +def _verify_math_answer(model_response: str, gold_answer: str) -> bool: + """ + Verify if the model's response contains the correct answer. + Uses math_verify for robust LaTeX comparison, falls back to string matching. 
+ """ + try: + from latex2sympy2_extended import NormalizationConfig + from math_verify import LatexExtractionConfig, parse, verify + + gold_parsed = parse( + f"\\boxed{{{gold_answer}}}", + extraction_mode="first_match", + extraction_config=[LatexExtractionConfig()], + ) + + # Strip <think> blocks if present + answer_text = model_response + if "</think>" in answer_text: + answer_text = answer_text.split("</think>")[-1] + + answer_parsed = parse( + answer_text, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + boxed="all", + units=True, + ), + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode="first_match", + ) + + return bool(verify(answer_parsed, gold_parsed)) + + except ImportError: + # Fallback: simple string matching for \\boxed{answer} + import re + pattern = r'\\boxed\{([^}]+)\}' + matches = re.findall(pattern, model_response) + if matches: + model_answer = matches[-1].strip().replace(",", "") + gold_clean = gold_answer.strip().replace(",", "") + return model_answer == gold_clean + return False + + +# ============================================================================= +# Environment Config +# ============================================================================= + +class GSM8kAgentEnvConfig(HermesAgentEnvConfig): + """Config with defaults for GSM8k agent environment.""" + pass + + +# ============================================================================= +# Environment +# ============================================================================= + +class GSM8kAgentEnv(HermesAgentBaseEnv): + """ + GSM8k math environment with Python REPL tool calling. + + Models solve grade-school math problems by reasoning step by step + and using Python (via the terminal tool) for calculations. 
+ + Exercises the full agentic RL training loop: + - Model receives math problem + - Makes tool calls to compute (python3 -c "...") + - Provides final answer in \\boxed{} + - Reward: binary (1.0 correct, 0.0 wrong) + """ + + name = "gsm8k-agent" + env_config_cls = GSM8kAgentEnvConfig + + @classmethod + def config_init(cls) -> Tuple[GSM8kAgentEnvConfig, List[APIServerConfig]]: + """ + Default config using terminal tool. + + Reads from environment variables (set in .env): + ATROPOS_SERVER_BASE_URL - Inference server URL + ATROPOS_SERVER_MODEL - Model name on the server + ATROPOS_TOKENIZER_NAME - HuggingFace tokenizer name + ATROPOS_SERVER_API_KEY - API key for the server + """ + # Resolve inference server settings from env + base_url = ( + os.getenv("ATROPOS_SERVER_BASE_URL") + or os.getenv("OPENAI_BASE_URL") + or os.getenv("LLM_BASE_URL") + or "https://openrouter.ai/api/v1" + ) + if not base_url.rstrip("/").endswith("/v1"): + base_url = base_url.rstrip("/") + "/v1" + + model = ( + os.getenv("ATROPOS_SERVER_MODEL") + or os.getenv("LLM_MODEL") + or "Hermes-4.3-36B" + ) + + api_key = ( + os.getenv("ATROPOS_SERVER_API_KEY") + or os.getenv("NOUS_API_KEY") + or os.getenv("OPENROUTER_API_KEY") + or os.getenv("OPENAI_API_KEY") + or "" + ) + + tokenizer = ( + os.getenv("ATROPOS_TOKENIZER_NAME") + or os.getenv("ATROPOS_TOKENIZER") + or "NousResearch/Hermes-4.3-36B" + ) + + env_config = GSM8kAgentEnvConfig( + # Terminal tool only -- model uses `python3 -c "..."` for math + enabled_toolsets=["terminal"], + disabled_toolsets=None, + distribution=None, + # Agent settings + max_agent_turns=5, # Math problems don't need many turns + max_token_length=2048, # Room for reasoning + code + agent_temperature=1.0, + system_prompt=( + "You are a helpful math assistant. 
You have access to a terminal " + "where you can run Python code to help solve problems.\n\n" + "When you need to calculate something, use the terminal tool with " + "a command like: python3 -c \"print(2 + 2)\"\n\n" + "When you have the final answer, write it inside \\boxed{} like: \\boxed{42}\n\n" + "Work step by step. Use Python to verify your reasoning." + ), + # Terminal backend (local for testing, modal for production) + terminal_backend=os.getenv("TERMINAL_ENV", "local"), + # Parser -- hermes format for Hermes models + tool_call_parser="hermes", + # Atropos settings + group_size=4, + tokenizer_name=tokenizer, + steps_per_eval=5, + total_steps=10, + use_wandb=bool(os.getenv("WANDB_API_KEY")), + wandb_name="gsm8k-agent", + ensure_scores_are_not_same=False, + # No external dataset (we load GSM8k ourselves) + dataset_name=None, + ) + + server_configs = [ + APIServerConfig( + base_url=base_url, + model_name=model, + server_type="openai", + api_key=api_key, + health_check=False, + ) + ] + + return env_config, server_configs + + async def setup(self): + """Load GSM8k dataset.""" + from datasets import load_dataset + + self.train = load_dataset("gsm8k", "main", split="train").shuffle(seed=42) + test_data = load_dataset("gsm8k", "main", split="test").shuffle(seed=42) + self.test = [ + { + "question": item["question"], + "gold_answer": item["answer"].split("#")[-1].strip().replace(",", ""), + } + for item in test_data + ] + self.iter = 0 + self.reward_buffer: List[float] = [] + self.tool_use_buffer: List[int] = [] + print(f"[GSM8kAgentEnv] Loaded {len(self.train)} train, {len(self.test)} test examples") + + async def get_next_item(self) -> Dict[str, str]: + """Cycle through training problems.""" + item = self.train[self.iter % len(self.train)] + self.iter += 1 + return { + "question": item["question"], + "gold_answer": item["answer"].split("#")[-1].strip().replace(",", ""), + } + + def format_prompt(self, item: Dict[str, str]) -> str: + """Format the math problem as a 
user message.""" + return item["question"] + + async def compute_reward( + self, item: Dict[str, str], result: AgentResult, ctx: ToolContext + ) -> float: + """ + Score: verify the model's \\boxed{} answer against the gold answer. + + The agent has full access to terminal via ctx, but for GSM8k we just + check the final answer from the conversation. + """ + # Get the last assistant message content + final_text = "" + for msg in reversed(result.messages): + if msg.get("role") == "assistant" and msg.get("content"): + final_text = msg["content"] + break + + correct = _verify_math_answer(final_text, item["gold_answer"]) + reward = 1.0 if correct else 0.0 + + self.reward_buffer.append(reward) + # Count tool calls in this trajectory + tool_call_count = sum( + len(msg.get("tool_calls", [])) + for msg in result.messages + if msg.get("role") == "assistant" + ) + self.tool_use_buffer.append(tool_call_count) + + return reward + + async def evaluate(self, *args, **kwargs): + """Evaluate on a subset of the test set (greedy, no tools for speed).""" + start_time = time.time() + correct = 0 + total = 0 + samples = [] + + eval_subset = self.test[:30] # Small subset for quick eval + + for item in eval_subset: + try: + completion = await self.server.chat_completion( + messages=[ + {"role": "system", "content": self.config.system_prompt or ""}, + {"role": "user", "content": item["question"]}, + ], + n=1, + max_tokens=self.config.max_token_length, + temperature=0.0, + split="eval", + ) + + response = completion.choices[0].message.content or "" + is_correct = _verify_math_answer(response, item["gold_answer"]) + + if is_correct: + correct += 1 + total += 1 + + samples.append({ + "question": item["question"], + "gold_answer": item["gold_answer"], + "response": response[:500], + "correct": is_correct, + }) + + except Exception as e: + logger.error("Eval failed: %s", e) + total += 1 + + percent_correct = correct / total if total > 0 else 0 + end_time = time.time() + + await 
self.evaluate_log( + metrics={"eval/percent_correct": percent_correct, "eval/total": total}, + samples=samples, + start_time=start_time, + end_time=end_time, + ) + + async def wandb_log(self, wandb_metrics: Optional[Dict] = None): + """Log training metrics.""" + if wandb_metrics is None: + wandb_metrics = {} + + if self.reward_buffer: + wandb_metrics["train/percent_correct"] = sum(self.reward_buffer) / len(self.reward_buffer) + wandb_metrics["train/total_rollouts"] = len(self.reward_buffer) + self.reward_buffer = [] + + if self.tool_use_buffer: + wandb_metrics["train/avg_tool_calls"] = sum(self.tool_use_buffer) / len(self.tool_use_buffer) + wandb_metrics["train/tool_use_rate"] = sum(1 for t in self.tool_use_buffer if t > 0) / len(self.tool_use_buffer) + self.tool_use_buffer = [] + + await super().wandb_log(wandb_metrics) + + +if __name__ == "__main__": + GSM8kAgentEnv.cli() diff --git a/memory-bank/activeContext.md b/memory-bank/activeContext.md index b7858c2b131..7a6d9b24ed9 100644 --- a/memory-bank/activeContext.md +++ b/memory-bank/activeContext.md @@ -1,61 +1,99 @@ # Active Context ## Current Focus -Tinker RL training integration - pipeline fully wired up, waiting on Tinker billing to test. +Consolidating the two Atropos environment systems and fixing tool calling to use proper OpenAI-spec approach instead of ICL. -## Recently Completed (Feb 9, 2026) +## PR Feedback from Lead Dev (Feb 10, 2026) -### Tinker RL Training Integration -Created a complete agent training pipeline using Tinker (Thinking Machines) + Atropos: +The PR was rejected because our approach has three fundamental issues: -**New Files Created:** -1. `tinker-atropos/tinker_atropos/environments/gsm8k_agent.py` - Agent GSM8k environment with: - - Python REPL tool calling (Hermes-style `` format) - - Multi-step agent loop within `collect_trajectories()` - - Math answer verification via `math_verify` - - Subprocess-based Python execution - - WandB metrics (percent_correct, tool_use_rate) -2. 
`tinker-atropos/configs/gsm8k_agent.yaml` - Config for Qwen3-4B-Instruct training +### Issue 1: ManagedServer doesn't pass `tools={}` to `apply_chat_template()` +- When using Phase 2 (VLLM/SGLang for RL training), `ManagedServer` needs to pass tools to `tokenizer.apply_chat_template(tools=...)` +- This makes the system prompt include tool definitions the way models were trained to expect +- **Fix**: Atropos PR #366 adds `tool_call_parser` support to ManagedServer (branch: `tool_call_support`) -**Dependencies Updated:** -- `pyproject.toml` `[atropos]` extra now includes: tinker SDK, torch, wandb, math-verify -- Installed: tinker 0.12.0, tinker-atropos 0.1.0, torch (CPU) +### Issue 2: ICL prompt vs proper tool calling +- Our code embeds tools as XML in the system prompt (`<tools>...</tools>`) +- Proper approach: pass `tools=` parameter in `chat_completion()` calls and let the tokenizer's chat template handle formatting +- All Hermes datasets train on the proper format, not ICL -**README Updated:** -- Added comprehensive "RL Training with Tinker" section with architecture diagram, quick start, config docs -- Added TINKER_API_KEY and WANDB_API_KEY to optional keys table +### Issue 3: Only Hermes `<tool_call>` parser, no multi-model support +- Our code only handles Hermes-style `<tool_call>` XML parsing +- Proper approach: parser registry supporting 11+ model families (hermes, qwen, deepseek, llama, mistral, etc.) -**Verified Working:** -- Tinker SDK connection ✅ -- All imports (tinker, tinker_atropos, trainer, environment) ✅ -- Python REPL execution + tool call parsing ✅ -- Math verification ✅ -- Atropos run-api (port 8000) ✅ -- Tinker trainer starts, loads config, creates inference server (port 8001) ✅ - -**Blocked:** Tinker billing (402 error) - user's payment didn't process (possibly regional card issue) - -### Main Branch Merge (Feb 9, 2026) -Merged `origin/main` into `atropos-integrations` - 22,560 lines, 79 files, 5 conflicts resolved. 
- -### Modal Backend (Feb 8, 2026) -Merged modal-integration branch, working with Modal Sandboxes. - -### Singularity/Apptainer (Feb 6, 2026) -Completed and tested. - -## Architecture: Training Pipeline +## Architecture: What Exists Now (Two Parallel Systems) +### `environments/` (Teknium's proper approach) ✅ CORRECT ``` -Terminal 1: run-api (port 8000) - Atropos Rollout API -Terminal 2: launch_training.py (port 8001) - Tinker Trainer + FastAPI inference -Terminal 3: gsm8k_agent.py serve - Environment (generates trajectories) +environments/ +├── agent_loop.py ← Uses tools= in chat_completion() (OpenAI spec) +├── hermes_base_env.py ← Phase 1 (OpenAI) + Phase 2 (ManagedServer + parser) +├── tool_context.py ← ToolContext for reward functions +├── tool_call_parsers/ ← 11 model parsers (hermes, qwen, deepseek, llama, etc.) +│ ├── __init__.py ← Registry with get_parser(), register_parser() +│ ├── hermes_parser.py +│ ├── qwen_parser.py +│ ├── deepseek_v3_parser.py +│ ├── llama_parser.py +│ ├── mistral_parser.py +│ └── ... (11 total) +├── terminal_test_env.py ← Working example: file creation tasks +├── hermes_swe_env.py ← SWE environment +└── patches.py ← Async-safe monkey patches ``` -The agent env gets math problems → model calls Python REPL tool → scores answer → sends to Atropos → Tinker does LoRA training → updates sampling weights → repeat. +**How it works correctly:** +1. `HermesAgentLoop.run()` passes `tools=self.tool_schemas` to `chat_completion()` +2. ManagedServer passes tools to `tokenizer.apply_chat_template(tools=...)` +3. Parser registry reconstructs `tool_calls` from raw model output +4. 
Tool execution uses hermes-agent's `handle_function_call()` from `model_tools.py` -## Next Steps -- [ ] Resolve Tinker billing to test full training loop -- [ ] Run GSM8k agent training for ~20 steps (proof of concept) -- [ ] Monitor WandB for reward improvement -- [ ] Graduate to more complex agent envs (SWE tasks with Modal backend) +### `atropos/` (Our sandbox-optimized code) - PARTIALLY REDUNDANT +``` +atropos/ +├── agent/atropos_agent.py ← ICL-based agent (REDUNDANT with agent_loop.py) +├── envs/agent_env.py ← Environment with sandbox backends (PARTIALLY REDUNDANT) +├── envs/swe_smith_oracle_env.py ← SWE env using sandbox (KEEP - port to new base) +├── backends/ ← Sandbox backends (KEEP - valuable infrastructure) +│ ├── modal_backend.py ← Modal sandbox pool +│ ├── nomad_backend.py ← Nomad/Docker/Singularity +│ └── base.py ← ToolBackend protocol +├── slots/ ← Slot multiplexing (KEEP) +├── nomad/ ← Nomad client (KEEP) +├── tools/ ← Sandbox tool registry (PARTIALLY REDUNDANT) +└── sandbox_server.py ← HTTP server in containers (KEEP) +``` + +## Plan: Consolidate into `environments/` + +### What to KEEP from `atropos/`: +- `backends/` - Modal, Nomad, Singularity backends (valuable infrastructure for scale) +- `slots/` - Slot multiplexing +- `nomad/` - Nomad client +- `sandbox_server.py` - Container HTTP server +- `Dockerfile` - Sandbox container image + +### What to REMOVE/REPLACE: +- `atropos/agent/atropos_agent.py` → replaced by `environments/agent_loop.py` +- `atropos/envs/agent_env.py` → functionality merged into `environments/hermes_base_env.py` +- `atropos/tools/` → replaced by `model_tools.py` + `tools/` (hermes-agent's standard tools) + +### What to CREATE: +- `environments/gsm8k_agent_env.py` → GSM8k with tool calling, subclasses `HermesAgentBaseEnv` +- Update `environments/hermes_base_env.py` to optionally use sandbox backends (Nomad/Modal) for terminal isolation when needed for scale + +### Steps: +1. 
Install atropos `tool_call_support` branch (PR #366) +2. Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv` +3. Port `swe_smith_oracle_env.py` to use `HermesAgentBaseEnv` +4. Make sandbox backends accessible from `HermesAgentBaseEnv` (terminal_backend config) +5. Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py` +6. Clean up `atropos/tools/` (keep only sandbox-specific tools) +7. Update tinker-atropos gsm8k env to use proper base class +8. Test everything end-to-end + +## Previous Completed Work +- Modal backend integration (Feb 8) - KEEP backends, update integration point +- Main branch merge (Feb 9) - completed +- Singularity/Apptainer (Feb 6) - KEEP +- Memory Bank initialized (Feb 5) diff --git a/memory-bank/progress.md b/memory-bank/progress.md index e8d9f6c33ba..9b00d751cee 100644 --- a/memory-bank/progress.md +++ b/memory-bank/progress.md @@ -1,96 +1,85 @@ # Progress +## Current Sprint: Consolidate Environment Systems (Feb 10, 2026) + +PR feedback from lead dev identified three fundamental issues with our approach: +1. Tool calling uses ICL (in-context learning) instead of proper `tools=` parameter +2. ManagedServer doesn't pass tools to `apply_chat_template()` +3. Only Hermes parser, no multi-model support + +Teknium already built the correct approach in `environments/` directory. Our task is to consolidate. + +### Status +- [ ] Install atropos `tool_call_support` branch (PR #366) +- [ ] Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv` +- [ ] Port SWE env to `HermesAgentBaseEnv` +- [ ] Make sandbox backends accessible from `HermesAgentBaseEnv` +- [ ] Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py` +- [ ] Clean up redundant `atropos/tools/` +- [ ] Test end-to-end with Tinker + ## Completed Features -### ✅ Modal Backend Integration (Feb 8, 2026 - MERGED & TESTED) -Merged the `modal-integration` branch and fixed integration issues. 
+### ✅ Modal Backend Integration (Feb 8, 2026) +- `ModalToolBackend` with slot-based multiplexing +- Multi-profile support (CPU, GPU, high-memory) +- Auto-scaling sandbox pool via Modal Sandboxes +- **Status: KEEP backends, but change integration point from atropos/envs/ to environments/** -**What Works:** -- `ModalToolBackend` implements full `ToolBackend` interface (start, stop, acquire, release, execute_batch) -- Modal Sandboxes used for long-lived containers (not Functions) -- `sandbox.exec()` for direct command execution (no HTTP server needed) -- Slot-based multiplexing matching Nomad pattern -- Multi-profile support (`ModalSandboxConfig`, `_ModalMultiProfileManager`) -- YAML profile loading (`modal_profiles.yaml`) -- `AgentEnvConfig` fields for all Modal settings (`--env.modal_*`) -- `create_tool_backend()` supports `tool_pool_mode="modal"` -- Terminal tool (`tools/terminal_tool.py`) native Modal integration with pool management -- Named sandbox recovery via `Sandbox.from_name()` -- Auto-scaling sandbox pool per profile -- Artifact helpers (read, list, archive) +### ✅ Main Branch Merge (Feb 9, 2026) +- Merged 22,560 lines, 79 files, 5 conflicts resolved +- New: hermes_cli/, file_operations, RL training tools, gateway, cron -**CLI Usage:** -```bash -# Atropos backend -python -m atropos.envs.swe_smith_oracle_env process \ - --env.tool_pool_mode modal \ - --env.modal_image python:3.11 +### ✅ Tinker RL Training Setup (Feb 9, 2026) +- tinker 0.12.0 + tinker-atropos installed +- GSM8k agent env created (needs rewrite to use proper base class) +- Config for Qwen3-4B created +- Pipeline verified: Tinker API connection works, all imports pass +- **Blocked on billing** (Tinker 402 error - regional payment issue) -# Terminal tool -TERMINAL_ENV=modal ./hermes -``` +### ✅ Singularity/Apptainer Sandbox (Feb 6, 2026) +- Nomad raw_exec driver for HPC clusters +- All sandbox operations tested and working -**Files Modified/Created:** -- `atropos/backends/modal_backend.py` - 
Full implementation (~1200 lines) -- `atropos/backends/__init__.py` - `create_tool_backend()` updated -- `atropos/envs/agent_env.py` - 15 Modal config fields added -- `tools/terminal_tool.py` - Native Modal sandbox pool -- `docs/MODAL_BACKEND.md` - Documentation -- `modal_profiles.yaml.example` - Example profiles -- `tests/test_modal_integration.py` - Integration tests -- `tests/test_modal_stress.py` - Stress tests -- `tests/test_modal_terminal.py` - Terminal tool tests +### ✅ Memory Bank (Feb 5, 2026) +- Project documentation structure initialized -### ✅ Singularity/Apptainer Sandbox Integration (Feb 6, 2026 - FULLY TESTED) -Adapted the Atropos sandbox environment from Docker to Singularity/Apptainer for HPC clusters. +## What to KEEP vs REMOVE -**What Works:** -- `create_sandbox_job()` supports both `driver="docker"` and `driver="singularity"` -- SlotPoolConfig and NomadBackendConfig propagate driver settings -- Singularity container runs sandbox_server.py via Nomad's raw_exec driver -- All sandbox operations work: bash execution, file read/write -- **CLI arguments** `--env.driver` and `--env.singularity_image` for AgentEnvConfig -- **Static port binding** for Singularity (ReservedPorts vs DynamicPorts) +### KEEP (valuable infrastructure): +| Component | Location | Purpose | +|-----------|----------|---------| +| Modal backend | `atropos/backends/modal_backend.py` | Cloud sandbox pool | +| Nomad backend | `atropos/backends/nomad_backend.py` | Docker/Singularity sandboxes | +| Slot pool | `atropos/slots/` | Container multiplexing | +| Nomad client | `atropos/nomad/` | Nomad API | +| Sandbox server | `atropos/sandbox_server.py` | HTTP server in containers | +| Dockerfile | `atropos/Dockerfile` | Container image | +| Agent loop | `environments/agent_loop.py` | Proper OpenAI-spec tool calling | +| Base env | `environments/hermes_base_env.py` | Phase 1/2 with parsers | +| Tool parsers | `environments/tool_call_parsers/` | 11+ model parsers | -### ✅ Memory Bank 
Initialized (Feb 5, 2026) -Set up project documentation structure for context persistence. - -## In Progress -None currently. +### REMOVE (redundant with environments/): +| Component | Location | Replaced By | +|-----------|----------|-------------| +| ICL agent | `atropos/agent/atropos_agent.py` | `environments/agent_loop.py` | +| AgentEnv | `atropos/envs/agent_env.py` | `environments/hermes_base_env.py` | +| Tool registry | `atropos/tools/` | `model_tools.py` + `tools/` | +| GSM8k ICL env | `tinker-atropos/.../gsm8k_agent.py` | New proper version | ## Known Issues -- Modal backend not yet live-tested with actual Modal cloud credentials +- Tinker billing (402 error) - user's payment didn't process - `bwrap_available: false` in Singularity containers -- Health check timing - may need longer wait for container startup on slower systems - -## What's Left to Build - -### Modal Backend -- [ ] Live test with Modal credentials on actual cloud -- [ ] Test multi-profile GPU workflows -- [ ] Test sandbox recovery after restart -- [ ] Integrate with SWE-smith-oracle env for GRPO training loop -- [ ] Performance benchmarking vs Nomad backend - -### HPC Deployment -- [ ] Test on actual HPC cluster with Slurm/PBS integration -- [ ] Document cluster-specific deployment procedures - -### Documentation -- [ ] Add Singularity deployment to README -- [ ] Create HPC deployment skill in skills/mlops/ +- atropos `tool_call_support` branch not yet installed (PR #366) ## Evolution of Decisions -### Container Runtime Selection -- **Initial**: Docker-only via Nomad docker driver -- **Problem**: HPC clusters don't allow Docker without sudo -- **Solution**: Added Singularity/Apptainer support via raw_exec driver -- **Result**: Both runtimes now supported with same API +### Agent Architecture +- **v1 (our branch)**: ICL-based agent with `` XML tags in system prompt +- **v2 (Teknium's)**: Proper OpenAI-spec tool calling with `tools=` parameter +- **Decision**: Adopt v2, consolidate into 
`environments/`, keep sandbox backends from v1 -### Modal Backend Architecture -- **Initial**: Stub placeholder raising RuntimeError -- **Investigation**: Modal Sandboxes vs Functions - chose Sandboxes for long-lived containers -- **Design**: Direct `sandbox.exec()` instead of HTTP/sandbox_server.py (simpler, no networking needed) -- **Implementation**: Merged from `modal-integration` branch, fixed agent_env.py config fields -- **Result**: Three backends now supported: Nomad/Docker, Nomad/Singularity, Modal +### Environment Organization +- **Before**: Two parallel systems (`atropos/envs/` and `environments/`) +- **After**: Single system in `environments/`, using `HermesAgentBaseEnv` as base class +- Sandbox backends remain in `atropos/backends/` but integrate via terminal backend config diff --git a/memory-bank/systemPatterns.md b/memory-bank/systemPatterns.md index 64ef9a328ff..b5ebb9f341e 100644 --- a/memory-bank/systemPatterns.md +++ b/memory-bank/systemPatterns.md @@ -148,11 +148,50 @@ The agent validates responses before accepting: 4. `AIAgent` reads env vars when initializing terminal tool 5. Terminal tool creates appropriate backend based on `TERMINAL_ENV` -## Atropos Backend Architecture +## RL Training Architecture (Consolidated) + +### Environment System (`environments/`) + +The canonical way to build agentic RL environments in Hermes-Agent: -### Backend Hierarchy ``` -ToolBackend (Protocol - base.py) +environments/ +├── agent_loop.py ← HermesAgentLoop: OpenAI-spec tool calling +├── hermes_base_env.py ← HermesAgentBaseEnv: base class for all envs +├── tool_context.py ← ToolContext: reward function tool access +├── tool_call_parsers/ ← 11+ model parsers (hermes, qwen, deepseek, etc.) 
+├── terminal_test_env.py ← Example: file creation tasks +├── hermes_swe_env.py ← SWE environment +└── gsm8k_agent_env.py ← GSM8k with Python REPL (TODO) +``` + +### Two-Phase Operation +- **Phase 1 (OpenAI server)**: Native tool_calls from VLLM/SGLang/OpenRouter + - Good for: SFT data gen, testing, evaluation +- **Phase 2 (ManagedServer)**: Client-side tool call parser + logprob tracking + - Required for: RL training + - Parser registry selects per-model parser (hermes, qwen, llama, etc.) + +### Key Design: Proper Tool Calling (NOT ICL) +```python +# CORRECT: pass tools= to chat_completion() +response = await server.chat_completion( + messages=messages, + tools=tool_schemas, # ← tokenizer.apply_chat_template(tools=...) formats these + temperature=1.0, +) +# Response has response.choices[0].message.tool_calls (structured objects) + +# WRONG (old approach): embed tools in system prompt as XML +system_prompt = f"<tools>{json.dumps(tools)}</tools>" # ← ICL, not proper training format +``` + +### Sandbox Backends (`atropos/backends/`) + +Infrastructure for scaled sandbox execution (separate from the env system): + +``` +ToolBackend (Protocol) ├── NomadToolBackend → SlotPool → NomadClient + SandboxExecutor (HTTP) │ ├── Docker driver (default) │ └── Singularity driver (HPC) @@ -160,32 +199,16 @@ ToolBackend (Protocol - base.py) └── _ModalMultiProfileManager (multi-profile support) ``` -### Slot-Based Multiplexing Pattern -All backends share the same slot multiplexing concept: -- **Sandbox/Container**: Long-lived compute unit -- **Slot**: Isolated workspace directory within a sandbox (e.g., `/data/slot_0`) -- **Trajectory**: One agent task using one slot -- Multiple trajectories share a sandbox via different slots +Accessed via `HermesAgentBaseEnv.terminal_backend` config option: +- `local` - Direct execution (default, development) +- `docker` - Docker containers +- `modal` - Modal cloud sandboxes (production RL) +- `singularity` - HPC clusters +- `ssh` - Remote server -### Nomad 
Backend (HTTP-based) -- Deploys `sandbox_server.py` inside containers (Docker or Singularity) -- Uses `SandboxExecutor` for HTTP communication (POST /execute, POST /batch) -- Nomad manages container lifecycle (scaling, health checks) -- Tools: bash, bash_stateful, read_file, write_file, tmux - -### Modal Backend (exec-based) -- Creates `modal.Sandbox` instances (long-lived containers) -- Uses `sandbox.exec("bash", "-c", command)` directly (no HTTP server) -- Modal manages container lifecycle (idle_timeout, max_lifetime) -- Multi-profile support: different resource configs (CPU, GPU, memory) -- Named sandboxes for recovery: `Sandbox.from_name(app_name, sandbox_name)` -- YAML config via `modal_profiles.yaml` - -### Backend Selection -```python -# In agent_env.py / create_tool_backend() -if mode == "nomad": - return NomadToolBackend(NomadBackendConfig.from_agent_env_config(cfg)) -if mode == "modal": - return ModalToolBackend(ModalSandboxConfig.from_agent_env_config(cfg)) +### Training Pipeline (Tinker + Atropos) +``` +Terminal 1: run-api (port 8000) ← Atropos Rollout API +Terminal 2: launch_training.py (port 8001) ← Tinker Trainer + inference +Terminal 3: environment.py serve ← Environment (rollouts) ```