From 975c8493087350573933891d16d889d43385a818 Mon Sep 17 00:00:00 2001
From: Shannon Sands <shannon@nousresearch.com>
Date: Tue, 10 Feb 2026 01:45:07 +0000
Subject: [PATCH] Add GSM8k agent env using proper HermesAgentBaseEnv (not ICL)

- environments/gsm8k_agent_env.py: Math reasoning with Python REPL tool
  - Subclasses HermesAgentBaseEnv (proper tools= parameter, not ICL)
  - Uses ATROPOS_SERVER_* env vars from .env
  - Hermes tool call parser, configurable per model
  - Math verification via math_verify with string fallback
  - Tested: process mode works, both trajectories scored 1.0

- Updated memory bank with consolidation plan:
  - environments/ is the canonical env system (proper tool calling)
  - atropos/backends/ kept as sandbox infrastructure
  - atropos/agent/ and atropos/envs/agent_env.py marked for removal
---
 environments/gsm8k_agent_env.py | 350 ++++++++++++++++++++++++++++++++
 memory-bank/activeContext.md    | 132 +++++++-----
 memory-bank/progress.md         | 145 ++++++-------
 memory-bank/systemPatterns.md   |  83 +++++---
 4 files changed, 555 insertions(+), 155 deletions(-)
 create mode 100644 environments/gsm8k_agent_env.py
diff --git a/environments/gsm8k_agent_env.py b/environments/gsm8k_agent_env.py
new file mode 100644
index 00000000000..f6bbab89642
--- /dev/null
+++ b/environments/gsm8k_agent_env.py
@@ -0,0 +1,350 @@
+"""
+GSM8kAgentEnv -- Math Reasoning with Tool Use (Python REPL)
+
+An agentic RL environment where models solve GSM8k math problems using
+a Python interpreter tool. Uses proper OpenAI-spec tool calling via
+HermesAgentBaseEnv (not ICL).
+
+The model:
+1. Receives a math problem
+2. Can call the `terminal` tool to run Python code (`python3 -c "..."`)
+3. Provides a final answer in \\boxed{} format
+4. Gets reward: 1.0 if correct, 0.0 if wrong
+
+Usage:
+    # Phase 1 (OpenRouter, no training):
+    python environments/gsm8k_agent_env.py process \\
+        --env.data_path_to_save_groups gsm8k_agent_output.jsonl
+
+    # Phase 2 (VLLM + Tinker training):
+    run-api
+    python launch_training.py --config configs/gsm8k_agent.yaml
+    python environments/gsm8k_agent_env.py serve
+"""
+
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+# Ensure repo root is on sys.path
+_repo_root = Path(__file__).resolve().parent.parent
+if str(_repo_root) not in sys.path:
+    sys.path.insert(0, str(_repo_root))
+
+from atroposlib.envs.base import ScoredDataGroup
+from atroposlib.envs.server_handling.server_manager import APIServerConfig
+from atroposlib.type_definitions import Item
+
+from environments.agent_loop import AgentResult
+from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
+from environments.tool_context import ToolContext
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Math verification helpers
+# =============================================================================
+
+def _verify_math_answer(model_response: str, gold_answer: str) -> bool:
+    """
+    Verify if the model's response contains the correct answer.
+    Uses math_verify for robust LaTeX comparison, falls back to string matching.
+    """
+    try:
+        from latex2sympy2_extended import NormalizationConfig
+        from math_verify import LatexExtractionConfig, parse, verify
+
+        gold_parsed = parse(
+            f"\\boxed{{{gold_answer}}}",
+            extraction_mode="first_match",
+            extraction_config=[LatexExtractionConfig()],
+        )
+
+        # Strip <think> blocks if present
+        answer_text = model_response
+        if "</think>" in answer_text:
+            answer_text = answer_text.split("</think>")[-1]
+
+        answer_parsed = parse(
+            answer_text,
+            extraction_config=[
+                LatexExtractionConfig(
+                    normalization_config=NormalizationConfig(
+                        nits=False,
+                        malformed_operators=False,
+                        basic_latex=True,
+                        boxed="all",
+                        units=True,
+                    ),
+                    boxed_match_priority=0,
+                    try_extract_without_anchor=False,
+                )
+            ],
+            extraction_mode="first_match",
+        )
+
+        return bool(verify(answer_parsed, gold_parsed))
+
+    except ImportError:
+        # Fallback: simple string matching for \\boxed{answer}
+        import re
+        pattern = r'\\boxed\{([^}]+)\}'
+        matches = re.findall(pattern, model_response)
+        if matches:
+            model_answer = matches[-1].strip().replace(",", "")
+            gold_clean = gold_answer.strip().replace(",", "")
+            return model_answer == gold_clean
+        return False
+
+
+# =============================================================================
+# Environment Config
+# =============================================================================
+
+class GSM8kAgentEnvConfig(HermesAgentEnvConfig):
+    """Config class for GSM8kAgentEnv; adds no fields (values are set in GSM8kAgentEnv.config_init)."""
+    pass
+
+
+# =============================================================================
+# Environment
+# =============================================================================
+
+class GSM8kAgentEnv(HermesAgentBaseEnv):
+    """
+    GSM8k math environment with Python REPL tool calling.
+
+    Models solve grade-school math problems by reasoning step by step
+    and using Python (via the terminal tool) for calculations.
+
+    Exercises the full agentic RL training loop:
+    - Model receives math problem
+    - Makes tool calls to compute (python3 -c "...")
+    - Provides final answer in \\boxed{}
+    - Reward: binary (1.0 correct, 0.0 wrong)
+    """
+
+    name = "gsm8k-agent"
+    env_config_cls = GSM8kAgentEnvConfig
+
+    @classmethod
+    def config_init(cls) -> Tuple[GSM8kAgentEnvConfig, List[APIServerConfig]]:
+        """
+        Default config using terminal tool.
+
+        Reads from environment variables (set in .env):
+            ATROPOS_SERVER_BASE_URL  - Inference server URL
+            ATROPOS_SERVER_MODEL     - Model name on the server
+            ATROPOS_TOKENIZER_NAME   - HuggingFace tokenizer name
+            ATROPOS_SERVER_API_KEY   - API key for the server
+        """
+        # Resolve inference server settings from env
+        base_url = (
+            os.getenv("ATROPOS_SERVER_BASE_URL")
+            or os.getenv("OPENAI_BASE_URL")
+            or os.getenv("LLM_BASE_URL")
+            or "https://openrouter.ai/api/v1"
+        )
+        if not base_url.rstrip("/").endswith("/v1"):
+            base_url = base_url.rstrip("/") + "/v1"
+
+        model = (
+            os.getenv("ATROPOS_SERVER_MODEL")
+            or os.getenv("LLM_MODEL")
+            or "Hermes-4.3-36B"
+        )
+
+        api_key = (
+            os.getenv("ATROPOS_SERVER_API_KEY")
+            or os.getenv("NOUS_API_KEY")
+            or os.getenv("OPENROUTER_API_KEY")
+            or os.getenv("OPENAI_API_KEY")
+            or ""
+        )
+
+        tokenizer = (
+            os.getenv("ATROPOS_TOKENIZER_NAME")
+            or os.getenv("ATROPOS_TOKENIZER")
+            or "NousResearch/Hermes-4.3-36B"
+        )
+
+        env_config = GSM8kAgentEnvConfig(
+            # Terminal tool only -- model uses `python3 -c "..."` for math
+            enabled_toolsets=["terminal"],
+            disabled_toolsets=None,
+            distribution=None,
+            # Agent settings
+            max_agent_turns=5,          # Math problems don't need many turns
+            max_token_length=2048,      # Room for reasoning + code
+            agent_temperature=1.0,
+            system_prompt=(
+                "You are a helpful math assistant. You have access to a terminal "
+                "where you can run Python code to help solve problems.\n\n"
+                "When you need to calculate something, use the terminal tool with "
+                "a command like: python3 -c \"print(2 + 2)\"\n\n"
+                "When you have the final answer, write it inside \\boxed{} like: \\boxed{42}\n\n"
+                "Work step by step. Use Python to verify your reasoning."
+            ),
+            # Terminal backend (local for testing, modal for production)
+            terminal_backend=os.getenv("TERMINAL_ENV", "local"),
+            # Parser -- hermes format for Hermes models
+            tool_call_parser="hermes",
+            # Atropos settings
+            group_size=4,
+            tokenizer_name=tokenizer,
+            steps_per_eval=5,
+            total_steps=10,
+            use_wandb=bool(os.getenv("WANDB_API_KEY")),
+            wandb_name="gsm8k-agent",
+            ensure_scores_are_not_same=False,
+            # No external dataset (we load GSM8k ourselves)
+            dataset_name=None,
+        )
+
+        server_configs = [
+            APIServerConfig(
+                base_url=base_url,
+                model_name=model,
+                server_type="openai",
+                api_key=api_key,
+                health_check=False,
+            )
+        ]
+
+        return env_config, server_configs
+
+    async def setup(self):
+        """Load GSM8k dataset."""
+        from datasets import load_dataset
+
+        self.train = load_dataset("gsm8k", "main", split="train").shuffle(seed=42)
+        test_data = load_dataset("gsm8k", "main", split="test").shuffle(seed=42)
+        self.test = [
+            {
+                "question": item["question"],
+                "gold_answer": item["answer"].split("#")[-1].strip().replace(",", ""),
+            }
+            for item in test_data
+        ]
+        self.iter = 0
+        self.reward_buffer: List[float] = []
+        self.tool_use_buffer: List[int] = []
+        print(f"[GSM8kAgentEnv] Loaded {len(self.train)} train, {len(self.test)} test examples")
+
+    async def get_next_item(self) -> Dict[str, str]:
+        """Cycle through training problems."""
+        item = self.train[self.iter % len(self.train)]
+        self.iter += 1
+        return {
+            "question": item["question"],
+            "gold_answer": item["answer"].split("#")[-1].strip().replace(",", ""),
+        }
+
+    def format_prompt(self, item: Dict[str, str]) -> str:
+        """Return the item's question text unchanged; it serves as the user message."""
+        return item["question"]
+
+    async def compute_reward(
+        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
+    ) -> float:
+        """
+        Score: verify the model's \\boxed{} answer against the gold answer.
+
+        The agent has full access to terminal via ctx, but for GSM8k we just
+        check the final answer from the conversation.
+        """
+        # Get the last assistant message content
+        final_text = ""
+        for msg in reversed(result.messages):
+            if msg.get("role") == "assistant" and msg.get("content"):
+                final_text = msg["content"]
+                break
+
+        correct = _verify_math_answer(final_text, item["gold_answer"])
+        reward = 1.0 if correct else 0.0
+
+        self.reward_buffer.append(reward)
+        # Count tool calls in this trajectory
+        tool_call_count = sum(
+            len(msg.get("tool_calls", []))
+            for msg in result.messages
+            if msg.get("role") == "assistant"
+        )
+        self.tool_use_buffer.append(tool_call_count)
+
+        return reward
+
+    async def evaluate(self, *args, **kwargs):
+        """Evaluate on a subset of the test set (greedy, no tools for speed)."""
+        start_time = time.time()
+        correct = 0
+        total = 0
+        samples = []
+
+        eval_subset = self.test[:30]  # Small subset for quick eval
+
+        for item in eval_subset:
+            try:
+                completion = await self.server.chat_completion(
+                    messages=[
+                        {"role": "system", "content": self.config.system_prompt or ""},
+                        {"role": "user", "content": item["question"]},
+                    ],
+                    n=1,
+                    max_tokens=self.config.max_token_length,
+                    temperature=0.0,
+                    split="eval",
+                )
+
+                response = completion.choices[0].message.content or ""
+                is_correct = _verify_math_answer(response, item["gold_answer"])
+
+                if is_correct:
+                    correct += 1
+                total += 1
+
+                samples.append({
+                    "question": item["question"],
+                    "gold_answer": item["gold_answer"],
+                    "response": response[:500],
+                    "correct": is_correct,
+                })
+
+            except Exception as e:
+                logger.error("Eval failed: %s", e)
+                total += 1
+
+        percent_correct = correct / total if total > 0 else 0
+        end_time = time.time()
+
+        await self.evaluate_log(
+            metrics={"eval/percent_correct": percent_correct, "eval/total": total},
+            samples=samples,
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+        """Log training metrics."""
+        if wandb_metrics is None:
+            wandb_metrics = {}
+
+        if self.reward_buffer:
+            wandb_metrics["train/percent_correct"] = sum(self.reward_buffer) / len(self.reward_buffer)
+            wandb_metrics["train/total_rollouts"] = len(self.reward_buffer)
+            self.reward_buffer = []
+
+        if self.tool_use_buffer:
+            wandb_metrics["train/avg_tool_calls"] = sum(self.tool_use_buffer) / len(self.tool_use_buffer)
+            wandb_metrics["train/tool_use_rate"] = sum(1 for t in self.tool_use_buffer if t > 0) / len(self.tool_use_buffer)
+            self.tool_use_buffer = []
+
+        await super().wandb_log(wandb_metrics)
+
+
+if __name__ == "__main__":
+    GSM8kAgentEnv.cli()  # CLI entry point; see the `process` / `serve` usage in the module docstring
diff --git a/memory-bank/activeContext.md b/memory-bank/activeContext.md
index b7858c2b131..7a6d9b24ed9 100644
--- a/memory-bank/activeContext.md
+++ b/memory-bank/activeContext.md
@@ -1,61 +1,99 @@
 # Active Context
 
 ## Current Focus
-Tinker RL training integration - pipeline fully wired up, waiting on Tinker billing to test.
+Consolidating the two Atropos environment systems and fixing tool calling to use proper OpenAI-spec approach instead of ICL.
 
-## Recently Completed (Feb 9, 2026)
+## PR Feedback from Lead Dev (Feb 10, 2026)
 
-### Tinker RL Training Integration
-Created a complete agent training pipeline using Tinker (Thinking Machines) + Atropos:
+The PR was rejected because our approach has three fundamental issues:
 
-**New Files Created:**
-1. `tinker-atropos/tinker_atropos/environments/gsm8k_agent.py` - Agent GSM8k environment with:
-   - Python REPL tool calling (Hermes-style `<tool_call>` format)
-   - Multi-step agent loop within `collect_trajectories()`
-   - Math answer verification via `math_verify`
-   - Subprocess-based Python execution
-   - WandB metrics (percent_correct, tool_use_rate)
-2. `tinker-atropos/configs/gsm8k_agent.yaml` - Config for Qwen3-4B-Instruct training
+### Issue 1: ManagedServer doesn't pass `tools={}` to `apply_chat_template()`
+- When using Phase 2 (VLLM/SGLang for RL training), `ManagedServer` needs to pass tools to `tokenizer.apply_chat_template(tools=...)`
+- This makes the system prompt include tool definitions the way models were trained to expect
+- **Fix**: Atropos PR #366 adds `tool_call_parser` support to ManagedServer (branch: `tool_call_support`)
 
-**Dependencies Updated:**
-- `pyproject.toml` `[atropos]` extra now includes: tinker SDK, torch, wandb, math-verify
-- Installed: tinker 0.12.0, tinker-atropos 0.1.0, torch (CPU)
+### Issue 2: ICL prompt vs proper tool calling
+- Our code embeds tools as XML in the system prompt (`<tools>...</tools>`)
+- Proper approach: pass `tools=` parameter in `chat_completion()` calls and let the tokenizer's chat template handle formatting
+- All Hermes datasets train on the proper format, not ICL
 
-**README Updated:**
-- Added comprehensive "RL Training with Tinker" section with architecture diagram, quick start, config docs
-- Added TINKER_API_KEY and WANDB_API_KEY to optional keys table
+### Issue 3: Only Hermes `<tool_call>` parser, no multi-model support
+- Our code only handles Hermes-style `<tool_call>` XML parsing
+- Proper approach: parser registry supporting 11+ model families (hermes, qwen, deepseek, llama, mistral, etc.)
 
-**Verified Working:**
-- Tinker SDK connection ✅
-- All imports (tinker, tinker_atropos, trainer, environment) ✅
-- Python REPL execution + tool call parsing ✅
-- Math verification ✅
-- Atropos run-api (port 8000) ✅
-- Tinker trainer starts, loads config, creates inference server (port 8001) ✅
-
-**Blocked:** Tinker billing (402 error) - user's payment didn't process (possibly regional card issue)
-
-### Main Branch Merge (Feb 9, 2026)
-Merged `origin/main` into `atropos-integrations` - 22,560 lines, 79 files, 5 conflicts resolved.
-
-### Modal Backend (Feb 8, 2026)
-Merged modal-integration branch, working with Modal Sandboxes.
-
-### Singularity/Apptainer (Feb 6, 2026)
-Completed and tested.
-
-## Architecture: Training Pipeline
+## Architecture: What Exists Now (Two Parallel Systems)
 
+### `environments/` (Teknium's proper approach) ✅ CORRECT
 ```
-Terminal 1: run-api (port 8000) - Atropos Rollout API
-Terminal 2: launch_training.py (port 8001) - Tinker Trainer + FastAPI inference
-Terminal 3: gsm8k_agent.py serve - Environment (generates trajectories)
+environments/
+├── agent_loop.py              ← Uses tools= in chat_completion() (OpenAI spec)
+├── hermes_base_env.py         ← Phase 1 (OpenAI) + Phase 2 (ManagedServer + parser)
+├── tool_context.py            ← ToolContext for reward functions
+├── tool_call_parsers/         ← 11 model parsers (hermes, qwen, deepseek, llama, etc.)
+│   ├── __init__.py            ← Registry with get_parser(), register_parser()
+│   ├── hermes_parser.py
+│   ├── qwen_parser.py
+│   ├── deepseek_v3_parser.py
+│   ├── llama_parser.py
+│   ├── mistral_parser.py
+│   └── ... (11 total)
+├── terminal_test_env.py       ← Working example: file creation tasks
+├── hermes_swe_env.py          ← SWE environment
+└── patches.py                 ← Async-safe monkey patches
 ```
 
-The agent env gets math problems → model calls Python REPL tool → scores answer → sends to Atropos → Tinker does LoRA training → updates sampling weights → repeat.
+**How it works correctly:**
+1. `HermesAgentLoop.run()` passes `tools=self.tool_schemas` to `chat_completion()`
+2. ManagedServer passes tools to `tokenizer.apply_chat_template(tools=...)`
+3. Parser registry reconstructs `tool_calls` from raw model output
+4. Tool execution uses hermes-agent's `handle_function_call()` from `model_tools.py`
 
-## Next Steps
-- [ ] Resolve Tinker billing to test full training loop
-- [ ] Run GSM8k agent training for ~20 steps (proof of concept)
-- [ ] Monitor WandB for reward improvement
-- [ ] Graduate to more complex agent envs (SWE tasks with Modal backend)
+### `atropos/` (Our sandbox-optimized code) - PARTIALLY REDUNDANT
+```
+atropos/
+├── agent/atropos_agent.py     ← ICL-based agent (REDUNDANT with agent_loop.py)
+├── envs/agent_env.py          ← Environment with sandbox backends (PARTIALLY REDUNDANT)
+├── envs/swe_smith_oracle_env.py ← SWE env using sandbox (KEEP - port to new base)
+├── backends/                  ← Sandbox backends (KEEP - valuable infrastructure)
+│   ├── modal_backend.py       ← Modal sandbox pool
+│   ├── nomad_backend.py       ← Nomad/Docker/Singularity
+│   └── base.py                ← ToolBackend protocol
+├── slots/                     ← Slot multiplexing (KEEP)
+├── nomad/                     ← Nomad client (KEEP)
+├── tools/                     ← Sandbox tool registry (PARTIALLY REDUNDANT)
+└── sandbox_server.py          ← HTTP server in containers (KEEP)
+```
+
+## Plan: Consolidate into `environments/`
+
+### What to KEEP from `atropos/`:
+- `backends/` - Modal, Nomad, Singularity backends (valuable infrastructure for scale)
+- `slots/` - Slot multiplexing
+- `nomad/` - Nomad client
+- `sandbox_server.py` - Container HTTP server
+- `Dockerfile` - Sandbox container image
+
+### What to REMOVE/REPLACE:
+- `atropos/agent/atropos_agent.py` → replaced by `environments/agent_loop.py`
+- `atropos/envs/agent_env.py` → functionality merged into `environments/hermes_base_env.py`
+- `atropos/tools/` → replaced by `model_tools.py` + `tools/` (hermes-agent's standard tools)
+
+### What to CREATE:
+- `environments/gsm8k_agent_env.py` → GSM8k with tool calling, subclasses `HermesAgentBaseEnv`
+- Update `environments/hermes_base_env.py` to optionally use sandbox backends (Nomad/Modal) for terminal isolation when needed for scale
+
+### Steps:
+1. Install atropos `tool_call_support` branch (PR #366)
+2. Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv`
+3. Port `swe_smith_oracle_env.py` to use `HermesAgentBaseEnv`
+4. Make sandbox backends accessible from `HermesAgentBaseEnv` (terminal_backend config)
+5. Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py`
+6. Clean up `atropos/tools/` (keep only sandbox-specific tools)
+7. Update tinker-atropos gsm8k env to use proper base class
+8. Test everything end-to-end
+
+## Previous Completed Work
+- Modal backend integration (Feb 8) - KEEP backends, update integration point
+- Main branch merge (Feb 9) - completed
+- Singularity/Apptainer (Feb 6) - KEEP
+- Memory Bank initialized (Feb 5)
diff --git a/memory-bank/progress.md b/memory-bank/progress.md
index e8d9f6c33ba..9b00d751cee 100644
--- a/memory-bank/progress.md
+++ b/memory-bank/progress.md
@@ -1,96 +1,85 @@
 # Progress
 
+## Current Sprint: Consolidate Environment Systems (Feb 10, 2026)
+
+PR feedback from lead dev identified three fundamental issues with our approach:
+1. Tool calling uses ICL (in-context learning) instead of proper `tools=` parameter
+2. ManagedServer doesn't pass tools to `apply_chat_template()`
+3. Only Hermes parser, no multi-model support
+
+Teknium already built the correct approach in `environments/` directory. Our task is to consolidate.
+
+### Status
+- [ ] Install atropos `tool_call_support` branch (PR #366)
+- [x] Create `environments/gsm8k_agent_env.py` using `HermesAgentBaseEnv`
+- [ ] Port SWE env to `HermesAgentBaseEnv`
+- [ ] Make sandbox backends accessible from `HermesAgentBaseEnv`
+- [ ] Remove redundant `atropos/agent/` and `atropos/envs/agent_env.py`
+- [ ] Clean up redundant `atropos/tools/`
+- [ ] Test end-to-end with Tinker
+
 ## Completed Features
 
-### ✅ Modal Backend Integration (Feb 8, 2026 - MERGED & TESTED)
-Merged the `modal-integration` branch and fixed integration issues.
+### ✅ Modal Backend Integration (Feb 8, 2026)
+- `ModalToolBackend` with slot-based multiplexing
+- Multi-profile support (CPU, GPU, high-memory)
+- Auto-scaling sandbox pool via Modal Sandboxes
+- **Status: KEEP backends, but change integration point from atropos/envs/ to environments/**
 
-**What Works:**
-- `ModalToolBackend` implements full `ToolBackend` interface (start, stop, acquire, release, execute_batch)
-- Modal Sandboxes used for long-lived containers (not Functions)
-- `sandbox.exec()` for direct command execution (no HTTP server needed)
-- Slot-based multiplexing matching Nomad pattern
-- Multi-profile support (`ModalSandboxConfig`, `_ModalMultiProfileManager`)
-- YAML profile loading (`modal_profiles.yaml`)
-- `AgentEnvConfig` fields for all Modal settings (`--env.modal_*`)
-- `create_tool_backend()` supports `tool_pool_mode="modal"`
-- Terminal tool (`tools/terminal_tool.py`) native Modal integration with pool management
-- Named sandbox recovery via `Sandbox.from_name()`
-- Auto-scaling sandbox pool per profile
-- Artifact helpers (read, list, archive)
+### ✅ Main Branch Merge (Feb 9, 2026)
+- Merged 22,560 lines, 79 files, 5 conflicts resolved
+- New: hermes_cli/, file_operations, RL training tools, gateway, cron
 
-**CLI Usage:**
-```bash
-# Atropos backend
-python -m atropos.envs.swe_smith_oracle_env process \
-    --env.tool_pool_mode modal \
-    --env.modal_image python:3.11
+### ✅ Tinker RL Training Setup (Feb 9, 2026)
+- tinker 0.12.0 + tinker-atropos installed
+- GSM8k agent env created (needs rewrite to use proper base class)
+- Config for Qwen3-4B created
+- Pipeline verified: Tinker API connection works, all imports pass
+- **Blocked on billing** (Tinker 402 error - regional payment issue)
 
-# Terminal tool
-TERMINAL_ENV=modal ./hermes
-```
+### ✅ Singularity/Apptainer Sandbox (Feb 6, 2026)
+- Nomad raw_exec driver for HPC clusters
+- All sandbox operations tested and working
 
-**Files Modified/Created:**
-- `atropos/backends/modal_backend.py` - Full implementation (~1200 lines)
-- `atropos/backends/__init__.py` - `create_tool_backend()` updated
-- `atropos/envs/agent_env.py` - 15 Modal config fields added
-- `tools/terminal_tool.py` - Native Modal sandbox pool
-- `docs/MODAL_BACKEND.md` - Documentation
-- `modal_profiles.yaml.example` - Example profiles
-- `tests/test_modal_integration.py` - Integration tests
-- `tests/test_modal_stress.py` - Stress tests
-- `tests/test_modal_terminal.py` - Terminal tool tests
+### ✅ Memory Bank (Feb 5, 2026)
+- Project documentation structure initialized
 
-### ✅ Singularity/Apptainer Sandbox Integration (Feb 6, 2026 - FULLY TESTED)
-Adapted the Atropos sandbox environment from Docker to Singularity/Apptainer for HPC clusters.
+## What to KEEP vs REMOVE
 
-**What Works:**
-- `create_sandbox_job()` supports both `driver="docker"` and `driver="singularity"`
-- SlotPoolConfig and NomadBackendConfig propagate driver settings
-- Singularity container runs sandbox_server.py via Nomad's raw_exec driver
-- All sandbox operations work: bash execution, file read/write
-- **CLI arguments** `--env.driver` and `--env.singularity_image` for AgentEnvConfig
-- **Static port binding** for Singularity (ReservedPorts vs DynamicPorts)
+### KEEP (valuable infrastructure):
+| Component | Location | Purpose |
+|-----------|----------|---------|
+| Modal backend | `atropos/backends/modal_backend.py` | Cloud sandbox pool |
+| Nomad backend | `atropos/backends/nomad_backend.py` | Docker/Singularity sandboxes |
+| Slot pool | `atropos/slots/` | Container multiplexing |
+| Nomad client | `atropos/nomad/` | Nomad API |
+| Sandbox server | `atropos/sandbox_server.py` | HTTP server in containers |
+| Dockerfile | `atropos/Dockerfile` | Container image |
+| Agent loop | `environments/agent_loop.py` | Proper OpenAI-spec tool calling |
+| Base env | `environments/hermes_base_env.py` | Phase 1/2 with parsers |
+| Tool parsers | `environments/tool_call_parsers/` | 11+ model parsers |
 
-### ✅ Memory Bank Initialized (Feb 5, 2026)
-Set up project documentation structure for context persistence.
-
-## In Progress
-None currently.
+### REMOVE (redundant with environments/):
+| Component | Location | Replaced By |
+|-----------|----------|-------------|
+| ICL agent | `atropos/agent/atropos_agent.py` | `environments/agent_loop.py` |
+| AgentEnv | `atropos/envs/agent_env.py` | `environments/hermes_base_env.py` |
+| Tool registry | `atropos/tools/` | `model_tools.py` + `tools/` |
+| GSM8k ICL env | `tinker-atropos/.../gsm8k_agent.py` | New proper version |
 
 ## Known Issues
-- Modal backend not yet live-tested with actual Modal cloud credentials
+- Tinker billing (402 error) - user's payment didn't process
 - `bwrap_available: false` in Singularity containers
-- Health check timing - may need longer wait for container startup on slower systems
-
-## What's Left to Build
-
-### Modal Backend
-- [ ] Live test with Modal credentials on actual cloud
-- [ ] Test multi-profile GPU workflows
-- [ ] Test sandbox recovery after restart
-- [ ] Integrate with SWE-smith-oracle env for GRPO training loop
-- [ ] Performance benchmarking vs Nomad backend
-
-### HPC Deployment
-- [ ] Test on actual HPC cluster with Slurm/PBS integration
-- [ ] Document cluster-specific deployment procedures
-
-### Documentation
-- [ ] Add Singularity deployment to README
-- [ ] Create HPC deployment skill in skills/mlops/
+- atropos `tool_call_support` branch not yet installed (PR #366)
 
 ## Evolution of Decisions
 
-### Container Runtime Selection
-- **Initial**: Docker-only via Nomad docker driver
-- **Problem**: HPC clusters don't allow Docker without sudo
-- **Solution**: Added Singularity/Apptainer support via raw_exec driver
-- **Result**: Both runtimes now supported with same API
+### Agent Architecture
+- **v1 (our branch)**: ICL-based agent with `<tool_call>` XML tags in system prompt
+- **v2 (Teknium's)**: Proper OpenAI-spec tool calling with `tools=` parameter
+- **Decision**: Adopt v2, consolidate into `environments/`, keep sandbox backends from v1
 
-### Modal Backend Architecture
-- **Initial**: Stub placeholder raising RuntimeError
-- **Investigation**: Modal Sandboxes vs Functions - chose Sandboxes for long-lived containers
-- **Design**: Direct `sandbox.exec()` instead of HTTP/sandbox_server.py (simpler, no networking needed)
-- **Implementation**: Merged from `modal-integration` branch, fixed agent_env.py config fields
-- **Result**: Three backends now supported: Nomad/Docker, Nomad/Singularity, Modal
+### Environment Organization
+- **Before**: Two parallel systems (`atropos/envs/` and `environments/`)
+- **After**: Single system in `environments/`, using `HermesAgentBaseEnv` as base class
+- Sandbox backends remain in `atropos/backends/` but integrate via terminal backend config
diff --git a/memory-bank/systemPatterns.md b/memory-bank/systemPatterns.md
index 64ef9a328ff..b5ebb9f341e 100644
--- a/memory-bank/systemPatterns.md
+++ b/memory-bank/systemPatterns.md
@@ -148,11 +148,50 @@ The agent validates responses before accepting:
 4. `AIAgent` reads env vars when initializing terminal tool
 5. Terminal tool creates appropriate backend based on `TERMINAL_ENV`
 
-## Atropos Backend Architecture
+## RL Training Architecture (Consolidated)
+
+### Environment System (`environments/`)
+
+The canonical way to build agentic RL environments in Hermes-Agent:
 
-### Backend Hierarchy
 ```
-ToolBackend (Protocol - base.py)
+environments/
+├── agent_loop.py              ← HermesAgentLoop: OpenAI-spec tool calling
+├── hermes_base_env.py         ← HermesAgentBaseEnv: base class for all envs
+├── tool_context.py            ← ToolContext: reward function tool access
+├── tool_call_parsers/         ← 11+ model parsers (hermes, qwen, deepseek, etc.)
+├── terminal_test_env.py       ← Example: file creation tasks
+├── hermes_swe_env.py          ← SWE environment
+└── gsm8k_agent_env.py         ← GSM8k with Python REPL
+```
+
+### Two-Phase Operation
+- **Phase 1 (OpenAI server)**: Native tool_calls from VLLM/SGLang/OpenRouter
+  - Good for: SFT data gen, testing, evaluation
+- **Phase 2 (ManagedServer)**: Client-side tool call parser + logprob tracking
+  - Required for: RL training
+  - Parser registry selects per-model parser (hermes, qwen, llama, etc.)
+
+### Key Design: Proper Tool Calling (NOT ICL)
+```python
+# CORRECT: pass tools= to chat_completion()
+response = await server.chat_completion(
+    messages=messages,
+    tools=tool_schemas,  # ← tokenizer.apply_chat_template(tools=...) formats these
+    temperature=1.0,
+)
+# Response has response.choices[0].message.tool_calls (structured objects)
+
+# WRONG (old approach): embed tools in system prompt as XML
+system_prompt = f"<tools>{json.dumps(tools)}</tools>"  # ← ICL, not proper training format
+```
+
+### Sandbox Backends (`atropos/backends/`)
+
+Infrastructure for scaled sandbox execution (separate from the env system):
+
+```
+ToolBackend (Protocol)
     ├── NomadToolBackend → SlotPool → NomadClient + SandboxExecutor (HTTP)
     │   ├── Docker driver (default)
     │   └── Singularity driver (HPC)
@@ -160,32 +199,16 @@ ToolBackend (Protocol - base.py)
         └── _ModalMultiProfileManager (multi-profile support)
 ```
 
-### Slot-Based Multiplexing Pattern
-All backends share the same slot multiplexing concept:
-- **Sandbox/Container**: Long-lived compute unit
-- **Slot**: Isolated workspace directory within a sandbox (e.g., `/data/slot_0`)
-- **Trajectory**: One agent task using one slot
-- Multiple trajectories share a sandbox via different slots
+Selected via the `terminal_backend` field of the env config (`HermesAgentEnvConfig`):
+- `local` - Direct execution (default, development)
+- `docker` - Docker containers
+- `modal` - Modal cloud sandboxes (production RL)
+- `singularity` - HPC clusters
+- `ssh` - Remote server
 
-### Nomad Backend (HTTP-based)
-- Deploys `sandbox_server.py` inside containers (Docker or Singularity)
-- Uses `SandboxExecutor` for HTTP communication (POST /execute, POST /batch)
-- Nomad manages container lifecycle (scaling, health checks)
-- Tools: bash, bash_stateful, read_file, write_file, tmux
-
-### Modal Backend (exec-based)
-- Creates `modal.Sandbox` instances (long-lived containers)
-- Uses `sandbox.exec("bash", "-c", command)` directly (no HTTP server)
-- Modal manages container lifecycle (idle_timeout, max_lifetime)
-- Multi-profile support: different resource configs (CPU, GPU, memory)
-- Named sandboxes for recovery: `Sandbox.from_name(app_name, sandbox_name)`
-- YAML config via `modal_profiles.yaml`
-
-### Backend Selection
-```python
-# In agent_env.py / create_tool_backend()
-if mode == "nomad":
-    return NomadToolBackend(NomadBackendConfig.from_agent_env_config(cfg))
-if mode == "modal":
-    return ModalToolBackend(ModalSandboxConfig.from_agent_env_config(cfg))
+### Training Pipeline (Tinker + Atropos)
+```
+Terminal 1: run-api (port 8000)              ← Atropos Rollout API
+Terminal 2: launch_training.py (port 8001)   ← Tinker Trainer + inference
+Terminal 3: environment.py serve             ← Environment (rollouts)
 ```